Commit ad65e200 authored by 李小芳

update

parent 67b86bd3
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from fontTools.ttLib import TTFont
logger = logging.getLogger(__name__)
class DianPintCraw(object):
def __init__(self):
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
"Cookie": "fspop=test; cy=2; cye=beijing; _lxsdk_cuid=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _lxsdk=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _hc.v=7cd93c95-3674-1de2-0725-2e8d4141c973.1626848053; s_ViewType=10; dplet=45b53ad04cb79c04c2e30bea98dca7ef; dper=8591feb7929077261e0c0702628cd4314faa13a74729c7e6480d13c3220c85e5b0f336a0b2af7450370e86f53958152509c44d579007ab941b3a66bc922cdf19cde4eecbdb3f94ef3a0532a955ea9e11803bbf18d01a29bad962ca22e13f6543; ll=7fd06e815b796be3df069dec7836c3df; ua=%E9%99%AA%E4%BD%A0%E6%90%9E%E6%80%AA; ctu=23034069fac8b78bdb78108ada1c10714737c4da63d46c011bfd4779f1daa177; cityid=2; switchcityflashtoast=1; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3; source=m_browser_test_33; Appshare2021_ab=shop%3AA%3A1%7Cmap%3AA%3A1%7Cshopphoto%3AA%3A1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1626862684,1627020606,1627041159,1627292689; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1627294126; _lxsdk_s=17ae233df3e-b4b-9f4-00d%7C%7C304",
'Host': 'www.dianping.com',
'Referer': 'http://www.dianping.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
}
self.url = 'http://www.dianping.com/search/keyword/2/0_%E6%B0%B4%E5%85%89%E9%92%88'
def parse_url(self):
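# Fetch the search page; returns the HTML text on HTTP 200, otherwise None.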
response = requests.get(url=self.url, headers=self.headers)
content = response.text
print(content)
if response.status_code == 200:
return content
else:
return None
def search(self):
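# Parse the result list: each <li> is one clinic card carrying the name, star rating, review count and mean price.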
content = self.parse_url()
parsed_response = BeautifulSoup(content, "lxml")
shop_search = parsed_response.find(
attrs={"class": "section Fix J-shop-search"}).find(
attrs={"class": "content-wrap"}).find(
attrs={"class": "shop-wrap"}).find(
attrs={"class": "content"}).find(
attrs={"class": "shop-list J_shop-list shop-all-list"}).find("ul").find_all("li")
for item in shop_search:
hospital_name = item.find(attrs={"class": "txt"}).find(attrs={"class": "tit"}).find("a").find(
"h4").get_text()
print(hospital_name)
star_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
attrs={"class": "nebula_star"}).find(attrs={"class": "star_icon"}).find_all("span")
print("star_info:", star_info)
review_num_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
attrs={"class": "review-num"}).find("b")
print("review_num_info:", review_num_info)
meanprice_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
attrs={"class": "mean-price"}).find("b")
print("meanprice_info:", meanprice_info)
# service_info_data = item.find(attrs={"class": "svr-info"}).find(
# attrs={"class": "si-deal d-packup"}).find_all("a")
# for service_info in service_info_data:
# sku_info = service_info.text()
# print(base_info_data)
# print(service_info_data)
print("-----------")
return shop_search
def woff_change(self, wofflist, TTG, woffdict):
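# Dianping obfuscates sensitive text with a custom WOFF font: the page carries
# private-use codepoints, and each codepoint's glyph name ('uniXXXX') indexes
# into a fixed character table, so recovering the glyph index recovers the real
# character from woff_string below.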
try:
woff_string = '''
1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福
人百餐茶务通味所山区门药银 农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建 '''
woffs = [i for i in woff_string if i != '\n' and i != ' ']
woff_content = ''
for char in wofflist:
# Map the PUA character to its glyph-name form: b'\uXXXX' -> 'uniXXXX'
text = str(char.encode('raw_unicode_escape').replace(b'\\u', b'uni'), 'utf-8')
if text in TTG:
content = woffs[woffdict[text]]
else:
content = char
woff_content += content
except UnicodeDecodeError:
return "编码错误"
else:
return woff_content
# Example: decoding the obfuscated address field
# soup holds the parsed page content
# def get_adress(self):
# addressfont = TTFont('/Users/edz/Downloads/3944c230.woff')
# address_TTGlyphs = addressfont['cmap'].tables[0].ttFont.getGlyphOrder()[2:]
# address_dict = {}
# for i, x in enumerate(address_TTGlyphs):
# address_dict[x] = i
# # adress = soup("div.tag-addr > span").text()
#
# location = self.woff_change(adress, address_TTGlyphs, address_dict)
# locations = re.sub('\s', '', location)
# return locations
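# A minimal usage sketch tying woff_change to fontTools (the .woff path below
# is hypothetical). getGlyphOrder() returns names such as 'uniE5A3'; the first
# two entries are placeholder glyphs, hence the [2:] slice mirrored from the
# commented code above.
def decode_woff_text(obfuscated_text, woff_path='3944c230.woff'):
    font = TTFont(woff_path)
    glyph_names = font['cmap'].tables[0].ttFont.getGlyphOrder()[2:]
    glyph_index = {name: i for i, name in enumerate(glyph_names)}
    return DianPintCraw().woff_change(obfuscated_text, glyph_names, glyph_index)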
if __name__ == '__main__':
spider = DianPintCraw()
spider.parse_url()
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin
import gevent
logger = logging.getLogger(__name__)
......@@ -58,7 +55,6 @@ def send_email_tome():
def get_keynote_sentence(content):
try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
......@@ -78,12 +74,13 @@ def get_service_info(query="", city_name="", get_data_file=None):
break_flat = False
other_city_count = 0
for page in range(0, 3000, 10):
if break_flat == False and other_city_count < 100:
if break_flat == False and other_city_count < 10:
s = random.random()
time.sleep(s)
# time.sleep(s)
url = 'https://backend.igengmei.com/api/janus/search/v7/content?platform=iPhone&os_version=13.6.1&version=7.46.0&model=iphone%20X%20++&release=1&idfa=057F28DF-20B8-488F-A285-931367FCC110&idfv=74FE9CFB-DAD2-4379-B8F8-FC656F38BCA5&device_id=057F28DF-20B8-488F-A285-931367FCC110&uqid=47517624-F42B-469C-96EC-3BF936E44613&channel=App%20Store&app_name=gengmeiios&current_city_id={}&lat=39.98323941854652&lng=116.4880417854629&is_WiFi=1&hardware_model=iPhone12,1&ua=Mozilla/5.0%20(iPhone;%20CPU%20iPhone%20OS%2013_6_1%20like%20Mac%20OS%20X)%20AppleWebKit/605.1.15%20(KHTML,%20like%20Gecko)%20Mobile/15E148&sm_idfa=(null)&trace_id=2021/07/22/0956/53f8f1c10868&area_id=worldwide&count=10&is_first=1&is_gray=1&max_price=100000&min_price=0&offset={}&order_by=0&query={}&show_mode=1&size=10&tab_type=0'.format(
city_name, page, query)
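# The endpoint pages by offset in steps of 10 (count=10 per request);
# current_city_id, offset and query come from the format args above.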
response_res = requests.get(url, verify=False)
response_res = requests.get(url, verify=False, timeout=30)
time.sleep(1)
if response_res.status_code == 200 and response_res.text:
response = json.loads(response_res.text)
responseData = response.get("data", {}).get("cards")
......@@ -93,7 +90,6 @@ def get_service_info(query="", city_name="", get_data_file=None):
for service_data in item.get("feed", []):
city_id = service_data.get("hospital_info", {}).get("city_id")
if str(city_id) == str(city_name):
# print(service_data)
service_info = dict()
service_info['skuid'] = service_data.get("service_item_id")
service_info['美购id'] = service_data.get("service_id")
......@@ -123,9 +119,10 @@ def get_service_info(query="", city_name="", get_data_file=None):
service_info[
'链接'] = "https://m.igengmei.com/promotion/{}?sku_id={}&distribute_type=1&distribute_id=30775628&is_share=1".format(
service_info['美购id'], service_info['skuid'])
print(service_info)
get_data_file.write(json.dumps(service_info))
get_data_file.write("\n")
print("write success ", query, city_name)
else:
other_city_count += 1
......@@ -139,17 +136,13 @@ def get_service_info(query="", city_name="", get_data_file=None):
print(page, city_name, query, "finished crawling local results")
def main(city_tag=""):
def main():
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "gengmei_save_data_" + str(today) + city_tag + ".txt"
file_name = "test_gengmei_save_data_" + str(today) + ".txt"
get_data_file = open(file_name, "a+", encoding="utf-8")
cityIdMapping = {'北京': '328', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉',
'脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光',
......@@ -164,10 +157,24 @@ def main(city_tag=""):
'美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
# city_list = ["beijing", "shanghai", "guangzhou", "shenzhen", "hangzhou", "chengdu", "chongqing", "nanjing", "wuhan", "changsha", "zhengzhou", "xian"]
city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
city_list = [city_tag]
city_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市', '安庆市', '滁州市', '阜阳市', '宿州市', '六安地区', '亳州市', '池州市',
'宣城市', '福州市', '厦门市', '莆田市', '三明市', '泉州市', '漳州市', '南平市', '龙岩市', '宁德市', '韶关市', '东莞市', '中山市', '珠海市',
'佛山市', '汕头市', '江门市', '湛江市', '茂名市', '肇庆市', '惠州市', '梅州市', '河源市', '阳江市', '清远市', '潮州市', '揭阳市', '南宁市',
'柳州市', '桂林市', '梧州市', '北海市', '钦州市', '贵港市', '玉林市', '百色市', '贵阳市', '遵义市', '六盘水市', '铜仁地区', '毕节地区', '兰州市',
'张掖市', '平凉市', '庆阳市', '开封市', '洛阳市', '平顶山市', '安阳市', '新乡市', '焦作市', '濮阳市', '许昌市', '漯河市', '三门峡市', '南阳市',
'商丘市', '信阳市', '周口市', '驻马店市', '黄石市', '荆州市', '宜昌市', '襄阳市', '天门市', '十堰市', '荆门市', '孝感市', '黄冈市', '咸宁市',
'随州市', '仙桃市', '潜江市', '株洲市', '衡阳市', '益阳市', '邵阳市', '湘潭市', '岳阳市', '常德市', '郴州市', '永州市', '怀化市', '娄底市',
'石家庄市', '唐山市', '秦皇岛市', '邯郸市', '邢台市', '保定市', '张家口市', '承德市', '衡水市', '廊坊市', '沧州市', '哈尔滨市', '齐齐哈尔市',
'鸡西市', '鹤岗市', '大庆市', '佳木斯市', '七台河市', '牡丹江市', '绥化市', '海口市', '三亚市', '东方市', '苏州市', '无锡市', '常州市', '南通市',
'盐城市', '徐州市', '连云港市', '扬州市', '淮安市', '镇江市', '泰州市', '宿迁市', '南昌市', '景德镇市', '萍乡市', '九江市', '新余市', '鹰潭市',
'赣州市', '吉安市', '宜春市', '抚州市', '上饶市', '长春市', '吉林市', '四平市', '通化市', '白山市', '松原市', '沈阳市', '大连市', '鞍山市',
'抚顺市', '丹东市', '锦州市', '营口市', '阜新市', '辽阳市', '铁岭市', '朝阳市', '葫芦岛市', '呼和浩特市', '包头市', '赤峰市', '通辽市', '鄂尔多斯市',
'呼伦贝尔市', '巴彦淖尔市', '银川市', '西宁市', '自贡市', '绵阳市', '德阳市', '乐山市', '南充市', '攀枝花市', '泸州市', '广元市', '遂宁市', '内江市',
'眉山市', '宜宾市', '广安市', '达州市', '雅安市', '巴中市', '资阳市', '济南市', '青岛市', '枣庄市', '烟台市', '淄博市', '临沂市', '东营市',
'潍坊市', '济宁市', '泰安市', '威海市', '日照市', '德州市', '聊城市', '滨州市', '菏泽市', '宝鸡市', '咸阳市', '汉中市', '渭南市', '延安市',
'安康市', '太原市', '晋中市', '运城市', '大同市', '阳泉市', '长治市', '晋城市', '临汾市', '乌鲁木齐市', '石河子市', '昆明市', '曲靖市', '玉溪市',
'保山市', '昭通市', '丽江市', '宁波市', '温州市', '嘉兴市', '金华市', '台州市', '湖州市', '绍兴市', '舟山市', '衢州市', '丽水市', '台北市',
'台中市', '台北县']
for city_name in city_list:
for word in keywords:
......@@ -178,5 +185,4 @@ def main(city_tag=""):
if __name__ == '__main__':
args = sys.argv[1]
main(city_tag=args)
main()
......@@ -123,12 +123,14 @@ if __name__ == '__main__':
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
all_data = []
city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
# city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
city_list = ["gengmei_save_data_2021-07-27.txt", "gengmei_save_data_2021-07-28.txt",
"gengmei_save_data_2021-07-29.txt", "gengmei_save_data_2021-07-30.txt"]
for city_name in city_list:
file_name = "gengmei_save_data_2021-07-22" + city_name + ".txt"
if os.path.exists(file_name):
open_file = open(file_name, "r", encoding="utf-8")
print(file_name)
# file_name = "gengmei_save_data_2021-07-22" + city_name + ".txt"
if os.path.exists(city_name):
open_file = open(city_name, "r", encoding="utf-8")
print(city_name)
for item in open_file.readlines():
try:
data = json.loads(item.strip())
......
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
logger = logging.getLogger(__name__)
def send_email_tome():
try:
from_addrs = 'lixiaofang@igengmei.com'
password = 'EzJzSRyEG4Jibuy9'
toaddrs = "lixiaofang@igengmei.com"
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'result1.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
m = MIMEMultipart()
m.attach(text_apart)
m.attach(zip_apart_week)
m['From'] = formataddr(("李小芳", from_addrs))
m["To"] = formataddr(("李小芳", toaddrs))
m['Subject'] = '新氧商品信息'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(from_addrs, password)
server.sendmail(from_addrs, [toaddrs], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception as e:
print(str(e))
logger.error("catch exception,main:%s" % traceback.format_exc())
def get_keynote_sentence(content):
try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-character sentence terminators
para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
para = para.rstrip()  # drop any trailing \n at the end of the paragraph
return para
except:
logging.error("catch exception,logins:%s" % traceback.format_exc())
return []
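# A quick sanity check for get_keynote_sentence (the sample string is made up):
# tags are stripped, then a newline is inserted after each sentence terminator.
def _demo_keynote_sentence():
    sample = "<p>效果不错!医生很耐心。推荐。</p>"
    return get_keynote_sentence(sample)  # -> "效果不错!\n医生很耐心。\n推荐。"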
def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
print("get_service_info")
url = 'https://api.soyoung.com/v8/superList/index'
break_flat = False
other_city_count = 0
for page in range(1, 500):
if break_flat == False and other_city_count < 10:
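# The POST form mimics the SoYoung iOS client; the device ids, tokens and
# boundary strings appear to be fixed values captured from one real session.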
data = {'_time': '1626769752',
'ab_id': 'C521C79519A5D544390E60FEA08B32DB',
"app_id": 42,
"area_belong": 4,
"channel": 1,
"cityId": str(city_id),
"device_id": 196374256,
"device_model": "iPhone12,1",
'device_version': '13.6.1',
"event": "--Boundary+D46DCD61FE6FA268",
"filter": {},
"from_site": 1,
"gps_city_id": 1,
"idfa": "057F28DF-20B8-488F-A285-931367FCC110",
"is_tf": 0,
"item_id": "--Boundary+D46DCD61FE6FA268",
"keyword": str(keyword),
"list_name": "sy_app_superlist_search_page",
"lver": "8.28.2",
# "menu1_id": "--Boundary+D46DCD61FE6FA268",
# "menu2_id": "--Boundary+D46DCD61FE6FA268",
"page": page,
"page_size": 20,
"push_app_id": 42,
"request_id": "14d1e2b53ca644242ec7ccd7316a0aa2",
"s_mei_device_id": "20200317131719d8bcbc37c54be511421dc3ebf7f1d0a801036b566bd47092",
"s_meng_device_id": "D2VCzq4o472Ur7QtdVY6RlcjO6h3455JlJ+OC39JcQC7sX6a",
"schemecard": "--Boundary+D46DCD61FE6FA268",
# "sub_tab": "--Boundary+D46DCD61FE6FA268",
"sys": 1,
"tab": "mix",
"uid": "48804194",
"vistor_uid": "",
"xy_device_token": "33fa06111dea535c88cc07521f2e466c91",
"xy_token": "ad970db3d79f0833d1d25d3942068585"
}
s = random.random()
time.sleep(s)
response_res = requests.post(url, data, verify=False)
if response_res.status_code == 200 and response_res.text:
response = json.loads(response_res.text)
print(response)
responseData = response.get("responseData", {}).get("data")
for item in responseData:
if item.get("type") == "feed_area":
if item.get("items", {}).get("feed_list", []):
for data in item.get("items", {}).get("feed_list", []):
if data.get("type") == "feed_shop_diallel":
for service in data.get("items", []):
service_data = service.get("data")
district_2 = service_data.get("district_2")
if str(district_2) == str(city_id):
service_info = dict()
service_info['skuid'] = service_data.get("pid")
service_info['美购id'] = service_data.get("spu_id")
# service_info['医生名'] = service_data.get("doctor_name")
service_info['医院名称'] = service_data.get("hospital_name")
service_info['sku原价'] = service_data.get("price_origin")
service_info['sku活动价'] = service_data.get("price_online")
service_info['机构等级'] = service_data.get("avg_score")
service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
service_info['销量'] = service_data.get("order_cnt")
icon_data = service_data.get("icons", [])
service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
service_info['可领取预约金优惠券'] = [
service_data.get("new_user_text", "")] if service_data.get(
"new_user_text", "") else []
for icon in icon_data:  # renamed from 'item' to avoid shadowing the outer loop variable
if "预约金满" in icon:
service_info['可领取预约金优惠券'].append(icon)
elif "尾款满" in icon:
service_info['可用尾款券'].append(icon)
service_info['query词'] = keyword
service_info['城市'] = city_name
service_info['平台'] = "新氧"
service_info['链接'] = "https://m.soyoung.com/normal/cpwap{}".format(
service_info['skuid'])
if service_data.get("pid") not in all_skuids:
get_data_file.write(json.dumps(service_info))
get_data_file.write("\n")
print("write success")
else:
other_city_count += 1
else:
print("break")
break_flat = True
break
else:
print(city_id, keyword, "crawl failed")
else:
print(page, city_id, keyword, "finished crawling local results")
def main():
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + ".txt"
get_data_file = open(file_name, "a+", encoding="utf-8")
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '铜陵市': '192',
'杭州市': '175', '合肥市': '186', '芜湖市': '187', '蚌埠市': '188', '淮南市': '189', '马鞍山市': '190', '淮北市': '191',
'安庆市': '193', '滁州市': '195', '阜阳市': '196', '宿州市': '197', '六安地区': '199', '亳州市': '200', '池州市': '201',
'宣城市': '202', '福州市': '203', '厦门市': '204', '莆田市': '205', '三明市': '206', '泉州市': '207', '漳州市': '208',
'南平市': '209', '龙岩市': '210', '宁德市': '211', '韶关市': '290', '东莞市': '305', '中山市': '306', '珠海市': '292',
'佛山市': '294', '汕头市': '293', '江门市': '295', '湛江市': '296', '茂名市': '297', '肇庆市': '298', '惠州市': '299',
'梅州市': '300', '河源市': '302', '阳江市': '303', '清远市': '304', '潮州市': '307', '揭阳市': '308', '南宁市': '310',
'柳州市': '311', '桂林市': '312', '梧州市': '313', '北海市': '314', '钦州市': '316', '贵港市': '317', '玉林市': '318',
'百色市': '319', '贵阳市': '406', '遵义市': '408', '六盘水市': '407', '铜仁地区': '410', '毕节地区': '412',
'兰州市': '448', '张掖市': '454', '平凉市': '455', '庆阳市': '457', '开封市': '241', '洛阳市': '242', '平顶山市': '243',
'安阳市': '244', '新乡市': '246', '焦作市': '247', '濮阳市': '248', '许昌市': '249', '漯河市': '250', '三门峡市': '251',
'南阳市': '252', '商丘市': '253', '信阳市': '254', '周口市': '255', '驻马店市': '256', '黄石市': '259',
'荆州市': '266', '宜昌市': '261', '襄阳市': '45068', '天门市': '273', '十堰市': '260', '荆门市': '264', '孝感市': '265',
'黄冈市': '267', '咸宁市': '268', '随州市': '269', '仙桃市': '271', '潜江市': '272', '株洲市': '276', '衡阳市': '278',
'益阳市': '283', '邵阳市': '279', '湘潭市': '277', '岳阳市': '280', '常德市': '281', '郴州市': '284', '永州市': '285',
'娄底市': '287', '石家庄市': '73', '唐山市': '74', '秦皇岛市': '75', '邯郸市': '76', '邢台市': '77', '保定市': '78',
'张家口市': '79', '承德市': '80', '衡水市': '81', '廊坊市': '82', '沧州市': '83', '哈尔滨市': '130', '齐齐哈尔市': '131',
'鸡西市': '132', '鹤岗市': '133', '大庆市': '135', '佳木斯市': '137', '七台河市': '138', '牡丹江市': '139',
'绥化市': '141', '海口市': '324', '三亚市': '325', '东方市': '331', '苏州市': '166', '无锡市': '163', '常州市': '165',
'南通市': '167', '盐城市': '170', '徐州市': '164', '连云港市': '168', '扬州市': '171', '淮安市': '169', '镇江市': '172',
'泰州市': '173', '宿迁市': '174', '南昌市': '212', '景德镇市': '213', '萍乡市': '214', '九江市': '215', '新余市': '216',
'鹰潭市': '217', '赣州市': '218', '吉安市': '219', '宜春市': '220', '抚州市': '221', '上饶市': '222', '长春市': '121',
'吉林市': '122', '四平市': '123', '通化市': '125', '白山市': '126', '松原市': '127', '沈阳市': '107', '大连市': '108',
'鞍山市': '109', '抚顺市': '110', '丹东市': '112', '锦州市': '113', '营口市': '114', '阜新市': '115', '辽阳市': '116',
'铁岭市': '118', '朝阳市': '119', '葫芦岛市': '120', '呼和浩特市': '95', '包头市': '96', '赤峰市': '98', '通辽市': '99',
'鄂尔多斯市': '100', '呼伦贝尔市': '101', '巴彦淖尔市': '102', '银川市': '470', '西宁市': '462', '自贡市': '386',
'绵阳市': '390', '德阳市': '389', '乐山市': '394', '南充市': '395', '攀枝花市': '387', '泸州市': '388', '广元市': '391',
'遂宁市': '392', '内江市': '393', '眉山市': '396', '宜宾市': '397', '广安市': '398', '达州市': '399', '雅安市': '400',
'巴中市': '401', '资阳市': '402', '济南市': '223', '青岛市': '224', '枣庄市': '226', '烟台市': '228', '淄博市': '225',
'临沂市': '235', '东营市': '227', '潍坊市': '229', '济宁市': '230', '泰安市': '231', '威海市': '232', '日照市': '233',
'德州市': '236', '聊城市': '237', '滨州市': '238', '菏泽市': '239', '宝鸡市': '440', '咸阳市': '441', '汉中市': '444',
'渭南市': '442', '延安市': '443', '安康市': '446', '太原市': '84', '晋中市': '90', '运城市': '91', '大同市': '85',
'阳泉市': '86', '长治市': '87', '晋城市': '88', '临汾市': '93', '乌鲁木齐市': '475', '石河子市': '489', '昆明市': '415',
'曲靖市': '416', '玉溪市': '417', '保山市': '418', '昭通市': '419', '丽江市': '420', '宁波市': '176', '温州市': '177',
'嘉兴市': '178', '金华市': '183', '台州市': '184', '湖州市': '179', '绍兴市': '180', '舟山市': '181', '衢州市': '182',
'丽水市': '185', '台北市': '493', '台中市': '496', '怀化市': '286'}
keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀', '美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发', '黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
all_skuids = []
for city_name in city_list:
city_id = cityIdMapping.get(city_name)
for word in keywords:
get_service_info(city_id=city_id, keyword=word, city_name=city_name,
all_skuids=all_skuids, get_data_file=get_data_file)
get_data_file.close()
print(time.time() - begin)
if __name__ == '__main__':
main()
......@@ -57,7 +57,6 @@ def send_email_tome():
def get_keynote_sentence(content):
try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
......@@ -98,8 +97,6 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"keyword": str(keyword),
"list_name": "sy_app_superlist_search_page",
"lver": "8.28.2",
# "menu1_id": "--Boundary+D46DCD61FE6FA268",
# "menu2_id": "--Boundary+D46DCD61FE6FA268",
"page": page,
"page_size": 20,
"push_app_id": 42,
......@@ -107,7 +104,6 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"s_mei_device_id": "20200317131719d8bcbc37c54be511421dc3ebf7f1d0a801036b566bd47092",
"s_meng_device_id": "D2VCzq4o472Ur7QtdVY6RlcjO6h3455JlJ+OC39JcQC7sX6a",
"schemecard": "--Boundary+D46DCD61FE6FA268",
# "sub_tab": "--Boundary+D46DCD61FE6FA268",
"sys": 1,
"tab": "mix",
"uid": "48804194",
......@@ -184,23 +180,20 @@ def main(city_tag=""):
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
"""
'瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻',
"""
keywords = ['菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
'双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉',
'脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光',
'面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针',
'熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮',
'菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹',
'鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升',
'm22', '鼻翼缩小', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇',
'水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨', '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针',
'美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city_list = [city_tag]
all_skuids = []
......
import json
import logging
import smtplib
import re
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import pandas as pd
import requests
from lxml import etree
logger = logging.getLogger(__name__)
def send_email_tome():
try:
from_addrs = 'lixiaofang@igengmei.com'
password = 'EzJzSRyEG4Jibuy9'
toaddrs = "lixiaofang@igengmei.com"
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'soyoung_result.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
m = MIMEMultipart()
m.attach(text_apart)
m.attach(zip_apart_week)
m['From'] = formataddr(("李小芳", from_addrs))
m["To"] = formataddr(("李小芳", toaddrs))
m['Subject'] = '新氧商品信息'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(from_addrs, password)
server.sendmail(from_addrs, [toaddrs], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception as e:
print(str(e))
logger.error("catch exception,main:%s" % traceback.format_exc())
def get_keynote_sentence(content):
try:
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-character sentence terminators
para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
para = para.rstrip()  # drop any trailing \n at the end of the paragraph
return para
except:
logging.error("catch exception,logins:%s" % traceback.format_exc())
return []
def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
print("get_service_info")
service_info_list = []
url = 'https://api.soyoung.com/v8/superList/index'
for page in range(1, 500):
data = {'_time': '1626769752',
'ab_id': 'C521C79519A5D544390E60FEA08B32DB',
"app_id": 42,
"area_belong": 4,
"channel": 1,
"cityId": str(city_id),
"device_id": 196374256,
"device_model": "iPhone12,1",
'device_version': '13.6.1',
"event": "--Boundary+D46DCD61FE6FA268",
"filter": {},
"from_site": 1,
"gps_city_id": 1,
"idfa": "057F28DF-20B8-488F-A285-931367FCC110",
"is_tf": 0,
"item_id": "--Boundary+D46DCD61FE6FA268",
"keyword": str(keyword),
"list_name": "sy_app_superlist_search_page",
"lver": "8.28.2",
# "menu1_id": "--Boundary+D46DCD61FE6FA268",
# "menu2_id": "--Boundary+D46DCD61FE6FA268",
"page": page,
"page_size": 20,
"push_app_id": 42,
"request_id": "14d1e2b53ca644242ec7ccd7316a0aa2",
"s_mei_device_id": "20200317131719d8bcbc37c54be511421dc3ebf7f1d0a801036b566bd47092",
"s_meng_device_id": "D2VCzq4o472Ur7QtdVY6RlcjO6h3455JlJ+OC39JcQC7sX6a",
"schemecard": "--Boundary+D46DCD61FE6FA268",
# "sub_tab": "--Boundary+D46DCD61FE6FA268",
"sys": 1,
"tab": "mix",
"uid": "48804194",
"vistor_uid": "",
"xy_device_token": "33fa06111dea535c88cc07521f2e466c91",
'xy_sign': "Z1VfaYFXrpWBPeizj2VGeQ%3D%3D",
"xy_token": "ad970db3d79f0833d1d25d3942068585"
}
s = random.random()
time.sleep(s)
response_res = requests.post(url, data, verify=False)
if response_res.status_code == 200 and response_res.text:
response = json.loads(response_res.text)
responseData = response.get("responseData", {}).get("data")
for item in responseData:
if item.get("type") == "feed_area":
if item.get("items", {}).get("feed_list", []):
for data in item.get("items", {}).get("feed_list", []):
if data.get("type") == "feed_shop_diallel":
for service in data.get("items", []):
service_data = service.get("data")
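# keep only services whose district_2 is one of the 12 target city ids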
if str(service_data.get("district_2")) in ["1", "9", '22', '289', '291', '240',
'258', '275', '162', '385', '438',
'175']:
service_info = dict()
service_info['skuid'] = service_data.get("pid")
service_info['美购id'] = service_data.get("spu_id")
# service_info['医生名'] = service_data.get("doctor_name")
service_info['医院名称'] = service_data.get("hospital_name")
service_info['sku原价'] = service_data.get("price_origin")
service_info['sku活动价'] = service_data.get("price_online")
service_info['机构等级'] = service_data.get("avg_score")
service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
service_info['销量'] = service_data.get("order_cnt")
icon_data = service_data.get("icons", [])
service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
service_info['可领取预约金优惠券'] = [
service_data.get("new_user_text", "")] if service_data.get(
"new_user_text", "") else []
for icon in icon_data:  # renamed from 'item' to avoid shadowing the outer loop variable
if "预约金满" in icon:
service_info['可领取预约金优惠券'].append(icon)
elif "尾款满" in icon:
service_info['可用尾款券'].append(icon)
service_info['query词'] = keyword
service_info['城市'] = city_name
service_info['平台'] = "新氧"
service_info['链接'] = "https://y.soyoung.com/cp{}".format(service_info['skuid'])
print(service_info)
if service_data.get("pid") not in all_skuids:
get_data_file.write(json.dumps(service_info))
get_data_file.write("\n")
else:
print("break")
break
else:
# if response_res
print(city_id, keyword, "fail or end")
if __name__ == '__main__':
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "soyoung_save_data_" + str(today) + ".txt"
get_data_file = open(file_name, "a+", encoding="utf-8")
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
keywords = [
'瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针',
'玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合',
'瘦肩针', '下颌角', '线雕', '超声刀', '美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提',
'点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正',
'皮秒', '超皮秒', '植发', '黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针',
'熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸',
'埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充',
'溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推',
'鼻子', '抽脂', '光子嫩肤m22', '下颌缘提升', 'm22',
'鼻翼缩小', 'fotona4d欧洲之星', '自体脂肪全面部填充', '玻尿酸丰唇', '除皱针',
'水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕',
'眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '瘦腿', '玻尿酸丰下巴',
'纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑',
'开眼角', '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
city_list = ["北京", ""
"", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
all_skuids = []
for city_name in city_list:
city_id = cityIdMapping.get(city_name)
for word in keywords:
get_service_info(city_id=city_id, keyword=word, city_name=city_name,
all_skuids=all_skuids, get_data_file=get_data_file)
get_data_file.close()
all_data = []
if os.path.exists(file_name):
open_file = open(file_name, "r", encoding="utf-8")
for item in open_file.readlines():
data = json.loads(item.strip())
data['美购名称'] = get_keynote_sentence(data.get("美购名称"))
all_data.append(data)  # data is already a dict; json.loads(data) would raise TypeError
open_file.close()
res = pd.DataFrame(all_data)
res.to_csv("soyoung_result.csv", encoding="gb18030")
send_email_tome()
print(time.time() - begin)
import csv
def read_cika_info():
all_cika_title = {}
cika_file = open("soyoung_service_cika.csv", "r", encoding="utf-8")
for item in cika_file.readlines():
cika = eval(item)
for key, values in cika.items():
all_cika_title[key] = values
print(all_cika_title)
cika_file.close()
return all_cika_title
def np_write_csv_data():
all_cika_title = read_cika_info()
write_file = open("soyoung_service_write_cika.csv", "a+", encoding="utf-8")
# /Users/edz/Desktop/xinyang.csv
with open("soyoung_service.csv", encoding='utf-8') as f:
reader = csv.reader(f)
header_row = next(reader)
print(header_row)
for row in reader:
service_id = row[2]
cika_info = all_cika_title.get(service_id)
if cika_info:
print('cika_info:', cika_info)
row.append(cika_info)  # list.append returns None, so append first, then write the row
write_file.write(str(row))
write_file.write("\n")
write_file.close()
if __name__ == '__main__':
np_write_csv_data()
......@@ -3942,25 +3942,91 @@ city_info = [
cityId_mapping = dict()
city = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
# city = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city = ['东莞', '福州', '贵阳', '海口', '合肥', '昆明', '南昌', '南宁', '宁波', '三亚', '沈阳', '苏州', '天津', '乌鲁木齐', '阿拉善', '安康', '安宁', '安庆',
'鞍山', '安顺', '安阳', '百色', '白沙', '白山', '保定', '宝鸡', '保山', '包头', '巴彦淖尔', '巴中', '北海', '蚌埠', '本溪', '毕节', '滨州', '亳州',
'沧州', '长春', '常德', '常熟', '长治', '常州', '巢湖', '朝阳', '潮州', '承德', '郴州', '赤峰', '池州', '楚雄', '滁州', '大理', '大连', '丹东',
'大庆', '大同', '达州', '德宏', '德阳', '德州', '东方', '东京', '东阳', '东营', '鄂尔多斯', '恩施', '佛山', '抚顺', '阜新', '阜阳', '抚州', '甘南',
'赣州', '广安', '广元', '贵港', '桂林', '果洛', '固原', '哈尔滨', '海南', '邯郸', '汉中', '鹤岗', '黑河', '衡水', '衡阳', '河源', '菏泽', '红河',
'淮安', '淮北', '怀化', '淮南', '黄冈', '黄山', '黄石', '呼和浩特', '惠州', '葫芦岛', '呼伦贝尔', '湖州', '佳木斯', '吉安', '江门', '江油', '焦作',
'嘉兴', '揭阳', '吉林', '济南', '晋城', '景德镇', '荆门', '荆州', '金华', '济宁', '晋中', '锦州', '九江', '鸡西', '济州岛', '开封', '喀什', '克拉玛依',
'库尔勒', '昆山', '莱芜', '廊坊', '兰州', '拉萨', '乐山', '凉山', '连云港', '聊城', '辽阳', '辽源', '丽江', '临汾', '临夏', '临沂', '丽水', '六安',
'六盘水', '柳州', '龙岩', '娄底', '漯河', '洛阳', '泸州', '马鞍山', '曼谷', '茂名', '眉山', '梅州', '绵阳', '牡丹江', '南充', '南平', '南通', '南阳',
'内江', '宁德', '攀枝花', '平顶山', '平凉', '萍乡', '普洱', '莆田', '濮阳', '潜江', '青岛', '庆阳', '清远', '秦皇岛', '钦州', '齐齐哈尔', '七台河',
'泉州', '曲靖', '衢州', '日照', '三门峡', '三明', '商丘', '上饶', '汕头', '韶关', '绍兴', '邵阳', '嵊州', '石河子', '石家庄', '十堰', '市中心', '石嘴山',
'首尔', '双鸭山', '四平', '松原', '绥化', '遂宁', '随州', '宿迁', '宿州', '泰安', '台北', '太仓', '太原', '台中', '泰州', '台州', '唐山', '天门',
'天水', '铁岭', '铜川', '通化', '通辽', '铜陵', '铜仁', '潍坊', '威海', '渭南', '温州', '芜湖', '武威', '无锡', '吴忠', '梧州', '厦门', '中国香港',
'湘潭', '湘西', '襄阳', '咸宁', '仙桃', '咸阳', '孝感', '邢台', '悉尼', '西宁', '新加坡', '新乡', '信阳', '新余', '西双版纳', '宣城', '许昌', '徐州',
'雅安', '延安', '延边', '盐城', '阳江', '阳泉', '扬州', '延吉', '烟台', '宜宾', '宜昌', '伊春', '宜春', '银川', '营口', '鹰潭', '义乌', '益阳',
'永州', '岳阳', '玉林', '运城', '玉溪', '枣庄', '张家港', '张家口', '张掖', '漳州', '湛江', '肇庆', '昭通', '镇江', '中山', '周口', '舟山', '珠海',
'驻马店', '株洲', '淄博', '自贡', '资阳', '遵义']
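# Match every level-2 city from city_info against the target list, also trying
# the name with its trailing one- or two-character suffix ('市'/'地区') stripped.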
for item in city_info:
if "level" in item.keys():
cityId = item.get("id")
city_Name = item.get("name")
cityId_mapping[city_Name] = []
if 'son' in item.keys():
for level2Item in item.get("son", []):
cityId = level2Item.get("id")
cityName = level2Item.get("name")
cityId_mapping[city_Name].append(cityName)
level = level2Item.get("level")
if int(level) == 2:
if cityName in city:
cityId_mapping[cityName] = cityId
elif cityName[:-1] in city:
cityId_mapping[cityName] = cityId
elif cityName[:-2] in city:
cityId_mapping[cityName] = cityId
else:
print(cityName)
print(cityId_mapping)
import datetime
# print(len(city))
# print(cityId_mapping)
# print(len(cityId_mapping))
# print(cityId_mapping.keys())
dict_keys = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市', '安庆市', '滁州市', '阜阳市', '宿州市', '六安地区', '亳州市', '池州市', '宣城市',
'福州市', '厦门市', '莆田市', '三明市', '泉州市', '漳州市', '南平市', '龙岩市', '宁德市', '韶关市', '东莞市', '中山市', '珠海市', '佛山市', '汕头市',
'江门市', '湛江市', '茂名市', '肇庆市', '惠州市', '梅州市', '河源市', '阳江市', '清远市', '潮州市', '揭阳市', '南宁市', '柳州市', '桂林市', '梧州市',
'北海市', '钦州市', '贵港市', '玉林市', '百色市', '贵阳市', '遵义市', '六盘水市', '铜仁地区', '毕节地区', '兰州市', '张掖市', '平凉市', '庆阳市', '开封市',
'洛阳市', '平顶山市', '安阳市', '新乡市', '焦作市', '濮阳市', '许昌市', '漯河市', '三门峡市', '南阳市', '商丘市', '信阳市', '周口市', '驻马店市', '黄石市',
'荆州市', '宜昌市', '襄阳市', '天门市', '十堰市', '荆门市', '孝感市', '黄冈市', '咸宁市', '随州市', '仙桃市', '潜江市', '株洲市', '衡阳市', '益阳市',
'邵阳市', '湘潭市', '岳阳市', '常德市', '郴州市', '永州市', '怀化市', '娄底市', '石家庄市', '唐山市', '秦皇岛市', '邯郸市', '邢台市', '保定市', '张家口市',
'承德市', '衡水市', '廊坊市', '沧州市', '哈尔滨市', '齐齐哈尔市', '鸡西市', '鹤岗市', '大庆市', '佳木斯市', '七台河市', '牡丹江市', '绥化市', '海口市',
'三亚市', '东方市', '苏州市', '无锡市', '常州市', '南通市', '盐城市', '徐州市', '连云港市', '扬州市', '淮安市', '镇江市', '泰州市', '宿迁市', '南昌市',
'景德镇市', '萍乡市', '九江市', '新余市', '鹰潭市', '赣州市', '吉安市', '宜春市', '抚州市', '上饶市', '长春市', '吉林市', '四平市', '通化市', '白山市',
'松原市', '沈阳市', '大连市', '鞍山市', '抚顺市', '丹东市', '锦州市', '营口市', '阜新市', '辽阳市', '铁岭市', '朝阳市', '葫芦岛市', '呼和浩特市', '包头市',
'赤峰市', '通辽市', '鄂尔多斯市', '呼伦贝尔市', '巴彦淖尔市', '银川市', '西宁市', '自贡市', '绵阳市', '德阳市', '乐山市', '南充市', '攀枝花市', '泸州市',
'广元市', '遂宁市', '内江市', '眉山市', '宜宾市', '广安市', '达州市', '雅安市', '巴中市', '资阳市', '济南市', '青岛市', '枣庄市', '烟台市', '淄博市',
'临沂市', '东营市', '潍坊市', '济宁市', '泰安市', '威海市', '日照市', '德州市', '聊城市', '滨州市', '菏泽市', '宝鸡市', '咸阳市', '汉中市', '渭南市',
'延安市', '安康市', '太原市', '晋中市', '运城市', '大同市', '阳泉市', '长治市', '晋城市', '临汾市', '乌鲁木齐市', '石河子市', '昆明市', '曲靖市', '玉溪市',
'保山市', '昭通市', '丽江市', '宁波市', '温州市', '嘉兴市', '金华市', '台州市', '湖州市', '绍兴市', '舟山市', '衢州市', '丽水市', '台北市', '台中市',
'台北县']
# from datetime import datetime, date, timedelta
today = datetime.datetime.now().date().strftime('%Y-%m-%d')
yesterday = (datetime.datetime.now() + datetime.timedelta(days=-1)).date().strftime('%Y-%m-%d')
print(yesterday)
mp = {'合肥市': '186', '芜湖市': '187', '蚌埠市': '188', '淮南市': '189', '马鞍山市': '190', '淮北市': '191', '铜陵市': '192', '安庆市': '193',
'滁州市': '195', '阜阳市': '196', '宿州市': '197', '六安地区': '199', '亳州市': '200', '池州市': '201', '宣城市': '202', '福州市': '203',
'厦门市': '204', '莆田市': '205', '三明市': '206', '泉州市': '207', '漳州市': '208', '南平市': '209', '龙岩市': '210', '宁德市': '211',
'韶关市': '290', '东莞市': '305', '中山市': '306', '珠海市': '292', '佛山市': '294', '汕头市': '293', '江门市': '295', '湛江市': '296',
'茂名市': '297', '肇庆市': '298', '惠州市': '299', '梅州市': '300', '河源市': '302', '阳江市': '303', '清远市': '304', '潮州市': '307',
'揭阳市': '308', '南宁市': '310', '柳州市': '311', '桂林市': '312', '梧州市': '313', '北海市': '314', '钦州市': '316', '贵港市': '317',
'玉林市': '318', '百色市': '319', '贵阳市': '406', '遵义市': '408', '六盘水市': '407', '铜仁地区': '410', '毕节地区': '412', '兰州市': '448',
'张掖市': '454', '平凉市': '455', '庆阳市': '457', '开封市': '241', '洛阳市': '242', '平顶山市': '243', '安阳市': '244', '新乡市': '246',
'焦作市': '247', '濮阳市': '248', '许昌市': '249', '漯河市': '250', '三门峡市': '251', '南阳市': '252', '商丘市': '253', '信阳市': '254',
'周口市': '255', '驻马店市': '256', '黄石市': '259', '荆州市': '266', '宜昌市': '261', '襄阳市': '45068', '天门市': '273', '十堰市': '260',
'荆门市': '264', '孝感市': '265', '黄冈市': '267', '咸宁市': '268', '随州市': '269', '仙桃市': '271', '潜江市': '272', '株洲市': '276',
'衡阳市': '278', '益阳市': '283', '邵阳市': '279', '湘潭市': '277', '岳阳市': '280', '常德市': '281', '郴州市': '284', '永州市': '285',
'怀化市': '286', '娄底市': '287', '石家庄市': '73', '唐山市': '74', '秦皇岛市': '75', '邯郸市': '76', '邢台市': '77', '保定市': '78',
'张家口市': '79', '承德市': '80', '衡水市': '81', '廊坊市': '82', '沧州市': '83', '哈尔滨市': '130', '齐齐哈尔市': '131', '鸡西市': '132',
'鹤岗市': '133', '大庆市': '135', '佳木斯市': '137', '七台河市': '138', '牡丹江市': '139', '绥化市': '141', '海口市': '324', '三亚市': '325',
'东方市': '331', '苏州市': '166', '无锡市': '163', '常州市': '165', '南通市': '167', '盐城市': '170', '徐州市': '164', '连云港市': '168',
'扬州市': '171', '淮安市': '169', '镇江市': '172', '泰州市': '173', '宿迁市': '174', '南昌市': '212', '景德镇市': '213', '萍乡市': '214',
'九江市': '215', '新余市': '216', '鹰潭市': '217', '赣州市': '218', '吉安市': '219', '宜春市': '220', '抚州市': '221', '上饶市': '222',
'长春市': '121', '吉林市': '122', '四平市': '123', '通化市': '125', '白山市': '126', '松原市': '127', '沈阳市': '107', '大连市': '108',
'鞍山市': '109', '抚顺市': '110', '丹东市': '112', '锦州市': '113', '营口市': '114', '阜新市': '115', '辽阳市': '116', '铁岭市': '118',
'朝阳市': '119', '葫芦岛市': '120', '呼和浩特市': '95', '包头市': '96', '赤峰市': '98', '通辽市': '99', '鄂尔多斯市': '100', '呼伦贝尔市': '101',
'巴彦淖尔市': '102', '银川市': '470', '西宁市': '462', '自贡市': '386', '绵阳市': '390', '德阳市': '389', '乐山市': '394', '南充市': '395',
'攀枝花市': '387', '泸州市': '388', '广元市': '391', '遂宁市': '392', '内江市': '393', '眉山市': '396', '宜宾市': '397', '广安市': '398',
'达州市': '399', '雅安市': '400', '巴中市': '401', '资阳市': '402', '济南市': '223', '青岛市': '224', '枣庄市': '226', '烟台市': '228',
'淄博市': '225', '临沂市': '235', '东营市': '227', '潍坊市': '229', '济宁市': '230', '泰安市': '231', '威海市': '232', '日照市': '233',
'德州市': '236', '聊城市': '237', '滨州市': '238', '菏泽市': '239', '宝鸡市': '440', '咸阳市': '441', '汉中市': '444', '渭南市': '442',
'延安市': '443', '安康市': '446', '太原市': '84', '晋中市': '90', '运城市': '91', '大同市': '85', '阳泉市': '86', '长治市': '87',
'晋城市': '88', '临汾市': '93', '乌鲁木齐市': '475', '石河子市': '489', '昆明市': '415', '曲靖市': '416', '玉溪市': '417', '保山市': '418',
'昭通市': '419', '丽江市': '420', '宁波市': '176', '温州市': '177', '嘉兴市': '178', '金华市': '183', '台州市': '184', '湖州市': '179',
'绍兴市': '180', '舟山市': '181', '衢州市': '182', '丽水市': '185', '台北市': '493', '台中市': '496', '台北县': '500'}
import json
import logging
import smtplib
import socket
import time
import traceback
import datetime
import os
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# exception classes imported from requests.exceptions
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
from retrying import retry
import pandas as pd
import requests
from lxml import etree
logger = logging.getLogger(__name__)
def send_email_tome():
try:
from_addrs = 'lixiaofang@igengmei.com'
password = 'EzJzSRyEG4Jibuy9'
toaddrs = "lixiaofang@igengmei.com"
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'result1.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
m = MIMEMultipart()
m.attach(text_apart)
m.attach(zip_apart_week)
m['From'] = formataddr(("李小芳", from_addrs))
m["To"] = formataddr(("李小芳", toaddrs))
m['Subject'] = '新氧商品信息'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(from_addrs, password)
server.sendmail(from_addrs, [toaddrs], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception as e:
print(str(e))
logger.error("catch exception,main:%s" % traceback.format_exc())
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.info_headers = {
"cookie": "__usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; __order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; cityId=9; Hm_lvt_757750800edc4c9eade200248b2aa23f=1626426185,1626426729,1626510306,1626676317; Hm_lpvt_757750800edc4c9eade200248b2aa23f=1626676317; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626676317",
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.title = [
"query词", "城市", "平台", "美购id", "skuid",
"美购名称", "sku名称", "医院名称", "机构等级", "医生名",
"销量", "sku原价", "sku活动价", "可领取预约金优惠券", "可用尾款券",
"链接"
]
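# Fixed column order; each record is sorted against this list before being written.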
# @retry re-raises only if every attempt fails; a single success lets execution continue.
# wait_fixed=1000 waits one second between attempts, retrying up to five times.
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
url = "https://y.soyoung.com/cp{}".format(pid)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
# parse the returned HTML into an lxml tree
page_obj = etree.HTML(requests_res.text)
return page_obj, url
except ReadTimeout:
print('timeout')
return None, None
except HTTPError:
print('httperror')
return None, None
# any other request failure
except RequestException:
print('reqerror')
return None, None
except socket.timeout:
print(socket.timeout)
return None, None
except ReadTimeoutError:
print("ReadTimeoutError")
return None, None
except ConnectionError:
print("ConnectionError")
return None, None
def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \
"keyword={0}&cityId=&page_size=&_json=1&sort=0&service=&coupon=&group=&maxprice=&minprice=&page={1}" \
.format(query, page)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
res_json = requests_res.json()
return res_json
except ReadTimeout:
print('timeout, moving on to the next request')
return None
except HTTPError:
print('httperror')
return None
# any other request failure
except RequestException:
print('reqerror')
return None
except socket.timeout:
print(socket.timeout)
return None
except ReadTimeoutError:
print("ReadTimeoutError")
return None
except ConnectionError:
print("ConnectionError")
return None
def get_services_list(self, res_json, query="", city_name="", city_id=-1):
page_service_pids = []
current_end_flat = False
for service in res_json.get("responseData", {}).get("arr_product", []):
current_city = service.get("district_2")
if int(current_city) == int(city_id):
pid = service.get("pid")
spu_id = service.get("spu_id")
doctor_name = service.get("doctor_name")
hospital_name = service.get("hospital_name")
service_info = self.get_service_base_info(pid)
service_info['美购id'] = spu_id
service_info['sku原价'] = service.get("price_origin")
service_info['sku活动价'] = service.get("price_online")
service_info['机构等级'] = service.get("avg_score")
service_info['美购名称'] = service.get("title")
service_info['销量'] = service.get("order_cnt")
service_info['skuid'] = pid
service_info['医生名'] = doctor_name
service_info['医院名称'] = hospital_name
service_info['query词'] = query
service_info['城市'] = city_name
service_info['平台'] = "新氧"
service_info['链接'] = "https://y.soyoung.com/cp{}".format(pid)
sort_service_info = sorted(service_info.items(), key=lambda x: self.title.index(x[0]), reverse=False)
page_service_pids.append(dict(sort_service_info))
else:
current_end_flat = True
break
return page_service_pids, current_end_flat
def get_service_base_info(self, pid):
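# Scrape coupon and payment details from the product detail page via absolute XPaths.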
service_info = dict()
res_json, url = self.get_service_base_info_list(pid)
if res_json:
res_json = res_json.xpath(
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
# service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
# service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
# service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
# 0].strip() # 1980
# service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
# 0].strip() # 110
# service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
# 0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
for vip_info in base_info.xpath("div/dl[@class='base-param']/dd[@class='app-vip']/div"):
vip_str_info = ""
vip_title = vip_info.xpath("div[@class='label']/text()")[0].strip() if vip_info.xpath(
"div[@class='label']/text()") else ""
if vip_title in ["支持分期"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_str_info += vip_title + ":" + vip_data
elif vip_title in ["尾款红包"]:
vip_youhui = []
for youhui in vip_info.xpath("div[@class='text']/span"):
vip_data = youhui.xpath("em/text()")
vip_youhui.append(vip_data[0] + "元红包满" + vip_data[1] + "可用")
vip_str_info += vip_title + ":" + " ".join(vip_youhui)
elif vip_title in ["氧分抵扣"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_money = vip_info.xpath("div[@class='text']/em/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/em/text()") else ""
vip_str_info += vip_title + ":" + vip_data + str(vip_money) + "元"
else:
pass
service_info['可领取预约金优惠券'].append(vip_str_info)
for pay_info in base_info.xpath("div/div[@class='base-buy']/div[@class='price-box']"):
deposit_title = pay_info.xpath("span/i/text()")[0].strip()
deposit_price = pay_info.xpath("span/em/text()")[0].strip()
to_pay_title = pay_info.xpath("p/text()")[0].strip()
to_pay_price = pay_info.xpath("p/span/text()")[0].strip()
service_info['可用尾款券'].append(
deposit_title + ":" + deposit_price + "," + to_pay_title + ":" + to_pay_price
)
else:
pass
return service_info
class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
'双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["南京市", "北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 500
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
# self.read_data_file = open(self.file_name, "r", encoding="utf-8")
def get_have_spider_keywords(self):
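# Count saved records per '<query><city>' pair so an interrupted run can resume
# and skip pairs that already have enough data.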
have_get_service_info = {}
if os.path.exists(self.file_name):
read_data_file = open(self.file_name, "r", encoding="utf-8")
# first load the query/city pairs whose data is already on disk
for item in read_data_file.readlines():
data = json.loads(item.strip())
query = data.get("query词")
city_name = data.get("城市")
word = query + city_name
if str(word) in have_get_service_info.keys():
have_get_service_info[str(word)] += 1
else:
have_get_service_info[str(word)] = 1
read_data_file.close()
return have_get_service_info
def run(self):
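# Sweep city x keyword x page, skipping pairs that already have 10+ saved records.
# get_lasted_data only reaches length 3 once the final city, keyword and page have
# all been visited, which tells the __main__ retry loop that the sweep is complete.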
get_data_file = open(self.file_name, "a+", encoding="utf-8")
get_lasted_data = []
for city_name in self.city_list:  # popular cities
city_id = self.cityIdMapping[city_name]
crawler_xinyang = CrawlerMain(city_id=city_id)
# print(city_name, self.city_list.index(city_name), len(self.city_list) - 1)
if self.city_list.index(city_name) == len(self.city_list) - 1:
get_lasted_data.append(city_name)
for keyword in self.keywords:  # hot search keywords
# print(keyword, self.keywords.index(keyword), len(self.keywords) - 1)
if self.keywords.index(keyword) == len(self.keywords) - 1 and len(get_lasted_data) == 1:
get_lasted_data.append(keyword)
for page in range(1, self.page_num):  # filter the top 100 results
if self.page_num == page + 1 and len(get_lasted_data) == 2:
get_lasted_data.append(page)
word = str(keyword + city_name)
if word not in self.have_get_service_info.keys() or self.have_get_service_info[word] < 10:
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
service_info_list, current_end_flat = crawler_xinyang.get_services_list(res_json=resJson,
query=keyword,
city_name=city_name,
city_id=city_id)
for data in service_info_list:
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
if current_end_flat == True:
break
else:
pass
time.sleep(1)
get_data_file.close()
print("get_lasted_data:", get_lasted_data)
if len(get_lasted_data) == 3:
return True
else:
return False
if __name__ == "__main__":
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + ".txt"
while (True):
spider_obj = SoYongSpider(file_name)
flat = spider_obj.run()
print("flat:", flat)
if flat == True:
break
all_data = []
open_file = open(file_name, "r", encoding="utf-8")
for item in open_file.readlines():
all_data.append(json.loads(item))
res = pd.DataFrame(all_data)
res.to_csv("result1.csv", encoding="gb18030")
send_email_tome()
open_file.close()
print(time.time() - begin)
print("end")
import json
import logging
import smtplib
import socket
import time
import traceback
import datetime
import os
import sys
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# exception classes imported from requests.exceptions
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
from retrying import retry
import pandas as pd
import requests
from lxml import etree
logger = logging.getLogger(__name__)
def send_email_tome():
try:
from_addrs = 'lixiaofang@igengmei.com'
password = 'EzJzSRyEG4Jibuy9'
toaddrs = "lixiaofang@igengmei.com"
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'result1.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
m = MIMEMultipart()
m.attach(text_apart)
m.attach(zip_apart_week)
m['From'] = formataddr(("李小芳", from_addrs))
m["To"] = formataddr(("李小芳", toaddrs))
m['Subject'] = '新氧商品信息'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(from_addrs, password)
server.sendmail(from_addrs, [toaddrs], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception as e:
print(str(e))
logger.error("catch exception,main:%s" % traceback.format_exc())
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.info_headers = {
"cookie": "__usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; __order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; cityId=9; Hm_lvt_757750800edc4c9eade200248b2aa23f=1626426185,1626426729,1626510306,1626676317; Hm_lpvt_757750800edc4c9eade200248b2aa23f=1626676317; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626676317",
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.title = [
"query词", "城市", "平台", "美购id", "skuid",
"美购名称", "sku名称", "医院名称", "机构等级", "医生名",
"销量", "sku原价", "sku活动价", "可领取预约金优惠券", "可用尾款券",
"链接"
]
# The call only fails if every attempt raises; a single success ends the retries.
# Wait 1 second (wait_fixed=1000 ms) between attempts, up to 5 attempts in total.
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
url = "https://y.soyoung.com/cp{}".format(pid)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
# requests.Response is truthy for status codes below 400
if requests_res:
page_obj = etree.HTML(requests_res.text)
return page_obj, url
else:
return None, None
except ReadTimeout:
print('timeout')
return None, None
except HTTPError:
print('httperror')
return None, None
# ConnectionError must be caught before RequestException, which it subclasses
except ConnectionError:
print("ConnectionError")
return None, None
# Any other requests-level failure
except RequestException:
print('reqerror')
return None, None
# "timeout" is socket.timeout, imported above
except timeout:
print("socket timeout")
return None, None
except ReadTimeoutError:
print("ReadTimeoutError")
return None, None
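# Fetch one page of search results for a keyword; the endpoint returns JSON when _json=1.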
def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \
"keyword={0}&cityId=&page_size=&_json=1&sort=0&service=&coupon=&group=&maxprice=&minprice=&page={1}" \
.format(query, page)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
if requests_res:
res_json = requests_res.json()
return res_json
else:
return None
except ReadTimeout:
print('timed out, moving on to the next request')
return None
except HTTPError:
print('httperror')
return None
# ConnectionError must be caught before RequestException, which it subclasses
except ConnectionError:
print("ConnectionError")
return None
# Any other requests-level failure
except RequestException:
print('reqerror')
return None
except timeout:
print("socket timeout")
return None
except ReadTimeoutError:
print("ReadTimeoutError")
return None
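# Collect services from one result page; the loop assumes results for the target city are
# listed first, so the first record from another city marks the end of the local results.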
def get_services_list(self, res_json, query="", city_name="", city_id=-1):
page_service_pids = []
current_end_flat = False
if res_json:
for service in res_json.get("responseData", {}).get("arr_product", []):
current_city = service.get("district_2")
# guard against a missing district_2 before converting to int
if current_city and int(current_city) == int(city_id):
pid = service.get("pid")
spu_id = service.get("spu_id")
doctor_name = service.get("doctor_name")
hospital_name = service.get("hospital_name")
service_info = self.get_service_base_info(pid)
service_info['美购id'] = spu_id
service_info['sku原价'] = service.get("price_origin")
service_info['sku活动价'] = service.get("price_online")
service_info['机构等级'] = service.get("avg_score")
service_info['美购名称'] = service.get("title")
service_info['销量'] = service.get("order_cnt")
service_info['skuid'] = pid
service_info['医生名'] = doctor_name
service_info['医院名称'] = hospital_name
service_info['query词'] = query
service_info['城市'] = city_name
service_info['平台'] = "新氧"
service_info['链接'] = "https://y.soyoung.com/cp{}".format(pid)
sort_service_info = sorted(service_info.items(), key=lambda x: self.title.index(x[0]),
reverse=False)
page_service_pids.append(dict(sort_service_info))
else:
current_end_flat = True
break
return page_service_pids, current_end_flat
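# Scrape coupon, installment and deposit details from a service's detail page.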
def get_service_base_info(self, pid):
service_info = dict()
res_json, url = self.get_service_base_info_list(pid)
if res_json:
res_json = res_json.xpath(
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
# service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
# service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
# service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
# 0].strip() # 1980
# service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
# 0].strip() # 110
# service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
# 0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
for vip_info in base_info.xpath("div/dl[@class='base-param']/dd[@class='app-vip']/div"):
vip_str_info = ""
vip_title = vip_info.xpath("div[@class='label']/text()")[0].strip() if vip_info.xpath(
"div[@class='label']/text()") else ""
if vip_title in ["支持分期"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_str_info += vip_title + ":" + vip_data
elif vip_title in ["尾款红包"]:
vip_youhui = []
for youhui in vip_info.xpath("div[@class='text']/span"):
vip_data = youhui.xpath("em/text()")
vip_youhui.append(vip_data[0] + "元红包满" + vip_data[1] + "可用")
vip_str_info += vip_title + ":" + " ".join(vip_youhui)
elif vip_title in ["氧分抵扣"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_money = vip_info.xpath("div[@class='text']/em/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/em/text()") else ""
vip_str_info += vip_title + ":" + vip_data + str(vip_money) + "元"
else:
pass
service_info['可领取预约金优惠券'].append(vip_str_info)
for pay_info in base_info.xpath("div/div[@class='base-buy']/div[@class='price-box']"):
deposit_title = pay_info.xpath("span/i/text()")[0].strip()
deposit_price = pay_info.xpath("span/em/text()")[0].strip()
to_pay_title = pay_info.xpath("p/text()")[0].strip()
to_pay_price = pay_info.xpath("p/span/text()")[0].strip()
service_info['可用尾款券'].append(
deposit_title + ":" + deposit_price + "," + to_pay_title + ":" + to_pay_price
)
else:
pass
return service_info
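# Drives CrawlerMain across hot keywords and cities, resuming from an earlier output file.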
class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
'双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 500
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
# self.read_data_file = open(self.file_name, "r", encoding="utf-8")
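# Count existing records per (keyword + city) so finished pairs can be skipped on restart.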
def get_have_spider_keywords(self):
have_get_service_info = {}
if os.path.exists(self.file_name):
read_data_file = open(self.file_name, "r", encoding="utf-8")
# First load the records that were already crawled in a previous run
for item in read_data_file.readlines():
data = json.loads(item.strip())
query = data.get("query词")
city_name = data.get("城市")
word = query + city_name
have_get_service_info[word] = have_get_service_info.get(word, 0) + 1
read_data_file.close()
return have_get_service_info
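# Crawl every keyword for one city tag, appending one JSON record per output line.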
def run(self, city_tags):
get_data_file = open(self.file_name, "a+", encoding="utf-8")
self.city_list = [city_tags]
for city_name in self.city_list: # hot cities
city_id = self.cityIdMapping[city_name]
crawler_xinyang = CrawlerMain(city_id=city_id)
for keyword in self.keywords: # hot search keywords
for page in range(1, self.page_num): # page through up to self.page_num pages of results
word = str(keyword + city_name)
if word not in self.have_get_service_info.keys() or self.have_get_service_info[word] < 10:
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
service_info_list, current_end_flat = crawler_xinyang.get_services_list(res_json=resJson,
query=keyword,
city_name=city_name,
city_id=city_id)
for data in service_info_list:
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
if current_end_flat:
break
time.sleep(1)
get_data_file.close()
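# Entry point: build a dated, city-tagged output file name and run the spider for that city.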
def main(city_tag):
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + city_tag + ".txt"
spider_obj = SoYongSpider(file_name)
spider_obj.run(city_tags=city_tag)
print(time.time() - begin)
print("end")
if __name__ == "__main__":
args = sys.argv[1]
main(city_tag=args)
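# Example invocation (script file name assumed): python soyoung_spider.py 北京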
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
Q&A entries under each tag of the Soyoung https://www.soyoung.com/itemk// page
"""
import numpy as np
import random
import argparse
import json, redis, re, requests
from selenium.webdriver import ActionChains
import time, datetime, copy
from selenium import webdriver
# from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
# import cv2
from fontTools.ttLib import *
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from concurrent.futures import ProcessPoolExecutor
from lxml import etree
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from bs4 import BeautifulSoup
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int,
help=('The max page numbers'))
parser.add_argument('-t', '--style_tag', default="", type=str,
help=('style_tag'))
parser.add_argument('-c', '--countries', default="", type=str,
help=('countries'))
args = parser.parse_args()
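# One-off fixup: copy redis hashes that lack a "directors" field into the db-15 instance.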
def revise_data():
scan_re = rds_list.scan_iter()
for one_scan in scan_re:
# print(one_scan)
data = rds_list.hgetall(one_scan)
# data["title"] = data["title"].replace("\r", "").replace("\n", "")
# data["describe"] = data["describe"].replace("\r", "").replace("\n", "")
if not data.get("directors"):
rds_get.hmset(one_scan, data)
# rds_list.hmset(one_scan,data)
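# Crawls the Q&A entries under each tag of the soyoung.com itemk pages.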
class Crawler_main(object):
def __init__(self):
# self.chrome_options = webdriver.ChromeOptions()
# # self.chrome_options.add_argument('--headless')
# self.chrome_options.add_argument('--disable-gpu')
# # self.chrome_options.add_argument("--start-maximized")
# self.chrome_options.add_argument("--no-sandbox")
# self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
# prefs = {"profile.managed_default_content_settings.images": 2}
# self.chrome_options.add_experimental_option("prefs", prefs)
# self.driver = webdriver.Chrome(options=self.chrome_options)
self.headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
"referer": "https://www.soyoung.com/itemk//",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
}
self.one_video_dic = {
"platform": "douban",
"title": "",
"url": "",
"describe": "",
}
def __exit__(self):
# self.driver.close()
pass
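# Walk the tag tree on the index page and yield one dict per (tag, question, answer).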
def list_page(self, releaserUrl="https://www.soyoung.com/itemk//",
tag_list_xpath=None,
):
offset = 0
count_false = 0
# NOTE: a proxy is fetched here but never passed to the requests.get calls below
proxies = get_proxy(0)
requests_res = requests.get(releaserUrl, headers=self.headers, allow_redirects=False, timeout=5)
page_obj = etree.HTML(requests_res.text)
obj_list = page_obj.xpath("/html[1]/body[1]/div")
for first_title_obj in obj_list:
try:
tag_id = first_title_obj.xpath("./@id")[0]
print(tag_id)
first_title = first_title_obj.xpath("./div[1]/div[1]/text()")[0].strip()
print("first_title", first_title)
except Exception:
continue
second_title_str_obj_list = first_title_obj.xpath("./div[1]/div[2]/div[1]/div[1]/a")
if 'product100' in tag_id:
second_title_obj_list = first_title_obj.xpath("./div[2]/div")
for count_tag, one_second_title_obj in enumerate(second_title_obj_list):
second_title = second_title_str_obj_list[count_tag].xpath("./text()")[0].strip()
second_id = second_title_str_obj_list[count_tag].xpath("./@data-id")[0].strip()
# second_obj_list = one_second_title_obj.xpath("./div[2]/div")
print("second_title", second_title)
for third_title_obj_product in self.get_third_tag_list(second_id):
# third_title_obj_list = one_third_title_obj.xpath("./div[2]/div")
# third_name = third_title_obj_product.xpath("./div[1]/text()")[0].strip()
# third_name_info = third_title_obj_product.xpath("./div[1]/span[1]/text()")[0].strip()
# third_name_des = third_title_obj_product.xpath("./p[1]/text()")[0].strip()
# third_name_url = "https:" + third_title_obj_product.xpath("./@data-url")[0].strip()
# print(third_title_obj_product)
third_name = third_title_obj_product.get("name")
third_name_info = third_title_obj_product.get("one_feature")
third_name_des = third_title_obj_product.get("summary")
try:
third_name_url = "https://www.soyoung.com/itemk/%s/" % third_title_obj_product.get(
"seo").get("pinyin")
except Exception:
third_name_url = ""
print(first_title, second_title, third_name)
for qa_title, qa_answer in self.parse_single_data(third_name_url):
data_dict = {
"first_title": first_title,
"second_title": second_title,
"third_name": third_name,
"third_name_info": third_name_info,
"third_name_des": third_name_des,
"third_name_url": third_name_url,
"qa_title": qa_title,
"qa_answer": qa_answer,
}
yield data_dict
# break
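# Yield (question, answer) pairs from the Q&A section of a tag detail page.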
def parse_single_data(self, data_url):
try:
requests_res = requests.get(data_url, headers=self.headers, allow_redirects=False, timeout=5)
page_obj = etree.HTML(requests_res.text)
obj_list = page_obj.xpath("//section[@id='qa']/div")
for qa_obj in obj_list:
qa_title = qa_obj.xpath("./div[1]/p[1]/text()")[0].strip()
qa_answer = qa_obj.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
# print(qa_title,qa_answer)
yield qa_title, qa_answer
except Exception:
yield "", ""
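# Fetch the third-level tag list for a second-level menu id via the itemList JSON endpoint.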
def get_third_tag_list(self, menu_id):
headers = {
"accept": "application/json, text/javascript, */*; q=0.01",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
# "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605165197",
"referer": "https://www.soyoung.com/itemk//",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
"x-requested-with": "XMLHttpRequest",
}
url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
requests_res = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
res_json = requests_res.json()
return res_json
if __name__ == "__main__":
# if args.style_tag or args.countries:
# Crawler_douban = Crawler_main()
# Crawler_douban.list_page(style_tags=args.style_tag,countries=args.countries)
# else:
# executor = ProcessPoolExecutor(max_workers=5)
# futures = []
# for one_scan in range(5):
# Crawler_douban = Crawler_main()
# future = executor.submit(Crawler_douban.detail_page, task=one_scan)
# futures.append(future)
# executor.shutdown(True)
import pandas as pd
data_list = []
Crawler_xinyang = Crawler_main()
try:
for data in Crawler_xinyang.list_page():
data_list.append(data)
except Exception:
res = pd.DataFrame(data_list)
res.to_csv("wrong.csv", encoding="gb18030")
finally:
res = pd.DataFrame(data_list)
res.to_csv("result.csv", encoding="gb18030")
# revise_data()
@@ -36,7 +36,7 @@ def send_email_tome():
 content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
 text_apart = MIMEText(content, 'plain', "utf-8")
-zip_file_week = 'soyoung_service.csv.gz'
+zip_file_week = 'soyoung_service_other.csv.gz'
 zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
 zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
@@ -87,12 +87,13 @@ if __name__ == '__main__':
 nowday = datetime.datetime.now()
 today = str(nowday).split()[0]
 all_data = []
-city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
+# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
+city_list = ["save_data_2021-07-27.txt", "save_data_2021-07-28.txt", "save_data_2021-07-29.txt"]
 for city_name in city_list:
-file_name = "save_data_" + today + city_name + ".txt"
-print(file_name)
-if os.path.exists(file_name):
-open_file = open(file_name, "r", encoding="utf-8")
+# file_name = "save_data_" + today + ".txt"
+# print(file_name)
+if os.path.exists(city_name):
+open_file = open(city_name, "r", encoding="utf-8")
 for item in open_file.readlines():
 try:
 data = json.loads(item.strip())
@@ -103,7 +104,7 @@ if __name__ == '__main__':
 if len(all_data):
 res = pd.DataFrame(all_data)
-res.to_csv("soyoung_service.csv.gz", compression='gzip', index=False, encoding="gb18030")
+res.to_csv("soyoung_service_other.csv.gz", compression='gzip', index=False, encoding="gb18030")
 send_email_tome()
 print(time.time() - begin)
0,666426,108536,北京俏中关医疗美容门诊部,9800,1972,5,【除皱瘦脸】美国进口标准装【除皱瘦脸】瘦脸针100U·足量·正品 进口/提升/下颌线,329,"['付尾款,最高立减068', '尾款满100减8']",[],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap666426
1,84880,82258,北京画美医疗美容医院,1680,551,4.8,【注射瘦脸】除皱瘦脸国产80-100u 限购一次 正品足量 正品可验 小V脸 去咬肌 咬肌肥大瘦脸针,2321,[],['新人首单立减0629'],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap84880
from gevent import monkey
# Patch the standard library before requests is imported so its sockets are cooperative
monkey.patch_all()
import requests
import gevent
import re
import ast
from pypinyin import lazy_pinyin
import time
import random
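# Fetch one mobile service page and pull package tiers (times -> price) plus the list price ("门市价").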
def get_service_info(spu_id, url, ids_bug):
time.sleep(random.uniform(1, 2))
url = url + "?tab=0&rj=0"
headers = {
"Cookie": "smidV2=202108111914527c82de7c31a1ebbb6f65a12dfd5021de00aefc1da947e1af0; _ga=GA1.2.1705589232.1628680492; _gid=GA1.2.1191277434.1628680492; PHPSESSID=e81ba73301b430097db1c9599826880a; __usersign__=1628680488549938593",
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 AliApp(DingTalk/6.0.23) com.laiwang.DingTalk/15108471 Channel/201200 language/zh-Hans-CN UT4Aplus/0.0.6 WK",
"Accept-Language": "zh-cn",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"DingTalk-Flag": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Host": "m.soyoung.com"
}
response_res = requests.get(url, headers=headers, verify=False, timeout=30)
print("url:%s,status:%s" % (url, response_res.status_code))
sub_title_dict = {}
if response_res.status_code == 200:
if spu_id in ids_bug:
ids_bug.remove(spu_id)
sub_title = re.compile('<span class="sub-title">(.*?)</span><span class=".*?">(.*?)</span>').findall(
response_res.text)
for item in sub_title:
times = item[0].strip()
price = item[1].strip().split("<")[0]
sub_title_dict[times] = price
x_p_o = re.compile('<span class="x-p-o">(.*?)</span>').findall(response_res.text)
if x_p_o:
sub_title_dict['门市价'] = x_p_o
else:
ids_bug.append(spu_id)
time.sleep(600)
return sub_title_dict
def gevent_test():
tasks = []
for city_name in ['合肥市', '芜湖市', '蚌埠市', '淮南市']:
for word in ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉']:
# NOTE: stale call; get_service_info takes (spu_id, url, ids_bug), so these
# keyword arguments from an older signature would raise TypeError when joined.
tasks.append(
gevent.spawn(get_service_info, query=word, city_name="".join(lazy_pinyin(city_name)))
)
print(tasks)
gevent.joinall(tasks)
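# Re-crawl every service in soyoung_service.csv for package prices, one dict per output line.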
def get_cika_info_to_csv():
read_file = open("soyoung_service.csv", "r", encoding="utf-8")
write_file = open("soyoung_service_cika.csv", "a+", encoding="utf-8")
all_cika_title = list()
ids_bug = []
have_read_service = []
for item in read_file.readlines():
try:
cika_price_dict = dict()
print("index:", item.strip().split(",")[0])
if int(item.strip().split(",")[0]) > 33755:
service_id = item.strip().split(",")[2]
url = item.strip().split(",")[-1]
if service_id not in have_read_service:
have_read_service.append(service_id)
sub_title_dict = get_service_info(str(service_id), url, ids_bug)
if sub_title_dict:
all_cika_title.extend(sub_title_dict.keys())
cika_price_dict[service_id] = sub_title_dict
print(cika_price_dict)
write_file.write(str(cika_price_dict))
write_file.write("\n")
except Exception as e:
print(e)
pass
print(all_cika_title)
read_file.close()
write_file.close()
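# Load the dict-per-line records written by get_cika_info_to_csv back into a single dict keyed by service id.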
def read_cika_info():
all_cika_title = {}
cika_file = open("soyoung_service_cika.csv", "r", encoding="utf-8")
for item in cika_file.readlines():
cika = ast.literal_eval(item)
for key, values in cika.items():
all_cika_title[key] = values
print(all_cika_title)
cika_file.close()
return all_cika_title
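# Join the package-price details back onto each service row and write the merged CSV.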
def write_service_info_to_csv():
all_cika_sub = ['单次', '2次', '3次', '4次', '5次', '6次', '8次', '9次', '10次', '20次', '门市价']
read_file = open("soyoung_service.csv", "r", encoding="utf-8")
write_file = open("soyoung_service_write_cika.csv", "w", encoding="utf-8")
all_cika_title = read_cika_info()
for item in read_file.readlines():
try:
service_info = item.strip().split(",")
print("service_info", service_info)
service_id = service_info[2]
cika_info = all_cika_title.get(service_id)
if cika_info:
print('cika_info:', cika_info)
for cika in all_cika_sub:
service_info.append(cika_info.get(cika, ""))
write_file.write(str(service_info))
write_file.write("\n")
except Exception as e:
print("eeeeee:", e)
pass
read_file.close()
write_file.close()
if __name__ == '__main__':
get_cika_info_to_csv()