Commit aeb48597 authored by 李小芳

update

parent b6079ebc
This diff is collapsed.
......@@ -19,71 +19,15 @@ from lxml import etree
logger = logging.getLogger(__name__)
def send_email_tome():
    """Email the crawl result file ``result1.csv`` to the maintainer.

    Builds a multipart message (a plain-text body plus ``result1.csv`` as an
    attachment) and sends it over SMTP-SSL via ``smtp.exmail.qq.com:465``.

    Returns:
        None. SMTP failures are printed; any other failure (for example a
        missing ``result1.csv``) is printed and logged with its traceback.
        Nothing is raised to the caller.
    """
    try:
        from_addrs = 'lixiaofang@igengmei.com'
        # NOTE(review): this credential is hard-coded in source control; it
        # should be moved to configuration/environment and rotated.
        password = 'EzJzSRyEG4Jibuy9'
        toaddrs = "lixiaofang@igengmei.com"
        content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")

        zip_file_week = 'result1.csv'
        # Read the attachment inside a context manager so the file handle is
        # always released, even if building the MIME part fails.
        with open(zip_file_week, 'rb') as attachment_file:
            zip_apart_week = MIMEApplication(attachment_file.read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)

        m = MIMEMultipart()
        m.attach(text_apart)
        m.attach(zip_apart_week)
        m['From'] = formataddr(("李小芳", from_addrs))
        m["To"] = formataddr(("李小芳", toaddrs))
        m['Subject'] = '新氧商品信息'

        try:
            # The context manager sends QUIT and closes the socket on exit,
            # including when login/sendmail raises.
            with smtplib.SMTP_SSL('smtp.exmail.qq.com', 465) as server:
                server.login(from_addrs, password)
                server.sendmail(from_addrs, [toaddrs], m.as_string())
                print('success')
        except smtplib.SMTPException as e:
            print('error', e)
    except Exception as e:
        print(str(e))
        logger.error("catch exception,main:%s", traceback.format_exc())
def get_keynote_sentence(content):
    """Strip markup tags from *content* and break it into one sentence per line.

    Args:
        content: Text that may contain ``<...>`` tags.

    Returns:
        The cleaned string with a newline inserted after each sentence
        terminator and trailing whitespace removed. If anything goes wrong
        (for example *content* is ``None``), the traceback is logged and an
        empty list is returned; that return value is kept as-is for
        backward compatibility with existing callers.
    """
    try:
        # Round-trip through UTF-16 with 'surrogatepass' so surrogate code
        # points in the input can be encoded; a surrogate pair comes back as
        # a single character.
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Remove anything that looks like a tag: '<' ... '>'.
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
        # Single-character sentence terminators not followed by a closing quote.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", str_re)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two '…' characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # Terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Drop any trailing newline/whitespace left at the end of the text.
        para = para.rstrip()
        return para
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []
def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
print("get_service_info")
url = 'https://api.soyoung.com/v8/superList/index'
break_flat = False
other_city_count = 0
for page in range(1, 500):
if break_flat == False and other_city_count < 100:
data = {'_time': '1626769752',
class SoYoung(object):
def __init__(self):
self.url = "https://api.soyoung.com/v8/superList/index"
self.headers = {'_time': '1626769752',
'ab_id': 'C521C79519A5D544390E60FEA08B32DB',
"app_id": 42,
"area_belong": 4,
"channel": 1,
"cityId": str(city_id),
# "cityId": str(city_id),
"device_id": 196374256,
"device_model": "iPhone12,1",
'device_version': '13.6.1',
......@@ -94,10 +38,10 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"idfa": "057F28DF-20B8-488F-A285-931367FCC110",
"is_tf": 0,
"item_id": "--Boundary+D46DCD61FE6FA268",
"keyword": str(keyword),
# "keyword": str(keyword),
"list_name": "sy_app_superlist_search_page",
"lver": "8.28.2",
"page": page,
# "page": page,
"page_size": 20,
"push_app_id": 42,
"request_id": "14d1e2b53ca644242ec7ccd7316a0aa2",
......@@ -109,14 +53,46 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"uid": "48804194",
"vistor_uid": "",
"xy_device_token": "33fa06111dea535c88cc07521f2e466c91",
'xy_sign': "Z1VfaYFXrpWBPeizj2VGeQ%3D%3D",
"xy_token": "ad970db3d79f0833d1d25d3942068585"
}
def get_keynote_sentence(self, content):
    """Strip markup tags from *content* and break it into one sentence per line.

    This method does not read any instance state.

    Args:
        content: Text that may contain ``<...>`` tags.

    Returns:
        The cleaned string with a newline inserted after each sentence
        terminator and trailing whitespace removed. If anything goes wrong
        (for example *content* is ``None``), the traceback is logged and an
        empty list is returned; that return value is kept as-is for
        backward compatibility with existing callers.
    """
    try:
        # Round-trip through UTF-16 with 'surrogatepass' so surrogate code
        # points in the input can be encoded; a surrogate pair comes back as
        # a single character.
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Remove anything that looks like a tag: '<' ... '>'.
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
        # Single-character sentence terminators not followed by a closing quote.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", str_re)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two '…' characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # Terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Drop any trailing newline/whitespace left at the end of the text.
        para = para.rstrip()
        return para
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []
def search(self, max_retries=5):
    """POST the prepared payload to ``self.url`` and return the decoded JSON.

    ``self.headers`` is passed as the second positional argument of
    ``requests.post``, i.e. it is sent as the form body of the request.

    Args:
        max_retries: How many additional attempts to make after an HTTP 403
            response before giving up. Replaces the previous unbounded
            recursive retry.

    Returns:
        The parsed JSON response on HTTP 200 with a non-empty body,
        otherwise ``None`` (an error message is printed in that case).
    """
    for _ in range(max_retries + 1):
        try:
            # Pause for a random fraction of a second before every request
            # to reduce the chance of being banned.
            time.sleep(random.random())
            # NOTE(review): verify=False disables TLS certificate checking,
            # and no timeout is set; confirm both are intentional.
            response_res = requests.post(self.url, self.headers, verify=False)
            if response_res.status_code == 403:
                # Rejected by the server: wait again and retry.
                continue
            if response_res.status_code == 200 and response_res.text:
                return json.loads(response_res.text)
            print("发生错误 停止请求")
            return None
        except Exception:
            print("发生错误 停止请求")
            return None
    # Every attempt was answered with 403.
    print("发生错误 停止请求")
    return None
def response_analysis(self, response=None, city_id=-1, city_name=None, keyword=None, all_skuids=[],
get_data_file=None, other_city_count=0, break_flat=False):
"""
解析获取到的数据
"""
try:
responseData = response.get("responseData", {}).get("data")
for item in responseData:
if item.get("type") == "feed_area":
......@@ -135,7 +111,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info['sku原价'] = service_data.get("price_origin")
service_info['sku活动价'] = service_data.get("price_online")
service_info['机构等级'] = service_data.get("avg_score")
service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
service_info['美购名称'] = self.get_keynote_sentence(service_data.get("title"))
service_info['销量'] = service_data.get("order_cnt")
icon_data = service_data.get("icons", [])
service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
......@@ -153,16 +129,34 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info['平台'] = "新氧"
service_info['链接'] = "https://m.soyoung.com/normal/cpwap{}".format(
service_info['skuid'])
print(service_info)
if service_data.get("pid") not in all_skuids:
get_data_file.write(json.dumps(service_info))
get_data_file.write("\n")
print("write success")
else:
other_city_count += 1
else:
print("break")
break_flat = True
break
except:
pass
def get_service_info(self, city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
break_flat = False
other_city_count = 0
for page in range(1, 500):
# other_city_count <10 代表已经结束
if break_flat == False and other_city_count < 10:
self.headers['cityId'] = str(city_id)
self.headers['keyword'] = str(keyword)
self.headers['page'] = str(page)
response = self.search()
if response:
self.response_analysis(response=response, city_id=city_id, keyword=keyword, city_name=city_name,
all_skuids=all_skuids, get_data_file=get_data_file,
other_city_count=other_city_count, break_flat=break_flat)
else:
print(city_id, keyword, "爬取失败")
else:
......@@ -194,17 +188,19 @@ def main(city_tag=""):
'美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
soyoung = SoYoung()
city_list = [city_tag]
all_skuids = []
for city_name in city_list:
city_id = cityIdMapping.get(city_name)
for word in keywords:
get_service_info(city_id=city_id, keyword=word, city_name=city_name,
soyoung.get_service_info(city_id=city_id, keyword=word, city_name=city_name,
all_skuids=all_skuids, get_data_file=get_data_file)
get_data_file.close()
print(time.time() - begin)
print("全部爬取完成")
if __name__ == '__main__':
......
......@@ -87,11 +87,8 @@ if __name__ == '__main__':
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
all_data = []
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city_list = ["save_data_2021-07-27.txt", "save_data_2021-07-28.txt", "save_data_2021-07-29.txt"]
for city_name in city_list:
# file_name = "save_data_" + today + ".txt"
# print(file_name)
file_list = ["save_data_2021-07-27.txt", "save_data_2021-07-28.txt", "save_data_2021-07-29.txt"]
for city_name in file_list:
if os.path.exists(city_name):
open_file = open(city_name, "r", encoding="utf-8")
for item in open_file.readlines():
......
0,666426,108536,北京俏中关医疗美容门诊部,9800,1972,5,【除皱瘦脸】美国进口标准装【除皱瘦脸】瘦脸针100U·足量·正品 进口/提升/下颌线,329,"['付尾款,最高立减068', '尾款满100减8']",[],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap666426
1,84880,82258,北京画美医疗美容医院,1680,551,4.8,【注射瘦脸】除皱瘦脸国产80-100u 限购一次 正品足量 正品可验 小V脸 去咬肌 咬肌肥大瘦脸针,2321,[],['新人首单立减0629'],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap84880
This diff is collapsed.
This diff is collapsed.
......@@ -66,7 +66,7 @@ def get_cika_info_to_csv():
try:
cika_price_dict = dict()
print("index:", item.strip().split(",")[0])
if int(item.strip().split(",")[0]) > 33755:
if int(item.strip().split(",")[0]) > 34645:
service_id = item.strip().split(",")[2]
url = item.strip().split(",")[-1]
if service_id in have_read_service:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment