Commit 827549ca authored by 李小芳's avatar 李小芳

add

parent bb6ce850
This diff is collapsed.
...@@ -11,7 +11,7 @@ from email.mime.application import MIMEApplication ...@@ -11,7 +11,7 @@ from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.utils import formataddr from email.utils import formataddr
import re
import pandas as pd import pandas as pd
import requests import requests
from lxml import etree from lxml import etree
...@@ -55,6 +55,23 @@ def send_email_tome(): ...@@ -55,6 +55,23 @@ def send_email_tome():
logger.error("catch exception,main:%s" % traceback.format_exc()) logger.error("catch exception,main:%s" % traceback.format_exc())
def get_keynote_sentence(content):
    """Strip markup tags from *content* and put each sentence on its own line.

    Args:
        content: Text (for example a product title) that may contain
            ``<...>`` tags.

    Returns:
        The cleaned string with a newline inserted after each sentence
        terminator and trailing whitespace removed. If processing fails
        (for example *content* is ``None``), the error is logged and an
        empty list is returned; that return value is kept unchanged for
        backward compatibility with existing callers.
    """
    try:
        # Round-trip through UTF-16 with ``surrogatepass`` so that a surrogate
        # pair stored as two separate code units becomes a single character.
        text = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Drop anything that looks like a tag.
        text = re.sub(r"<[^>]+>", "", text, flags=re.S)
        # Single-character sentence terminators.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two ellipsis characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # A terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Remove any trailing newline/whitespace left at the end.
        return para.rstrip()
    except Exception:
        logging.error("catch exception,logins:%s", traceback.format_exc())
        return []
def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None): def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
print("get_service_info") print("get_service_info")
url = 'https://api.soyoung.com/v8/superList/index' url = 'https://api.soyoung.com/v8/superList/index'
...@@ -112,7 +129,8 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da ...@@ -112,7 +129,8 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
if data.get("type") == "feed_shop_diallel": if data.get("type") == "feed_shop_diallel":
for service in data.get("items", []): for service in data.get("items", []):
service_data = service.get("data") service_data = service.get("data")
if str(service_data.get("district_2")) == str(city_id): district_2 = service_data.get("district_2")
if str(district_2) == str(city_id):
service_info = dict() service_info = dict()
service_info['skuid'] = service_data.get("pid") service_info['skuid'] = service_data.get("pid")
service_info['美购id'] = service_data.get("spu_id") service_info['美购id'] = service_data.get("spu_id")
...@@ -121,7 +139,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da ...@@ -121,7 +139,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info['sku原价'] = service_data.get("price_origin") service_info['sku原价'] = service_data.get("price_origin")
service_info['sku活动价'] = service_data.get("price_online") service_info['sku活动价'] = service_data.get("price_online")
service_info['机构等级'] = service_data.get("avg_score") service_info['机构等级'] = service_data.get("avg_score")
service_info['美购名称'] = service_data.get("title") service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
service_info['销量'] = service_data.get("order_cnt") service_info['销量'] = service_data.get("order_cnt")
icon_data = service_data.get("icons", []) icon_data = service_data.get("icons", [])
service_info['可用尾款券'] = service_data.get("wei_kuan_list", []) service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
...@@ -137,7 +155,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da ...@@ -137,7 +155,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info['query词'] = keyword service_info['query词'] = keyword
service_info['城市'] = city_name service_info['城市'] = city_name
service_info['平台'] = "新氧" service_info['平台'] = "新氧"
service_info['链接'] = "https://y.soyoung.com/cp{}".format( service_info['链接'] = "https://m.soyoung.com/normal/cpwap{}".format(
service_info['skuid']) service_info['skuid'])
print(service_info) print(service_info)
if service_data.get("pid") not in all_skuids: if service_data.get("pid") not in all_skuids:
...@@ -152,7 +170,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da ...@@ -152,7 +170,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
else: else:
print(city_id, keyword, "爬取失败") print(city_id, keyword, "爬取失败")
else: else:
print(page,city_id, keyword, "本地已爬完") print(page, city_id, keyword, "本地已爬完")
def main(city_tag=""): def main(city_tag=""):
...@@ -166,19 +184,21 @@ def main(city_tag=""): ...@@ -166,19 +184,21 @@ def main(city_tag=""):
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258', cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'} '长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
# keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀', """
# '美白针', '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
# '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发', '美白针',
# '黄金微针', '隆胸', '眼综合', '隆鼻',
# '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针', """
# '双眼皮修复', keywords = ['菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
# '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小', '黄金微针', '隆胸',
# '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
# '除皱', '颧骨', '双眼皮修复',
# '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
# '开眼角', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
# '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼'] '除皱', '颧骨',
keywords = ['欧洲之星fotona4d'] '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"] # city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
......
...@@ -28,7 +28,7 @@ def send_email_tome(): ...@@ -28,7 +28,7 @@ def send_email_tome():
content = '爬取新氧热搜前100的词召回的商品,内容详见表格' content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8") text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'result1.csv' zip_file_week = 'soyoung_result.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read()) zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week) zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
...@@ -57,7 +57,6 @@ def send_email_tome(): ...@@ -57,7 +57,6 @@ def send_email_tome():
def get_keynote_sentence(content): def get_keynote_sentence(content):
try: try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16') ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S) dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss) str_re = dr.sub("", ss)
...@@ -172,35 +171,33 @@ if __name__ == '__main__': ...@@ -172,35 +171,33 @@ if __name__ == '__main__':
begin = time.time() begin = time.time()
nowday = datetime.datetime.now() nowday = datetime.datetime.now()
today = str(nowday).split()[0] today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + ".txt" file_name = "soyoung_save_data_" + str(today) + ".txt"
get_data_file = open(file_name, "a+", encoding="utf-8") get_data_file = open(file_name, "a+", encoding="utf-8")
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258', cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'} '长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
# '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针',
# '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合',
# '瘦肩针','下颌角', '线雕', '超声刀', '美白针',
# '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提',
# '点阵激光', '面部吸脂','开内眼角', '嗨体', '牙齿矫正',
# '皮秒', '超皮秒','植发', '黄金微针', '隆胸',
# '微针', '光子嫩肤', '祛斑','小气泡', '嗨体熊猫针',
# '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提','瘦脸',
# '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充',
# '溶脂针', '法令纹', '鼻基底','全切双眼皮', '颧骨内推',
# '鼻子', '抽脂', '光子嫩肤m22', '下颌缘提升', 'm22',
# '鼻翼缩小', 'fotona4d欧洲之星', '自体脂肪全面部填充', '玻尿酸丰唇', '除皱针',
# '水光', '嗨体祛颈纹','假体隆胸', '英诺小棕瓶', '黄金微雕',
# '眼袋', '乔雅登',
keywords = [ keywords = [
'除皱', '颧骨', '艾莉薇', '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针',
'瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合',
'厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '瘦肩针', '下颌角', '线雕', '超声刀', '美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提',
'点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正',
'皮秒', '超皮秒', '植发', '黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针',
'熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸',
'埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充',
'溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推',
'鼻子', '抽脂', '光子嫩肤m22', '下颌缘提升', 'm22',
'鼻翼缩小', 'fotona4d欧洲之星', '自体脂肪全面部填充', '玻尿酸丰唇', '除皱针',
'水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕',
'眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '瘦腿', '玻尿酸丰下巴',
'纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑',
'开眼角', '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼'] '开眼角', '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"] city_list = ["北京", ""
"", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
all_skuids = [] all_skuids = []
for city_name in city_list: for city_name in city_list:
...@@ -221,6 +218,6 @@ if __name__ == '__main__': ...@@ -221,6 +218,6 @@ if __name__ == '__main__':
open_file.close() open_file.close()
res = pd.DataFrame(all_data) res = pd.DataFrame(all_data)
res.to_csv("result1.csv", encoding="gb18030") res.to_csv("soyoung_result.csv", encoding="gb18030")
send_email_tome() send_email_tome()
print(time.time() - begin) print(time.time() - begin)
...@@ -3947,15 +3947,13 @@ city = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", ...@@ -3947,15 +3947,13 @@ city = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市",
for item in city_info: for item in city_info:
if "level" in item.keys(): if "level" in item.keys():
cityId = item.get("id") cityId = item.get("id")
cityName = item.get("name") city_Name = item.get("name")
if cityName in city: cityId_mapping[city_Name] = []
cityId_mapping[cityName] = cityId
if 'son' in item.keys(): if 'son' in item.keys():
for level2Item in item.get("son", []): for level2Item in item.get("son", []):
cityId = level2Item.get("id") cityId = level2Item.get("id")
cityName = level2Item.get("name") cityName = level2Item.get("name")
if cityName in city: cityId_mapping[city_Name].append(cityName)
cityId_mapping[cityName] = cityId
print(cityId_mapping) print(cityId_mapping)
import datetime import datetime
......
import json
import logging
import smtplib
import socket
import time
import traceback
import datetime
import os
import sys
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# 导入requests.exceptions模块中的三种异常类
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
import zipfile
from retrying import retry
import pandas as pd
import requests
from lxml import etree
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def send_email_tome():
    """Send ``gengmei_service.csv`` as an e-mail attachment.

    The message is sent over SMTP-SSL from and to the address configured
    below. SMTP errors are printed; any other error (for example a missing
    attachment file) is printed and logged with its traceback. Nothing is
    raised to the caller and nothing is returned.
    """
    try:
        from_addrs = 'lixiaofang@igengmei.com'
        # NOTE(security): this credential is hard-coded in source. It should
        # be loaded from the environment or a secret store, and rotated.
        password = 'EzJzSRyEG4Jibuy9'
        toaddrs = "lixiaofang@igengmei.com"

        content = '爬取更美热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")

        zip_file_week = 'gengmei_service.csv'
        # Read the attachment through a context manager so the handle is closed.
        with open(zip_file_week, 'rb') as attachment:
            zip_apart_week = MIMEApplication(attachment.read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)

        m = MIMEMultipart()
        m.attach(text_apart)
        m.attach(zip_apart_week)
        m['From'] = formataddr(("李小芳", from_addrs))
        m["To"] = formataddr(("李小芳", toaddrs))
        m['Subject'] = '新氧商品信息'

        try:
            # The context manager issues QUIT and closes the connection even
            # when login or sendmail raises.
            with smtplib.SMTP_SSL('smtp.exmail.qq.com', 465) as server:
                server.login(from_addrs, password)
                server.sendmail(from_addrs, [toaddrs], m.as_string())
                print('success')
        except smtplib.SMTPException as e:
            print('error', e)
    except Exception as e:
        print(str(e))
        logger.error("catch exception,main:%s" % traceback.format_exc())
import re
def get_keynote_sentence(content):
    """Strip markup tags from *content* and put each sentence on its own line.

    Args:
        content: Text (for example a product title) that may contain
            ``<...>`` tags.

    Returns:
        The cleaned string with a newline inserted after each sentence
        terminator and trailing whitespace removed. If processing fails
        (for example *content* is ``None``), the error is logged and an
        empty list is returned; that return value is kept unchanged for
        backward compatibility with existing callers.
    """
    try:
        # Round-trip through UTF-16 with ``surrogatepass`` so that a surrogate
        # pair stored as two separate code units becomes a single character.
        text = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Drop anything that looks like a tag.
        text = re.sub(r"<[^>]+>", "", text, flags=re.S)
        # Single-character sentence terminators.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two ellipsis characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # A terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Remove any trailing newline/whitespace left at the end.
        return para.rstrip()
    except Exception:
        logging.error("catch exception,logins:%s", traceback.format_exc())
        return []
# all_no_city = []
#
# def update_city_name(hospital_name, city_name):
# if hospital_name[:2] in ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]:
# return hospital_name[:2]
# if "北京" in hospital_name or "清华" in hospital_name:
# return "北京"
# if "杭州" in hospital_name:
# return "杭州"
# if "西安" in hospital_name:
# return "西安"
# if "深圳" in hospital_name:
# return "深圳"
# if "重庆" in hospital_name:
# return "重庆"
# if "南京" in hospital_name:
# return "南京"
# if "武汉" in hospital_name:
# return "武汉"
# if "成都" in hospital_name or "四川" in hospital_name:
# return "成都"
# if "长沙" in hospital_name or "湖南" in hospital_name:
# return "长沙"
# if "上海" in hospital_name:
# return "上海"
# if "广州" in hospital_name:
# return "广州"
# if "郑州" in hospital_name or "河南" in hospital_name:
# return "郑州"
# else:
# all_no_city.append(hospital_name)
# return city_name
#
if __name__ == '__main__':
    # Merge today's per-city JSON-lines dump files into one CSV report.
    begin = time.time()
    nowday = datetime.datetime.now()
    today = str(nowday).split()[0]  # date part of the timestamp, "YYYY-MM-DD"
    all_data = []
    city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
    for city_name in city_list:
        file_name = "gengmei_save_data_" + today + city_name + ".txt"
        if not os.path.exists(file_name):
            continue
        with open(file_name, "r", encoding="utf-8") as open_file:
            print(file_name)
            # Stream the file line by line; each line holds one JSON record.
            for item in open_file:
                try:
                    all_data.append(json.loads(item.strip()))
                except ValueError:
                    # json.JSONDecodeError is a ValueError: skip the bad line
                    # but leave a trace instead of swallowing it silently.
                    logger.warning("skip malformed line in %s", file_name)
    if all_data:
        res = pd.DataFrame(all_data)
        res.to_csv("gengmei_service.csv", encoding="gb18030")
        # The e-mail step is currently disabled; call send_email_tome() here
        # to send the CSV once it has been written.
    print(time.time() - begin)
    print("end")
...@@ -36,7 +36,7 @@ def send_email_tome(): ...@@ -36,7 +36,7 @@ def send_email_tome():
content = '爬取新氧热搜前100的词召回的商品,内容详见表格' content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8") text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'result1.csv' zip_file_week = 'soyoung_service.csv'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read()) zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week) zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
...@@ -68,7 +68,6 @@ import re ...@@ -68,7 +68,6 @@ import re
def get_keynote_sentence(content): def get_keynote_sentence(content):
try: try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16') ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S) dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss) str_re = dr.sub("", ss)
...@@ -88,19 +87,24 @@ if __name__ == '__main__': ...@@ -88,19 +87,24 @@ if __name__ == '__main__':
nowday = datetime.datetime.now() nowday = datetime.datetime.now()
today = str(nowday).split()[0] today = str(nowday).split()[0]
all_data = [] all_data = []
file_name = "save_data_2021-07-21.txt" city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
if os.path.exists(file_name): for city_name in city_list:
open_file = open(file_name, "r", encoding="utf-8") file_name = "save_data_" + today + city_name + ".txt"
for item in open_file.readlines(): print(file_name)
data = json.loads(item.strip()) if os.path.exists(file_name):
data['美购名称'] = get_keynote_sentence(data.get("美购名称")) open_file = open(file_name, "r", encoding="utf-8")
# print(data['美购名称']) for item in open_file.readlines():
all_data.append(data) try:
open_file.close() data = json.loads(item.strip())
print(len(all_data)) all_data.append(data)
res = pd.DataFrame(all_data) except:
res.to_csv("result1.csv", encoding="gb18030") pass
send_email_tome() open_file.close()
if len(all_data):
res = pd.DataFrame(all_data)
res.to_csv("soyoung_service.csv", encoding="gb18030")
send_email_tome()
print(time.time() - begin) print(time.time() - begin)
print("end") print("end")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment