Commit e0268805 authored by 李小芳

add

parent 3c19d6e7
import json
import logging
import smtplib
import socket
import time
import traceback
import datetime
import os
import sys
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import requests
# Exception classes imported from requests.exceptions
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from retrying import retry
import pandas as pd
from lxml import etree
logger = logging.getLogger(__name__)
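# Email the exported result1.csv as an attachment via SMTP over SSL (smtp.exmail.qq.com).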
def send_email_tome():
try:
from_addrs = 'lixiaofang@igengmei.com'
password = 'EzJzSRyEG4Jibuy9'
toaddrs = "lixiaofang@igengmei.com"
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
        zip_file_week = 'result1.csv'
        with open(zip_file_week, 'rb') as attachment_file:
            zip_apart_week = MIMEApplication(attachment_file.read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
m = MIMEMultipart()
m.attach(text_apart)
m.attach(zip_apart_week)
m['From'] = formataddr(("李小芳", from_addrs))
m["To"] = formataddr(("李小芳", toaddrs))
m['Subject'] = '新氧商品信息'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(from_addrs, password)
server.sendmail(from_addrs, [toaddrs], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception as e:
print(str(e))
logger.error("catch exception,main:%s" % traceback.format_exc())
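# Crawler for soyoung.com: runs keyword searches per city and scrapes each product's detail page.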
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.info_headers = {
"cookie": "__usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; __order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; cityId=9; Hm_lvt_757750800edc4c9eade200248b2aa23f=1626426185,1626426729,1626510306,1626676317; Hm_lpvt_757750800edc4c9eade200248b2aa23f=1626676317; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626676317",
"referer": "https://www.soyoung.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
self.title = [
"query词", "城市", "平台", "美购id", "skuid",
"美购名称", "sku名称", "医院名称", "机构等级", "医生名",
"销量", "sku原价", "sku活动价", "可领取预约金优惠券", "可用尾款券",
"链接"
]
    # Retry up to 5 times, waiting 1 second between attempts; the call fails only if every attempt raises.
    @retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
url = "https://y.soyoung.com/cp{}".format(pid)
        try:
            requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
            # Parse the detail page only on a successful response.
            if requests_res.status_code == 200:
                page_obj = etree.HTML(requests_res.text)
                return page_obj, url
            return None, None
        except ReadTimeout:
            print('timeout')
            return None, None
        except HTTPError:
            print('httperror')
            return None, None
        except ConnectionError:
            print('ConnectionError')
            return None, None
        # Any other requests exception
        except RequestException:
            print('reqerror')
            return None, None
        except socket.timeout:
            print('socket.timeout')
            return None, None
        except ReadTimeoutError:
            print('ReadTimeoutError')
            return None, None
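    # Query the soyoung.com keyword-search endpoint for one page of products; returns the parsed JSON or None.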
def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \
"keyword={0}&cityId=&page_size=&_json=1&sort=0&service=&coupon=&group=&maxprice=&minprice=&page={1}" \
.format(query, page)
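        # Note: the city filter is applied through the cityId cookie in self.headers; the cityId URL parameter stays empty.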
        try:
            requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
            if requests_res.status_code == 200:
                res_json = requests_res.json()
                return res_json
            return None
        except ReadTimeout:
            print('timed out, moving on to the next request')
            return None
        except HTTPError:
            print('httperror')
            return None
        except ConnectionError:
            print('ConnectionError')
            return None
        # Any other requests exception
        except RequestException:
            print('reqerror')
            return None
        except socket.timeout:
            print('socket.timeout')
            return None
        except ReadTimeoutError:
            print('ReadTimeoutError')
            return None
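    # Build one row per product in a search-result page, combining detail-page fields with the search metadata.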
def get_services_list(self, res_json, query="", city_name=""):
page_service_pids = []
for service in res_json.get("responseData", {}).get("arr_product", []):
pid = service.get("pid")
spu_id = service.get("spu_id")
doctor_name = service.get("doctor_name")
hospital_name = service.get("hospital_name")
service_info = self.get_service_base_info(pid)
service_info['美购id'] = spu_id
service_info['skuid'] = pid
service_info['医生名'] = doctor_name
service_info['医院名称'] = hospital_name
print(hospital_name)
service_info['query词'] = query
service_info['城市'] = city_name
service_info['平台'] = "新氧"
sort_service_info = sorted(service_info.items(), key=lambda x: self.title.index(x[0]), reverse=False)
page_service_pids.append(dict(sort_service_info))
return page_service_pids
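    # Parse a product detail page: name, prices, sales volume, institution level, coupons and deposit/balance info.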
def get_service_base_info(self, pid):
service_info = dict()
res_json, url = self.get_service_base_info_list(pid)
if res_json:
res_json = res_json.xpath(
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
service_info['链接'] = url
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
0].strip() # 1980
service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
0].strip() # 110
service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
for vip_info in base_info.xpath("div/dl[@class='base-param']/dd[@class='app-vip']/div"):
vip_str_info = ""
vip_title = vip_info.xpath("div[@class='label']/text()")[0].strip() if vip_info.xpath(
"div[@class='label']/text()") else ""
if vip_title in ["支持分期"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_str_info += vip_title + ":" + vip_data
elif vip_title in ["尾款红包"]:
vip_youhui = []
for youhui in vip_info.xpath("div[@class='text']/span"):
vip_data = youhui.xpath("em/text()")
vip_youhui.append(vip_data[0] + "元红包满" + vip_data[1] + "可用")
vip_str_info += vip_title + ":" + " ".join(vip_youhui)
elif vip_title in ["氧分抵扣"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/text()") else ""
vip_money = vip_info.xpath("div[@class='text']/em/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/em/text()") else ""
vip_str_info += vip_title + ":" + vip_data + str(vip_money) + "元"
else:
pass
service_info['可领取预约金优惠券'].append(vip_str_info)
for pay_info in base_info.xpath("div/div[@class='base-buy']/div[@class='price-box']"):
deposit_title = pay_info.xpath("span/i/text()")[0].strip()
deposit_price = pay_info.xpath("span/em/text()")[0].strip()
to_pay_title = pay_info.xpath("p/text()")[0].strip()
to_pay_price = pay_info.xpath("p/span/text()")[0].strip()
service_info['可用尾款券'].append(
deposit_title + ":" + deposit_price + "," + to_pay_title + ":" + to_pay_price
)
else:
pass
return service_info
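# Driver that iterates cities x keywords x pages, skips combinations already saved to disk, and appends rows as JSON lines.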
class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
'双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 11
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
# self.read_data_file = open(self.file_name, "r", encoding="utf-8")
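    # Count the (keyword + city) rows already present in the output file so finished combinations can be skipped on restart.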
def get_have_spider_keywords(self):
have_get_service_info = {}
if os.path.exists(self.file_name):
read_data_file = open(self.file_name, "r", encoding="utf-8")
            # Load the rows that were already crawled in a previous run.
for item in read_data_file.readlines():
data = json.loads(item.strip())
query = data.get("query词")
city_name = data.get("城市")
                word = query + city_name
                if word in have_get_service_info:
                    have_get_service_info[word] += 1
                else:
                    have_get_service_info[word] = 1
read_data_file.close()
return have_get_service_info
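    # Crawl every keyword for one city; returns True only once the final city/keyword/page combination has been reached.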
def run(self, city_tags):
get_data_file = open(self.file_name, "a+", encoding="utf-8")
get_lasted_data = []
self.city_list = [city_tags]
        for city_name in self.city_list:  # popular cities
city_id = self.cityIdMapping[city_name]
crawler_xinyang = CrawlerMain(city_id=city_id)
# print(city_name, self.city_list.index(city_name), len(self.city_list) - 1)
if self.city_list.index(city_name) == len(self.city_list) - 1:
get_lasted_data.append(city_name)
            for keyword in self.keywords:  # popular keywords
# print(keyword, self.keywords.index(keyword), len(self.keywords) - 1)
if self.keywords.index(keyword) == len(self.keywords) - 1 and len(get_lasted_data) == 1:
get_lasted_data.append(keyword)
                for page in range(1, self.page_num):  # first 100 results (pages 1-10)
if self.page_num == page + 1 and len(get_lasted_data) == 2:
get_lasted_data.append(page)
word = str(keyword + city_name)
                    if word not in self.have_get_service_info or self.have_get_service_info[word] < 10:
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
for data in crawler_xinyang.get_services_list(res_json=resJson, query=keyword,
city_name=city_name):
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
else:
pass
time.sleep(1)
get_data_file.close()
print("get_lasted_data:", get_lasted_data)
if len(get_lasted_data) == 3:
return True
else:
return False
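# Entry point for a single city: rerun the spider until it reports completion, then export to CSV and email the result.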
def main(city_tags):
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + city_tags + ".txt"
    while True:
spider_obj = SoYongSpider(file_name)
flat = spider_obj.run(city_tags=city_tags)
print("flat:", flat)
        if flat:
break
all_data = []
open_file = open(file_name, "r", encoding="utf-8")
for item in open_file.readlines():
all_data.append(json.loads(item))
res = pd.DataFrame(all_data)
res.to_csv("result1.csv", encoding="gb18030")
send_email_tome()
open_file.close()
print(time.time() - begin)
print("end")
if __name__ == "__main__":
args = sys.argv[1]
main(city_tags=args)