Commit e440244e authored by 李小芳

add

parent 43d2037c
......@@ -64,7 +64,6 @@ def send_email_tome():
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
......@@ -88,13 +87,11 @@ class CrawlerMain(object):
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
    """Download the product page for ``pid`` and parse it into an HTML tree.

    The page is requested from ``https://y.soyoung.com/cp<pid>`` with the
    instance headers, redirects disabled and a 10 second timeout.

    Returns:
        ``(page_obj, url)`` where ``page_obj`` is the result of
        ``etree.HTML`` on the response text, or ``(None, None)`` when the
        request raises ``ReadTimeout``.

    A ``ReadTimeout`` is caught here, so it never propagates out of this
    function to the ``@retry`` decorator; any other exception does.
    """
    url = "https://y.soyoung.com/cp{}".format(pid)
    request_kwargs = dict(headers=self.headers, allow_redirects=False, timeout=10)
    try:
        response = requests.get(url, **request_kwargs)
        return etree.HTML(response.text), url
    except ReadTimeout:
        print('timeout')
    return None, None
......@@ -144,26 +141,37 @@ class CrawlerMain(object):
print("ConnectionError")
return None
def get_services_list(self, res_json, query="", city_name="", city_id=-1):
    """Build the detail rows for one page of search results.

    Products are read from ``res_json["responseData"]["arr_product"]`` and
    processed in order for as long as their ``district_2`` equals
    ``city_id`` (compared as integers). The first product from another city
    stops the scan and sets the end flag; the visible caller uses that flag
    to stop paging.

    Args:
        res_json: Decoded search response. ``None`` or a response without
            ``responseData`` / ``arr_product`` is treated as an empty page
            instead of raising.
        query: Search keyword, stored under ``'query词'``.
        city_name: City display name, stored under ``'城市'``.
        city_id: City id the results must belong to (int or numeric string).

    Returns:
        ``(rows, end_flag)``: ``rows`` is a list of dicts whose keys are
        ordered by their position in ``self.title``; ``end_flag`` is True
        when a product from a different city was encountered.

    Raises:
        ValueError: if a row key is not present in ``self.title``, or if
            ``city_id`` is not numeric while products are present.
    """
    response_data = (res_json or {}).get("responseData") or {}
    products = response_data.get("arr_product") or []

    page_service_pids = []
    current_end_flat = False
    if not products:
        return page_service_pids, current_end_flat

    # Loop-invariant: convert the requested city id once.
    target_city = int(city_id)

    for service in products:
        try:
            same_city = int(service.get("district_2")) == target_city
        except (TypeError, ValueError):
            # A missing or non-numeric district cannot match the requested
            # city; handle it like any other foreign-city product.
            same_city = False

        if not same_city:
            current_end_flat = True
            break

        pid = service.get("pid")
        service_info = self.get_service_base_info(pid)
        service_info.update({
            '美购id': service.get("spu_id"),
            'sku原价': service.get("price_origin"),
            'sku活动价': service.get("price_online"),
            '机构等级': service.get("avg_score"),
            '美购名称': service.get("title"),
            '销量': service.get("order_cnt"),
            'skuid': pid,
            '医生名': service.get("doctor_name"),
            '医院名称': service.get("hospital_name"),
            'query词': query,
            '城市': city_name,
            '平台': "新氧",
            '链接': "https://y.soyoung.com/cp{}".format(pid),
        })
        # Emit the columns in the order declared by self.title.
        ordered_items = sorted(service_info.items(), key=lambda item: self.title.index(item[0]))
        page_service_pids.append(dict(ordered_items))

    return page_service_pids, current_end_flat
def get_service_base_info(self, pid):
service_info = dict()
......@@ -173,17 +181,16 @@ class CrawlerMain(object):
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
service_info['链接'] = url
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[0].strip() # 1980
service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
0].strip() # 110
service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
0].strip() # 110
# service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
# service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
# service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
# 0].strip() # 1980
# service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
# 0].strip() # 110
# service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
# 0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
......@@ -235,7 +242,6 @@ class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
......@@ -249,9 +255,9 @@ class SoYongSpider(object):
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.city_list = ["南京市", "北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 11
self.page_num = 500
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
......@@ -298,12 +304,18 @@ class SoYongSpider(object):
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
for data in crawler_xinyang.get_services_list(res_json=resJson, query=keyword,
city_name=city_name):
service_info_list, current_end_flat = crawler_xinyang.get_services_list(res_json=resJson,
query=keyword,
city_name=city_name,
city_id=city_id)
for data in service_info_list:
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
if current_end_flat == True:
break
else:
pass
time.sleep(1)
get_data_file.close()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment