Commit e440244e authored by 李小芳

add

parent 43d2037c
......@@ -64,7 +64,6 @@ def send_email_tome():
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
......@@ -88,13 +87,11 @@ class CrawlerMain(object):
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
url = "https://y.soyoung.com/cp{}".format(pid)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
# 打印状态码
page_obj = etree.HTML(requests_res.text)
return page_obj, url
except ReadTimeout:
print('timeout')
return None, None
......@@ -144,26 +141,37 @@ class CrawlerMain(object):
print("ConnectionError")
return None
def get_services_list(self, res_json, query="", city_name="", city_id=-1):
    """Collect the products of one search-result page that belong to one city.

    Walks ``responseData.arr_product`` of ``res_json`` in order. For every
    product whose ``district_2`` equals ``city_id``, a record is built from
    ``self.get_service_base_info(pid)`` plus fields copied from the search
    result, and its keys are ordered to follow ``self.title``. The walk
    stops at the first product whose ``district_2`` is a different city.

    Args:
        res_json: Decoded search response (a dict), or ``None`` when the
            search request produced no response.
        query: Search keyword, stored under ``'query词'``.
        city_name: City name, stored under ``'城市'``.
        city_id: City id, compared as an integer with each product's
            ``district_2``.

    Returns:
        A ``(records, current_end_flat)`` tuple. ``records`` is the list of
        ordered dicts for the matching products; ``current_end_flat`` is
        ``True`` when a product from another city was met, else ``False``.

    Raises:
        ValueError: If a record contains a key that is not in ``self.title``.
    """
    page_service_pids = []
    current_end_flat = False

    # The response may be None and its sections may be null; treat both as
    # "no products on this page" instead of raising AttributeError.
    response_data = (res_json or {}).get("responseData") or {}
    products = response_data.get("arr_product") or []
    wanted_city = int(city_id)

    for service in products:
        try:
            current_city = int(service.get("district_2"))
        except (TypeError, ValueError):
            # Missing or non-numeric district: the product cannot be shown
            # to belong to another city, so skip it rather than stop.
            continue

        if current_city != wanted_city:
            current_end_flat = True
            break

        pid = service.get("pid")
        service_info = self.get_service_base_info(pid)
        # Price, rating, title and sales figures come from the search
        # result itself.
        service_info.update({
            '美购id': service.get("spu_id"),
            'sku原价': service.get("price_origin"),
            'sku活动价': service.get("price_online"),
            '机构等级': service.get("avg_score"),
            '美购名称': service.get("title"),
            '销量': service.get("order_cnt"),
            'skuid': pid,
            '医生名': service.get("doctor_name"),
            '医院名称': service.get("hospital_name"),
            'query词': query,
            '城市': city_name,
            '平台': "新氧",
            '链接': "https://y.soyoung.com/cp{}".format(pid),
        })
        # Emit the columns in the order declared by self.title.
        ordered_items = sorted(service_info.items(),
                               key=lambda item: self.title.index(item[0]))
        page_service_pids.append(dict(ordered_items))

    return page_service_pids, current_end_flat
def get_service_base_info(self, pid):
service_info = dict()
......@@ -173,17 +181,16 @@ class CrawlerMain(object):
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
service_info['链接'] = url
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[0].strip() # 1980
service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
0].strip() # 110
service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
0].strip() # 110
# service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
# service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
# service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
# 0].strip() # 1980
# service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
# 0].strip() # 110
# service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
# 0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
......@@ -235,7 +242,6 @@ class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
......@@ -249,9 +255,9 @@ class SoYongSpider(object):
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.city_list = ["南京市", "北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 11
self.page_num = 500
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
......@@ -298,12 +304,18 @@ class SoYongSpider(object):
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
for data in crawler_xinyang.get_services_list(res_json=resJson, query=keyword,
city_name=city_name):
service_info_list, current_end_flat = crawler_xinyang.get_services_list(res_json=resJson,
query=keyword,
city_name=city_name,
city_id=city_id)
for data in service_info_list:
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
if current_end_flat == True:
break
else:
pass
time.sleep(1)
get_data_file.close()
......
......@@ -65,7 +65,6 @@ def send_email_tome():
class CrawlerMain(object):
def __init__(self, city_id=-1):
self.headers = {
"cookie": "__order_time__=2021-07-16 15:22:00; msg_time=2021-07-16 15:22:00; back_order_time=2021-07-16 15:22:00; complain_time=2021-07-16 15:22:00; uuid=2E2206C5-B5CD-18F9-8B76-D5FE0D078395; __usersign__=1626341221780983876; _ga=GA1.2.2084074278.1626341224; smidV2=20210715174222a8c0fc7fc96128d6b9c09abf5787b250008f7cb10a6f61380; _gid=GA1.2.2004598599.1626602683; PHPSESSID=ace2ec3e62b7d5a8f7021c3c85e0bb00; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1626341224,1626510298,1626602683,1626675657; _gat=1; cityId={}; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1626675685".format(
city_id),
......@@ -89,7 +88,6 @@ class CrawlerMain(object):
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
url = "https://y.soyoung.com/cp{}".format(pid)
page_obj =None
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
# 打印状态码
......@@ -97,23 +95,23 @@ class CrawlerMain(object):
return page_obj, url
except ReadTimeout:
print('timeout')
return None, url
return None, None
except HTTPError:
print('httperror')
return None, url
return None, None
# 请求异常
except RequestException:
print('reqerror')
return None, url
return None, None
except socket.timeout:
print(socket.timeout)
return None, url
return None, None
except ReadTimeoutError:
print("ReadTimeoutError")
return None, url
return None, None
except ConnectionError:
print("ConnectionError")
return None, url
return None, None
def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \
......@@ -144,27 +142,37 @@ class CrawlerMain(object):
print("ConnectionError")
return None
def get_services_list(self, res_json, query="", city_name="", city_id=-1):
    """Collect the products of one search-result page that belong to one city.

    Walks ``responseData.arr_product`` of ``res_json`` in order. For every
    product whose ``district_2`` equals ``city_id``, a record is built from
    ``self.get_service_base_info(pid)`` plus fields copied from the search
    result, and its keys are ordered to follow ``self.title``. The walk
    stops at the first product whose ``district_2`` is a different city.

    Args:
        res_json: Decoded search response (a dict), or ``None`` when the
            search request produced no response.
        query: Search keyword, stored under ``'query词'``.
        city_name: City name, stored under ``'城市'``.
        city_id: City id, compared as an integer with each product's
            ``district_2``.

    Returns:
        A ``(records, current_end_flat)`` tuple. ``records`` is the list of
        ordered dicts for the matching products; ``current_end_flat`` is
        ``True`` when a product from another city was met, else ``False``.

    Raises:
        ValueError: If a record contains a key that is not in ``self.title``.
    """
    page_service_pids = []
    current_end_flat = False

    # The response may be None and its sections may be null; treat both as
    # "no products on this page" instead of raising AttributeError.
    response_data = (res_json or {}).get("responseData") or {}
    products = response_data.get("arr_product") or []
    wanted_city = int(city_id)

    for service in products:
        try:
            current_city = int(service.get("district_2"))
        except (TypeError, ValueError):
            # Missing or non-numeric district: the product cannot be shown
            # to belong to another city, so skip it rather than stop.
            continue

        if current_city != wanted_city:
            current_end_flat = True
            break

        pid = service.get("pid")
        service_info = self.get_service_base_info(pid)
        # Price, rating, title and sales figures come from the search
        # result itself.
        service_info.update({
            '美购id': service.get("spu_id"),
            'sku原价': service.get("price_origin"),
            'sku活动价': service.get("price_online"),
            '机构等级': service.get("avg_score"),
            '美购名称': service.get("title"),
            '销量': service.get("order_cnt"),
            'skuid': pid,
            '医生名': service.get("doctor_name"),
            '医院名称': service.get("hospital_name"),
            'query词': query,
            '城市': city_name,
            '平台': "新氧",
            '链接': "https://y.soyoung.com/cp{}".format(pid),
        })
        # Emit the columns in the order declared by self.title.
        ordered_items = sorted(service_info.items(),
                               key=lambda item: self.title.index(item[0]))
        page_service_pids.append(dict(ordered_items))

    return page_service_pids, current_end_flat
def get_service_base_info(self, pid):
service_info = dict()
......@@ -174,18 +182,16 @@ class CrawlerMain(object):
"/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
)
service_info['链接'] = url
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"):
service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
0].strip() # 1980
service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
0].strip() # 110
service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
0].strip() # 110
# service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
# service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
# service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[
# 0].strip() # 1980
# service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
# 0].strip() # 110
# service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
# 0].strip() # 110
service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
......@@ -237,7 +243,6 @@ class SoYongSpider(object):
def __init__(self, file_name):
self.cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
self.keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
......@@ -251,11 +256,13 @@ class SoYongSpider(object):
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.city_list = ["南京市", "北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 11
self.page_num = 500
self.file_name = file_name
self.have_get_service_info = self.get_have_spider_keywords()
# self.get_data_file = open(file_name, "a+", encoding="utf-8")
# self.read_data_file = open(self.file_name, "r", encoding="utf-8")
def get_have_spider_keywords(self):
have_get_service_info = {}
......@@ -299,12 +306,18 @@ class SoYongSpider(object):
print(city_name, ",", city_id, ",", keyword, ",", page)
resJson = crawler_xinyang.get_search_service_info_list(query=keyword, page=page,
city_id=city_id)
for data in crawler_xinyang.get_services_list(res_json=resJson, query=keyword,
city_name=city_name):
service_info_list, current_end_flat = crawler_xinyang.get_services_list(res_json=resJson,
query=keyword,
city_name=city_name,
city_id=city_id)
for data in service_info_list:
get_data_file.write(json.dumps(data))
get_data_file.write("\n")
if current_end_flat == True:
break
else:
pass
time.sleep(1)
get_data_file.close()
......@@ -315,15 +328,15 @@ class SoYongSpider(object):
return False
def main(city_tags):
def main(city_tag):
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "save_data_" + str(today) + city_tags + ".txt"
file_name = "save_data_" + str(today) + city_tag + ".txt"
while (True):
spider_obj = SoYongSpider(file_name)
flat = spider_obj.run(city_tags=city_tags)
flat = spider_obj.run(city_tags=city_tag)
print("flat:", flat)
if flat == True:
break
......@@ -334,7 +347,4 @@ def main(city_tags):
if __name__ == "__main__":
    # The first command-line argument is the city tag handed to main().
    # Exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <city_tag>".format(sys.argv[0]))
    args = sys.argv[1]
    main(city_tag=args)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment