Commit 3310832b authored by 李小芳

add

parent 568a9046
...@@ -13,7 +13,10 @@ from email.utils import formataddr ...@@ -13,7 +13,10 @@ from email.utils import formataddr
from urllib import error from urllib import error
import requests import requests
# 导入requests.exceptions模块中的三种异常类 # 导入requests.exceptions模块中的三种异常类
from requests.exceptions import ReadTimeout, HTTPError, RequestException from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
from retrying import retry from retrying import retry
import pandas as pd import pandas as pd
...@@ -103,6 +106,15 @@ class CrawlerMain(object): ...@@ -103,6 +106,15 @@ class CrawlerMain(object):
except RequestException: except RequestException:
print('reqerror') print('reqerror')
pass pass
except socket.timeout:
print(socket.timeout)
pass
except ReadTimeoutError:
print("ReadTimeoutError")
pass
except ConnectionError:
print("ConnectionError")
pass
def get_search_service_info_list(self, page=1, city_id=-1, query=""): def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \ url = "https://www.soyoung.com/searchNew/product?" \
...@@ -123,6 +135,15 @@ class CrawlerMain(object): ...@@ -123,6 +135,15 @@ class CrawlerMain(object):
except RequestException: except RequestException:
print('reqerror') print('reqerror')
pass pass
except socket.timeout:
print(socket.timeout)
pass
except ReadTimeoutError:
print("ReadTimeoutError")
pass
except ConnectionError:
print("ConnectionError")
pass
def get_services_list(self, res_json, query="", city_name=""): def get_services_list(self, res_json, query="", city_name=""):
...@@ -148,64 +169,65 @@ class CrawlerMain(object): ...@@ -148,64 +169,65 @@ class CrawlerMain(object):
def get_service_base_info(self, pid): def get_service_base_info(self, pid):
service_info = dict() service_info = dict()
res_json, url = self.get_service_base_info_list(pid) res_json, url = self.get_service_base_info_list(pid)
res_json = res_json.xpath( if res_json:
"/html[1]/body[1]/div[@class='page-content']" res_json = res_json.xpath(
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div" "/html[1]/body[1]/div[@class='page-content']"
) "/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
service_info['链接'] = url )
service_info['链接'] = url
for base_info in res_json:
if "basic-info" in base_info.xpath("div/@class"): for base_info in res_json:
service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip()) if "basic-info" in base_info.xpath("div/@class"):
service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980 service_info["美购名称"] = str(base_info.xpath("div/h1/text()")[0].strip())
service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[0].strip() # 1980 service_info["sku活动价"] = base_info.xpath("div/div[@class='base-price']/em/text()")[0].strip() # 980
service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[ service_info["sku原价"] = base_info.xpath("div/div[@class='base-price']/del/text()")[0].strip() # 1980
0].strip() # 110 service_info["销量"] = base_info.xpath("div/div[@class='base-relation']/div[3]/em/text()")[
service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[ 0].strip() # 110
0].strip() # 110 service_info["机构等级"] = base_info.xpath("div/div[@class='base-relation']/div[1]/text()")[
service_info['可领取预约金优惠券'] = [] 0].strip() # 110
service_info['可用尾款券'] = [] service_info['可领取预约金优惠券'] = []
service_info['可用尾款券'] = []
for vip_info in base_info.xpath("div/dl[@class='base-param']/dd[@class='app-vip']/div"):
vip_str_info = "" for vip_info in base_info.xpath("div/dl[@class='base-param']/dd[@class='app-vip']/div"):
vip_title = vip_info.xpath("div[@class='label']/text()")[0].strip() if vip_info.xpath( vip_str_info = ""
"div[@class='label']/text()") else "" vip_title = vip_info.xpath("div[@class='label']/text()")[0].strip() if vip_info.xpath(
"div[@class='label']/text()") else ""
if vip_title in ["支持分期"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath( if vip_title in ["支持分期"]:
"div[@class='text']/text()") else "" vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
vip_str_info += vip_title + ":" + vip_data "div[@class='text']/text()") else ""
vip_str_info += vip_title + ":" + vip_data
elif vip_title in ["尾款红包"]:
vip_youhui = [] elif vip_title in ["尾款红包"]:
for youhui in vip_info.xpath("div[@class='text']/span"): vip_youhui = []
vip_data = youhui.xpath("em/text()") for youhui in vip_info.xpath("div[@class='text']/span"):
vip_youhui.append(vip_data[0] + "元红包满" + vip_data[1] + "可用") vip_data = youhui.xpath("em/text()")
vip_str_info += vip_title + ":" + " ".join(vip_youhui) vip_youhui.append(vip_data[0] + "元红包满" + vip_data[1] + "可用")
vip_str_info += vip_title + ":" + " ".join(vip_youhui)
elif vip_title in ["氧分抵扣"]:
vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath( elif vip_title in ["氧分抵扣"]:
"div[@class='text']/text()") else "" vip_data = vip_info.xpath("div[@class='text']/text()")[0].strip() if vip_info.xpath(
vip_money = vip_info.xpath("div[@class='text']/em/text()")[0].strip() if vip_info.xpath( "div[@class='text']/text()") else ""
"div[@class='text']/em/text()") else "" vip_money = vip_info.xpath("div[@class='text']/em/text()")[0].strip() if vip_info.xpath(
"div[@class='text']/em/text()") else ""
vip_str_info += vip_title + ":" + vip_data + str(vip_money) + "元"
vip_str_info += vip_title + ":" + vip_data + str(vip_money) + "元"
else:
pass
service_info['可领取预约金优惠券'].append(vip_str_info)
for pay_info in base_info.xpath("div/div[@class='base-buy']/div[@class='price-box']"):
deposit_title = pay_info.xpath("span/i/text()")[0].strip()
deposit_price = pay_info.xpath("span/em/text()")[0].strip()
to_pay_title = pay_info.xpath("p/text()")[0].strip()
to_pay_price = pay_info.xpath("p/span/text()")[0].strip()
service_info['可用尾款券'].append(
deposit_title + ":" + deposit_price + "," + to_pay_title + ":" + to_pay_price
)
else: else:
pass pass
service_info['可领取预约金优惠券'].append(vip_str_info)
for pay_info in base_info.xpath("div/div[@class='base-buy']/div[@class='price-box']"):
deposit_title = pay_info.xpath("span/i/text()")[0].strip()
deposit_price = pay_info.xpath("span/em/text()")[0].strip()
to_pay_title = pay_info.xpath("p/text()")[0].strip()
to_pay_price = pay_info.xpath("p/span/text()")[0].strip()
service_info['可用尾款券'].append(
deposit_title + ":" + deposit_price + "," + to_pay_title + ":" + to_pay_price
)
else:
pass
return service_info return service_info
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment