Commit 3310832b authored by 李小芳

add

parent 568a9046
...@@ -13,7 +13,10 @@ from email.utils import formataddr ...@@ -13,7 +13,10 @@ from email.utils import formataddr
from urllib import error from urllib import error
import requests import requests
# 导入requests.exceptions模块中的三种异常类 # 导入requests.exceptions模块中的三种异常类
from requests.exceptions import ReadTimeout, HTTPError, RequestException from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
from retrying import retry from retrying import retry
import pandas as pd import pandas as pd
...@@ -103,6 +106,15 @@ class CrawlerMain(object): ...@@ -103,6 +106,15 @@ class CrawlerMain(object):
except RequestException: except RequestException:
print('reqerror') print('reqerror')
pass pass
except socket.timeout:
print(socket.timeout)
pass
except ReadTimeoutError:
print("ReadTimeoutError")
pass
except ConnectionError:
print("ConnectionError")
pass
def get_search_service_info_list(self, page=1, city_id=-1, query=""): def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \ url = "https://www.soyoung.com/searchNew/product?" \
...@@ -123,6 +135,15 @@ class CrawlerMain(object): ...@@ -123,6 +135,15 @@ class CrawlerMain(object):
except RequestException: except RequestException:
print('reqerror') print('reqerror')
pass pass
except socket.timeout:
print(socket.timeout)
pass
except ReadTimeoutError:
print("ReadTimeoutError")
pass
except ConnectionError:
print("ConnectionError")
pass
def get_services_list(self, res_json, query="", city_name=""): def get_services_list(self, res_json, query="", city_name=""):
...@@ -148,6 +169,7 @@ class CrawlerMain(object): ...@@ -148,6 +169,7 @@ class CrawlerMain(object):
def get_service_base_info(self, pid): def get_service_base_info(self, pid):
service_info = dict() service_info = dict()
res_json, url = self.get_service_base_info_list(pid) res_json, url = self.get_service_base_info_list(pid)
if res_json:
res_json = res_json.xpath( res_json = res_json.xpath(
"/html[1]/body[1]/div[@class='page-content']" "/html[1]/body[1]/div[@class='page-content']"
"/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div" "/div[@class='w1000']/div[@class='detail-wrap']/div[@class='width-control']/div"
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment