Commit 568a9046 authored by 李小芳

add

parent 3d09d3c0
......@@ -11,6 +11,10 @@ from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# 导入requests.exceptions模块中的三种异常类
from requests.exceptions import ReadTimeout, HTTPError, RequestException
from retrying import retry
import pandas as pd
import requests
......@@ -76,16 +80,29 @@ class CrawlerMain(object):
"链接"
]
# Retried up to 5 times with a fixed 1000 ms pause between attempts; the
# error only reaches the caller when every attempt has failed, and a single
# successful attempt lets processing continue.
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def _request_service_page(self, url):
    """Issue one GET request for ``url`` and return the raw response.

    Request errors are deliberately NOT caught here: they have to propagate
    out of this function so the ``@retry`` decorator can observe the failure
    and try again. Catching them inside the decorated function would turn
    the retry policy into a no-op.
    """
    return requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)

def get_service_base_info_list(self, pid):
    """Fetch and parse the product detail page for ``pid``.

    Args:
        pid: Product id interpolated into the detail-page URL.

    Returns:
        A ``(page_obj, url)`` tuple, where ``page_obj`` is the result of
        ``etree.HTML`` on the response text and ``url`` is the requested
        address, when the server answers with status 200. ``None`` when the
        request still fails after all retries or the status is not 200.
    """
    url = "https://y.soyoung.com/cp{}".format(pid)
    try:
        requests_res = self._request_service_page(url)
    except ReadTimeout:
        # Most specific handler first: ReadTimeout and HTTPError are both
        # subclasses of RequestException.
        print('timeout')
        return None
    except HTTPError:
        print('httperror')
        return None
    except RequestException:
        # Any other request failure (connection error, connect timeout, ...).
        print('reqerror')
        return None

    # Redirects are not followed (allow_redirects=False), so anything other
    # than a plain 200 is treated as "no page" rather than parsed.
    if requests_res.status_code != 200:
        return None
    return etree.HTML(requests_res.text), url
def get_search_service_info_list(self, page=1, city_id=-1, query=""):
url = "https://www.soyoung.com/searchNew/product?" \
......@@ -93,12 +110,19 @@ class CrawlerMain(object):
.format(query, page)
try:
requests_res = requests.get(url, headers=self.headers, allow_redirects=False, timeout=10)
res_json = requests_res.json()
return res_json
except error.URLError as e:
if isinstance(e.reason, socket.timeout):
print('超时,执行下一个请求')
if requests_res.status_code == 200:
res_json = requests_res.json()
return res_json
except ReadTimeout:
print('超时,执行下一个请求')
pass
except HTTPError:
print('httperror')
pass
# 请求异常
except RequestException:
print('reqerror')
pass
def get_services_list(self, res_json, query="", city_name=""):
......@@ -204,7 +228,7 @@ class SoYongSpider(object):
'开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
self.test_keywords = ['瘦脸针', '双眼皮']
self.city_list = ["北京","上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
self.test_city_list = ["北京", "上海"]
self.page_num = 11
self.file_name = file_name
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment