backend / crawler · Commits

Commit 568a9046
Authored Jul 19, 2021 by 李小芳
Commit message: add
Parent: 3d09d3c0

Showing 1 changed file with 37 additions and 13 deletions:
dev/xinyang_ask_tag/crawler_xinyang_ask_service.py (+37, -13)
@@ -11,6 +11,10 @@ from email.mime.multipart import MIMEMultipart

```python
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# Import the three exception classes handled below from requests.exceptions
from requests.exceptions import ReadTimeout, HTTPError, RequestException
from retrying import retry
import pandas as pd
import requests  # duplicate of the import above; redundant and could be dropped
```
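The newly imported `retrying` package drives the `@retry` decorator applied in the next hunk. As a minimal sketch of those exact parameters (up to 5 attempts, 1 second between them), assuming only that `retrying` is installed; `flaky` and `calls` are hypothetical names for illustration:

```python
from retrying import retry

calls = {"n": 0}

@retry(stop_max_attempt_number=5, wait_fixed=1000)  # up to 5 attempts, 1 s apart
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise IOError("transient failure")  # an escaping exception triggers a retry
    return "ok"

print(flaky(), "after", calls["n"], "calls")  # -> ok after 3 calls
```

Note that a retry only happens when an exception escapes the decorated function; except blocks that swallow the exception, as the crawler's handlers below do, prevent further attempts.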
@@ -76,16 +80,29 @@ class CrawlerMain(object):

```python
            "链接"
        ]

    # Only fails if every attempt raises; if any single attempt succeeds,
    # execution continues.
    # Waits 1 second between retries (wait_fixed is in milliseconds), up to 5 attempts.
    @retry(stop_max_attempt_number=5, wait_fixed=1000)
    def get_service_base_info_list(self, pid):
        url = "https://y.soyoung.com/cp{}".format(pid)
        try:
            requests_res = requests.get(url, headers=self.headers,
                                        allow_redirects=False, timeout=10)
            page_obj = etree.HTML(requests_res.text)
            return page_obj, url
        except error.URLError as e:
            if isinstance(e.reason, socket.timeout):
                print('Timed out, moving on to the next request')
                # Check the status code; note that requests_res is unbound here
                # if requests.get() itself raised
                if requests_res.status_code == 200:
                    page_obj = etree.HTML(requests_res.text)
                    return page_obj, url
        except ReadTimeout:
            print('timeout')
        except HTTPError:
            print('httperror')
        # Any other request-level exception
        except RequestException:
            print('reqerror')

    def get_search_service_info_list(self, page=1, city_id=-1, query=""):
        url = "https://www.soyoung.com/searchNew/product?" \
```
@@ -93,12 +110,19 @@ class CrawlerMain(object):

```python
            .format(query, page)
        try:
            requests_res = requests.get(url, headers=self.headers,
                                        allow_redirects=False, timeout=10)
            res_json = requests_res.json()
            return res_json
        except error.URLError as e:
            if isinstance(e.reason, socket.timeout):
                print('Timed out, moving on to the next request')
                if requests_res.status_code == 200:
                    res_json = requests_res.json()
                    return res_json
        except ReadTimeout:
            print('Timed out, moving on to the next request')
        except HTTPError:
            print('httperror')
        # Any other request-level exception
        except RequestException:
            print('reqerror')

    def get_services_list(self, res_json, query="", city_name=""):
        ...
```
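For orientation, a hedged usage sketch of the two fetchers above; constructing `CrawlerMain` with no arguments and the sample query are assumptions, not code from this commit:

```python
# Hedged usage sketch (not part of the commit). Assumes CrawlerMain can be
# constructed without arguments; the real class may require configuration.
crawler = CrawlerMain()

res_json = crawler.get_search_service_info_list(page=1, city_id=-1, query="双眼皮")
# Every except branch above falls through, so a failed request returns None.
if res_json is not None:
    services = crawler.get_services_list(res_json, query="双眼皮", city_name="北京")
```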
@@ -204,7 +228,7 @@ class SoYongSpider(object):

```python
                              '开眼角', '海菲秀', '假体下巴', '刷酸', '泪沟',
                              '拉皮', '全身吸脂', '缩鼻翼']
        self.test_keywords = ['瘦脸针', '双眼皮']
        self.city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市",
                          "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
        self.test_city_list = ["北京", "上海"]
        self.page_num = 11
        self.file_name = file_name
        ...
```
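A hedged sketch of how this configuration might be consumed, iterating keywords by cities by pages; the constructor argument, the page range convention, and the loop body are assumptions for illustration:

```python
# Hedged sketch (not from the commit). SoYongSpider is assumed to take only
# a file_name argument, matching "self.file_name = file_name" above.
spider = SoYongSpider("xinyang_ask.xlsx")
for query in spider.test_keywords:           # ['瘦脸针', '双眼皮'] while testing
    for city in spider.test_city_list:       # ["北京", "上海"] while testing
        for page in range(1, spider.page_num):  # pages 1..10 with page_num = 11
            print(query, city, page)         # real code would fetch and parse here
```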