Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
568a9046
Commit
568a9046
authored
3 years ago
by
李小芳
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add
parent
3d09d3c0
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
13 deletions
+37
-13
crawler_xinyang_ask_service.py
dev/xinyang_ask_tag/crawler_xinyang_ask_service.py
+37
-13
No files found.
dev/xinyang_ask_tag/crawler_xinyang_ask_service.py
View file @
568a9046
...
@@ -11,6 +11,10 @@ from email.mime.multipart import MIMEMultipart
...
@@ -11,6 +11,10 @@ from email.mime.multipart import MIMEMultipart
from
email.mime.text
import
MIMEText
from
email.mime.text
import
MIMEText
from
email.utils
import
formataddr
from
email.utils
import
formataddr
from
urllib
import
error
from
urllib
import
error
import
requests
# 导入requests.exceptions模块中的三种异常类
from
requests.exceptions
import
ReadTimeout
,
HTTPError
,
RequestException
from
retrying
import
retry
import
pandas
as
pd
import
pandas
as
pd
import
requests
import
requests
...
@@ -76,16 +80,29 @@ class CrawlerMain(object):
...
@@ -76,16 +80,29 @@ class CrawlerMain(object):
"链接"
"链接"
]
]
# Only fails if every attempt raises; a single success short-circuits the retry.
# Waits 1 second between attempts, up to 5 attempts (the original comment said
# "2 seconds", which contradicted wait_fixed=1000 ms — comment corrected).
@retry(stop_max_attempt_number=5, wait_fixed=1000)
def get_service_base_info_list(self, pid):
    """Fetch and parse the service detail page for *pid*.

    Builds the detail-page URL from *pid*, issues a GET with the
    instance's headers, no redirects, and a 10 s timeout.

    Returns:
        (page_obj, url): the lxml HTML tree and the requested URL when
        the response status is 200. Implicitly returns None on any
        failure or non-200 status (callers must handle that).
    """
    url = "https://y.soyoung.com/cp{}".format(pid)
    try:
        requests_res = requests.get(
            url,
            headers=self.headers,
            allow_redirects=False,
            timeout=10,
        )
        # Parse only successful responses; anything else falls through
        # and the method returns None implicitly.
        if requests_res.status_code == 200:
            page_obj = etree.HTML(requests_res.text)
            return page_obj, url
    # NOTE(review): catching these here swallows the exceptions, so the
    # @retry decorator above never observes a failure and will not
    # actually retry — confirm whether the handlers should re-raise.
    except ReadTimeout:
        print('timeout')
    except HTTPError:
        print('httperror')
    # Any other request-level failure (connection errors, etc.).
    except RequestException:
        print('reqerror')
def
get_search_service_info_list
(
self
,
page
=
1
,
city_id
=-
1
,
query
=
""
):
def
get_search_service_info_list
(
self
,
page
=
1
,
city_id
=-
1
,
query
=
""
):
url
=
"https://www.soyoung.com/searchNew/product?"
\
url
=
"https://www.soyoung.com/searchNew/product?"
\
...
@@ -93,12 +110,19 @@ class CrawlerMain(object):
...
@@ -93,12 +110,19 @@ class CrawlerMain(object):
.
format
(
query
,
page
)
.
format
(
query
,
page
)
try
:
try
:
requests_res
=
requests
.
get
(
url
,
headers
=
self
.
headers
,
allow_redirects
=
False
,
timeout
=
10
)
requests_res
=
requests
.
get
(
url
,
headers
=
self
.
headers
,
allow_redirects
=
False
,
timeout
=
10
)
res_json
=
requests_res
.
json
()
if
requests_res
.
status_code
==
200
:
return
res_json
res_json
=
requests_res
.
json
()
return
res_json
except
error
.
URLError
as
e
:
except
ReadTimeout
:
if
isinstance
(
e
.
reason
,
socket
.
timeout
):
print
(
'超时,执行下一个请求'
)
print
(
'超时,执行下一个请求'
)
pass
except
HTTPError
:
print
(
'httperror'
)
pass
# 请求异常
except
RequestException
:
print
(
'reqerror'
)
pass
def
get_services_list
(
self
,
res_json
,
query
=
""
,
city_name
=
""
):
def
get_services_list
(
self
,
res_json
,
query
=
""
,
city_name
=
""
):
...
@@ -204,7 +228,7 @@ class SoYongSpider(object):
...
@@ -204,7 +228,7 @@ class SoYongSpider(object):
'开眼角'
,
'开眼角'
,
'海菲秀'
,
'假体下巴'
,
'刷酸'
,
'泪沟'
,
'拉皮'
,
'全身吸脂'
,
'缩鼻翼'
]
'海菲秀'
,
'假体下巴'
,
'刷酸'
,
'泪沟'
,
'拉皮'
,
'全身吸脂'
,
'缩鼻翼'
]
self
.
test_keywords
=
[
'瘦脸针'
,
'双眼皮'
]
self
.
test_keywords
=
[
'瘦脸针'
,
'双眼皮'
]
self
.
city_list
=
[
"北京"
,
"上海"
,
"广州市"
,
"深圳市"
,
"杭州市"
,
"成都市"
,
"重庆"
,
"南京市"
,
"武汉市"
,
"长沙市"
,
"郑州市"
,
"西安市"
]
self
.
city_list
=
[
"北京"
,
"上海"
,
"广州市"
,
"深圳市"
,
"杭州市"
,
"成都市"
,
"重庆"
,
"南京市"
,
"武汉市"
,
"长沙市"
,
"郑州市"
,
"西安市"
]
self
.
test_city_list
=
[
"北京"
,
"上海"
]
self
.
test_city_list
=
[
"北京"
,
"上海"
]
self
.
page_num
=
11
self
.
page_num
=
11
self
.
file_name
=
file_name
self
.
file_name
=
file_name
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment