Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
4d1d6e83
Commit
4d1d6e83
authored
Aug 17, 2021
by
李小芳
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
626b43f5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
351 additions
and
98 deletions
+351
-98
crawler.iml
.idea/crawler.iml
+1
-1
csv-plugin.xml
.idea/csv-plugin.xml
+35
-0
misc.xml
.idea/misc.xml
+5
-1
crawler_maoyan.py
dev/Movies_rank_project/crawler_maoyan.py
+97
-96
crawler_xinyang_ask_tag.py
dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py
+213
-0
No files found.
.idea/crawler.iml
View file @
4d1d6e83
...
...
@@ -2,7 +2,7 @@
<module
type=
"PYTHON_MODULE"
version=
"4"
>
<component
name=
"NewModuleRootManager"
>
<content
url=
"file://$MODULE_DIR$"
/>
<orderEntry
type=
"
inheritedJdk
"
/>
<orderEntry
type=
"
jdk"
jdkName=
"Python 3.6 (doris_env)"
jdkType=
"Python SDK
"
/>
<orderEntry
type=
"sourceFolder"
forTests=
"false"
/>
</component>
<component
name=
"PyDocumentationSettings"
>
...
...
.idea/csv-plugin.xml
View file @
4d1d6e83
...
...
@@ -3,6 +3,41 @@
<component
name=
"CsvFileAttributes"
>
<option
name=
"attributeMap"
>
<map>
<entry
key=
"/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv"
>
<value>
<Attribute>
<option
name=
"separator"
value=
","
/>
</Attribute>
</value>
</entry>
<entry
key=
"/dev/xinyang_ask_tag/soyoung_service.csv"
>
<value>
<Attribute>
<option
name=
"separator"
value=
","
/>
</Attribute>
</value>
</entry>
<entry
key=
"/dev/xinyang_ask_tag/soyoung_service_1.csv"
>
<value>
<Attribute>
<option
name=
"separator"
value=
","
/>
</Attribute>
</value>
</entry>
<entry
key=
"/dev/xinyang_ask_tag/soyoung_service_cika.csv"
>
<value>
<Attribute>
<option
name=
"separator"
value=
","
/>
</Attribute>
</value>
</entry>
<entry
key=
"/dev/xinyang_ask_tag/soyoung_service_write_cika.csv"
>
<value>
<Attribute>
<option
name=
"separator"
value=
","
/>
</Attribute>
</value>
</entry>
<entry
key=
"/tasks/yangjingshu.csv"
>
<value>
<Attribute>
...
...
.idea/misc.xml
View file @
4d1d6e83
<?xml version="1.0" encoding="UTF-8"?>
<project
version=
"4"
>
<component
name=
"ProjectRootManager"
version=
"2"
project-jdk-name=
"Python 3.6"
project-jdk-type=
"Python SDK"
/>
<component
name=
"ProjectRootManager"
version=
"2"
project-jdk-name=
"Python 3.6 (doris_env)"
project-jdk-type=
"Python SDK"
/>
<component
name=
"PyCharmProfessionalAdvertiser"
>
<option
name=
"shown"
value=
"true"
/>
</component>
</project>
\ No newline at end of file
dev/Movies_rank_project/crawler_maoyan.py
View file @
4d1d6e83
...
...
@@ -60,7 +60,8 @@ def get_url():
if
cat
<=
5
and
source
<
14
:
continue
yield
cat
,
source
,
year
,
sort
yield
cat
,
source
,
year
,
sort
def
revise_data
():
scan_re
=
rds_list
.
scan_iter
()
...
...
@@ -87,22 +88,22 @@ class Crawler_main(object):
self
.
chrome_options
.
add_experimental_option
(
"prefs"
,
prefs
)
# self.driver = webdriver.Chrome(options=self.chrome_options)
self
.
one_video_dic
=
{
"platform"
:
"douban"
,
"ID"
:
""
,
"title"
:
""
,
"url"
:
""
,
"directors"
:
""
,
"screenwriter"
:
""
,
"casts"
:
""
,
"describe"
:
""
,
"year"
:
""
,
"provider"
:
""
,
"style_tags"
:
""
,
"project_tags"
:
""
,
"language"
:
""
,
"area"
:
""
,
"rate"
:
""
,
"comment_count"
:
""
"platform"
:
"douban"
,
"ID"
:
""
,
"title"
:
""
,
"url"
:
""
,
"directors"
:
""
,
"screenwriter"
:
""
,
"casts"
:
""
,
"describe"
:
""
,
"year"
:
""
,
"provider"
:
""
,
"style_tags"
:
""
,
"project_tags"
:
""
,
"language"
:
""
,
"area"
:
""
,
"rate"
:
""
,
"comment_count"
:
""
}
...
...
@@ -125,19 +126,19 @@ class Crawler_main(object):
):
offset
=
30
headers
=
{
"Accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"Accept-Encoding"
:
"gzip, deflate, br"
,
"Accept-Language"
:
"zh,zh-CN;q=0.9"
,
"Connection"
:
"keep-alive"
,
# "Cookie": '__mta=150368905.1577424190198.1577424190198.1577433956085.2; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; mojo-session-id={"id":"f35010c2739ba6f036e332417fe21f84","time":1577433601641}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577434032; __mta=150368905.1577424190198.1577433956085.1577434032548.3; mojo-trace-id=57; _lxsdk_s=16f465e962c-545-9da-13a%7C%7C64',
"Cookie"
:
'__mta=150368905.1577424190198.1577931921073.1577933054583.8; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; lt=dwim2AyVn0Nr4tMQ1qCHf87HvVwAAAAAsQkAAGKVo4UF5isSHZyJ2F-6Yypd0YqL-FIGGMTWixcuMN23AhelN_OPNDA2hAk5IuCtNg; lt.sig=0AWWI8aMHZfmuLzGDO9hoKoZqT8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110; mojo-session-id={"id":"8d8eb79ab4cbaf8082e721ba64b73f3a","time":1577935255982}; mojo-trace-id=1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577935256; __mta=150368905.1577424190198.1577933054583.1577935256193.9; _lxsdk_s=16f64452341-fac-102-6a1
%7
C265018624
%7
C3'
,
"Host"
:
"maoyan.com"
,
"Referer"
:
"https://maoyan.com/films?showType=3&offset=30"
,
"Sec-Fetch-Mode"
:
"navigate"
,
"Sec-Fetch-Site"
:
"same-origin"
,
"Sec-Fetch-User"
:
"?1"
,
"Upgrade-Insecure-Requests"
:
"1"
,
"User-Agent"
:
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
,
"Accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"Accept-Encoding"
:
"gzip, deflate, br"
,
"Accept-Language"
:
"zh,zh-CN;q=0.9"
,
"Connection"
:
"keep-alive"
,
# "Cookie": '__mta=150368905.1577424190198.1577424190198.1577433956085.2; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; mojo-session-id={"id":"f35010c2739ba6f036e332417fe21f84","time":1577433601641}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577434032; __mta=150368905.1577424190198.1577433956085.1577434032548.3; mojo-trace-id=57; _lxsdk_s=16f465e962c-545-9da-13a%7C%7C64',
"Cookie"
:
'__mta=150368905.1577424190198.1577931921073.1577933054583.8; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; lt=dwim2AyVn0Nr4tMQ1qCHf87HvVwAAAAAsQkAAGKVo4UF5isSHZyJ2F-6Yypd0YqL-FIGGMTWixcuMN23AhelN_OPNDA2hAk5IuCtNg; lt.sig=0AWWI8aMHZfmuLzGDO9hoKoZqT8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110; mojo-session-id={"id":"8d8eb79ab4cbaf8082e721ba64b73f3a","time":1577935255982}; mojo-trace-id=1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577935256; __mta=150368905.1577424190198.1577933054583.1577935256193.9; _lxsdk_s=16f64452341-fac-102-6a1
%7
C265018624
%7
C3'
,
"Host"
:
"maoyan.com"
,
"Referer"
:
"https://maoyan.com/films?showType=3&offset=30"
,
"Sec-Fetch-Mode"
:
"navigate"
,
"Sec-Fetch-Site"
:
"same-origin"
,
"Sec-Fetch-User"
:
"?1"
,
"Upgrade-Insecure-Requests"
:
"1"
,
"User-Agent"
:
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
,
}
count_false
=
0
if
args
.
max_page
:
...
...
@@ -147,7 +148,7 @@ class Crawler_main(object):
time
.
sleep
(
0.5
)
print
(
"page "
,
offset
)
url
=
"https://maoyan.com/films?showType=3&offset={0}"
.
format
(
str
(
offset
))
str
(
offset
))
proxies
=
get_proxy
(
4
)
requests_res
=
requests
.
get
(
url
,
headers
=
headers
,
proxies
=
proxies
,
allow_redirects
=
False
)
html
=
etree
.
HTML
(
requests_res
.
text
)
...
...
@@ -164,9 +165,9 @@ class Crawler_main(object):
for
rate
in
rate_list
:
rate_str
+=
rate
data_dic
=
{
"url"
:
url
,
"title"
:
title
,
"rate"
:
rate_str
,
"url"
:
url
,
"title"
:
title
,
"rate"
:
rate_str
,
}
if
style_tags
:
...
...
@@ -330,18 +331,18 @@ class Crawler_main(object):
# for handle in self.driver.window_handles:
# self.driver.switch_to.window(handle)
headers
=
{
"accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-language"
:
"zh,zh-CN;q=0.9"
,
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
# "referer": "https://v.qq.com/x/cover/%s.html" % url_id,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-site"
:
"same-origin"
,
"upgrade-insecure-requests"
:
"1"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
,
"sec-fetch-user"
:
"?1"
,
"cache-control"
:
"max-age=0"
,
"if-modified-since"
:
"Thu, 19 Dec 2019 02:30:00 GMT"
"accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-language"
:
"zh,zh-CN;q=0.9"
,
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
# "referer": "https://v.qq.com/x/cover/%s.html" % url_id,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-site"
:
"same-origin"
,
"upgrade-insecure-requests"
:
"1"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
,
"sec-fetch-user"
:
"?1"
,
"cache-control"
:
"max-age=0"
,
"if-modified-since"
:
"Thu, 19 Dec 2019 02:30:00 GMT"
}
try
:
page_data
=
requests
.
get
(
res
[
"url"
],
headers
=
headers
,
timeout
=
5
)
...
...
@@ -447,13 +448,13 @@ class Crawler_main(object):
except
:
comment_count
=
0
dic
=
{
"play_count_sum"
:
play_count_sum
,
"duration"
:
duration
,
"project_tags"
:
project_tags
,
"title"
:
title
,
"year"
:
year
,
"video_count"
:
1
,
"comment_count"
:
comment_count
"play_count_sum"
:
play_count_sum
,
"duration"
:
duration
,
"project_tags"
:
project_tags
,
"title"
:
title
,
"year"
:
year
,
"video_count"
:
1
,
"comment_count"
:
comment_count
}
project_name
=
keys
self
.
parse_data
(
dic
,
project_name
)
...
...
@@ -462,7 +463,7 @@ class Crawler_main(object):
dic
[
"video_id"
]
=
res
[
"title"
]
dic
[
"play_count"
]
=
res
[
"play_count_sum"
]
one_video_dic
=
{
title
:
dic
title
:
dic
}
# print(one_video_dic)
self
.
one_video_page
(
title
,
one_video_dic
,
type
=
"single"
)
...
...
@@ -473,15 +474,15 @@ class Crawler_main(object):
if
type
==
"list"
:
url
=
one_video_dic
[
one_video
][
"url"
]
headers
=
{
"accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-language"
:
"zh,zh-CN;q=0.9"
,
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
"referer"
:
url
,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-site"
:
"same-origin"
,
"upgrade-insecure-requests"
:
"1"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
,
"accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"accept-encoding"
:
"gzip, deflate, br"
,
"accept-language"
:
"zh,zh-CN;q=0.9"
,
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
"referer"
:
url
,
"sec-fetch-mode"
:
"navigate"
,
"sec-fetch-site"
:
"same-origin"
,
"upgrade-insecure-requests"
:
"1"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
,
}
# print(url)
url_lis
=
url
.
split
(
"."
,
-
1
)
...
...
@@ -511,18 +512,18 @@ class Crawler_main(object):
if
not
if_pay
:
if_pay
=
""
dic
=
{
"album"
:
title
,
"video_title"
:
video_title
,
"if_pay"
:
if_pay
,
"comment_count"
:
comment_count
,
"url"
:
url
,
"video_url"
:
single_data_url
,
"video_id"
:
video_id
,
"duration"
:
duration
,
"video_count"
:
1
,
"play_count"
:
play_count_sum
,
"year"
:
year
,
"project_tags"
:
project_tags
,
"album"
:
title
,
"video_title"
:
video_title
,
"if_pay"
:
if_pay
,
"comment_count"
:
comment_count
,
"url"
:
url
,
"video_url"
:
single_data_url
,
"video_id"
:
video_id
,
"duration"
:
duration
,
"video_count"
:
1
,
"play_count"
:
play_count_sum
,
"year"
:
year
,
"project_tags"
:
project_tags
,
}
# print(dic)
...
...
@@ -549,11 +550,11 @@ class Crawler_main(object):
):
offset
=
0
headers
=
{
"Host"
:
"api.maoyan.com"
,
"Connection"
:
"Keep-Alive"
,
"Accept-Encoding"
:
"gzip"
,
"User-Agent"
:
"AiMovie /Oneplus-6.0.1-oneplus a5010-0x0-0-null-0-000000000000000-null"
,
"mtgdid"
:
"AAAAAAAAAAAAACh9V5sO1zmQc71i5gjpKNuww8T-JnDVTQHuVQFINVu2yYO8FhnCWl_Cqj2TMCWI983qEk_Ha5ayk_tXytbMWi4"
,
"Host"
:
"api.maoyan.com"
,
"Connection"
:
"Keep-Alive"
,
"Accept-Encoding"
:
"gzip"
,
"User-Agent"
:
"AiMovie /Oneplus-6.0.1-oneplus a5010-0x0-0-null-0-000000000000000-null"
,
"mtgdid"
:
"AAAAAAAAAAAAACh9V5sO1zmQc71i5gjpKNuww8T-JnDVTQHuVQFINVu2yYO8FhnCWl_Cqj2TMCWI983qEk_Ha5ayk_tXytbMWi4"
,
}
count_false
=
0
print
(
cat
,
source
,
year
,
sort
)
...
...
@@ -566,7 +567,7 @@ class Crawler_main(object):
time
.
sleep
(
0.1
)
print
(
"page "
,
offset
)
url
=
"http://api.maoyan.com/mmdb/search/movie/tag/list.json?cityId=1&limit=100&offset={0}&catId={1}&sourceId={2}&yearId={3}&sortId={4}&token=7SJTJRCOW4fNMlp_xZDfgeI8qL0AAAAAsAkAADq-Y4OtjaaVeiysSdZtMsWTuGb0liEIqBPrkrC5QNJ0xOlFWRhf__Rj4D5cDS9L9g&utm_campaign=AmovieBmovieCD-1&movieBundleVersion=8012031&utm_source=meituan&utm_medium=android&utm_term=8.12.3&utm_content=440000000189785&ci=1&net=1&dModel=oneplus
%20
a5010&uuid=0000000000000A10631E76CD844099D6694316F7616BBA157797426456628307&channelId=1&lat=0.0&lng=0.0&refer=c_boybi6x4&version_name=8.12.3&machine_type=0"
.
format
(
str
(
offset
),
cat
,
source
,
year
,
sort
)
str
(
offset
),
cat
,
source
,
year
,
sort
)
proxies
=
get_proxy
(
4
)
requests_res
=
requests
.
get
(
url
,
headers
=
headers
,
proxies
=
proxies
,
allow_redirects
=
False
)
dev_list
=
requests_res
.
json
()
...
...
@@ -612,8 +613,8 @@ class Crawler_main(object):
try
:
if
int
(
res
[
"rt"
][:
4
])
<
2010
:
dic
=
{
"box_office"
:
""
,
"url"
:
"https://maoyan.com/films/
%
s"
%
keys
"box_office"
:
""
,
"url"
:
"https://maoyan.com/films/
%
s"
%
keys
}
self
.
parse_data
(
dic
,
keys
)
rds_get
.
delete
(
keys
)
...
...
@@ -622,21 +623,21 @@ class Crawler_main(object):
pass
headers
=
{
"Accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"Accept-Encoding"
:
"gzip, deflate"
,
"Accept-Language"
:
"zh,zh-CN;q=0.9"
,
"Connection"
:
"keep-alive"
,
# "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104",
"Host"
:
"m.maoyan.com"
,
"Upgrade-Insecure-Requests"
:
"1"
,
"User-Agent"
:
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
,
"Accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
,
"Accept-Encoding"
:
"gzip, deflate"
,
"Accept-Language"
:
"zh,zh-CN;q=0.9"
,
"Connection"
:
"keep-alive"
,
# "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104",
"Host"
:
"m.maoyan.com"
,
"Upgrade-Insecure-Requests"
:
"1"
,
"User-Agent"
:
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
,
}
# keys = 1249366
proxies
=
get_proxy
(
4
)
url
=
"http://m.maoyan.com/movie/{0}/box?_v_=yes&utm_campaign=AmovieBmovieD100&f=android&userid={1}"
.
format
(
keys
,
random
.
randint
(
265011000
,
265031000
))
page_source
=
requests
.
get
(
url
,
headers
=
headers
,
proxies
=
proxies
,
timeout
=
5
,
allow_redirects
=
False
)
keys
,
random
.
randint
(
265011000
,
265031000
))
page_source
=
requests
.
get
(
url
,
headers
=
headers
,
proxies
=
proxies
,
timeout
=
5
,
allow_redirects
=
False
)
# print(page_source.text)
try
:
page_json
=
re
.
findall
(
'AppData = (.*?);</script>'
,
page_source
.
text
)[
0
]
...
...
@@ -648,8 +649,8 @@ class Crawler_main(object):
# name = res_json.get("name")
box_office
=
res_json
.
get
(
"summary"
)
.
get
(
"mbox"
)
.
get
(
"sumBox"
)
dic
=
{
"box_office"
:
box_office
,
"url"
:
"https://maoyan.com/films/
%
s"
%
keys
"box_office"
:
box_office
,
"url"
:
"https://maoyan.com/films/
%
s"
%
keys
}
print
(
dic
)
self
.
parse_data
(
dic
,
keys
)
...
...
@@ -662,7 +663,7 @@ class Crawler_main(object):
if
__name__
==
"__main__"
:
if
args
.
style_tag
or
args
.
countries
:
Crawler_douban
=
Crawler_main
()
Crawler_douban
.
list_page
(
style_tags
=
args
.
style_tag
,
countries
=
args
.
countries
)
Crawler_douban
.
list_page
(
style_tags
=
args
.
style_tag
,
countries
=
args
.
countries
)
else
:
executor
=
ProcessPoolExecutor
(
max_workers
=
8
)
futures
=
[]
...
...
dev/xinyang_ask_tag/crawler_xinyang_ask_tag.py
0 → 100644
View file @
4d1d6e83
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
新氧https://www.soyoung.com/itemk// 页下各标签的问答
"""
import
numpy
as
np
import
random
import
argparse
import
json
,
redis
,
re
,
requests
from
selenium.webdriver
import
ActionChains
import
time
,
datetime
,
copy
from
selenium
import
webdriver
# from PIL import Image
import
os
from
selenium.webdriver.support.ui
import
WebDriverWait
# import cv2
from
fontTools.ttLib
import
*
from
crawler.crawler_sys.utils.trans_str_play_count_to_int
import
trans_play_count
from
crawler.crawler_sys.utils.trans_duration_str_to_second
import
trans_duration
from
concurrent.futures
import
ProcessPoolExecutor
from
lxml
import
etree
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
from
bs4
import
BeautifulSoup
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# Redis server shared by the three connections below; only the db index differs.
REDIS_HOST = '192.168.17.60'
REDIS_PORT = 6379

rds_list = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=15, decode_responses=True)

# Command-line options. They are parsed at import time and exposed as the
# module-level ``args`` namespace.
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int,
                    help='The max page numbers')
parser.add_argument('-t', '--style_tag', default="", type=str,
                    help='style_tag')
# The help text previously read 'style_tag' (copied from the option above).
parser.add_argument('-c', '--countries', default="", type=str,
                    help='countries')
args = parser.parse_args()
def revise_data():
    """Copy hashes that have no ``directors`` value from ``rds_list`` to ``rds_get``.

    Every key of the ``rds_list`` connection is read as a hash. When its
    ``"directors"`` field is missing or empty, the whole hash is written to
    ``rds_get`` under the same key. Nothing is returned.
    """
    for key in rds_list.scan_iter():
        record = rds_list.hgetall(key)
        # hgetall yields an empty dict when the key disappeared between the
        # scan and this read; hmset refuses an empty mapping, so skip it.
        if not record:
            continue
        if not record.get("directors"):
            rds_get.hmset(key, record)
class Crawler_main(object):
    """Crawl the question/answer entries under each tag of the soyoung item index.

    Three levels are walked: the top-level sections of the landing page, the
    second-level menu anchors inside a section, and the third-level items
    returned by the ``items/itemList`` JSON endpoint. One flat dict is
    produced per question/answer pair found on a third-level item page.

    Only plain HTTP requests are used. The instance can be used as a context
    manager; entering and leaving it currently does nothing.
    """

    def __init__(self):
        # Browser-like headers sent with every HTML page request.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        # Record template; no method of this class reads it.
        # NOTE(review): the "douban" platform value looks carried over from
        # another crawler -- confirm before relying on it.
        self.one_video_dic = {
            "platform": "douban",
            "title": "",
            "url": "",
            "describe": "",
        }

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Leave a ``with`` block; there is nothing to release.

        The three arguments follow the context-manager protocol and default
        to ``None`` so a bare ``obj.__exit__()`` call keeps working. Returns
        ``None``, so an exception raised inside the block propagates.
        """
        pass

    def list_page(self, releaserUrl="https://www.soyoung.com/itemk//",
                  tag_list_xpath=None,
                  ):
        """Yield one dict per question/answer pair reachable from the index page.

        Args:
            releaserUrl: URL of the tag index page to start from.
            tag_list_xpath: accepted for compatibility; not used.

        Yields:
            dict with the keys ``first_title``, ``second_title``,
            ``third_name``, ``third_name_info``, ``third_name_des``,
            ``third_name_url``, ``qa_title`` and ``qa_answer``.
        """
        # NOTE(review): a proxy is requested here but never handed to
        # requests.get. The call is kept in case get_proxy has side effects;
        # confirm and either use its result or remove it.
        get_proxy(0)
        response = requests.get(releaserUrl, headers=self.headers,
                                allow_redirects=False, timeout=5)
        page = etree.HTML(response.text)
        for section in page.xpath("/html[1]/body[1]/div"):
            try:
                tag_id = section.xpath("./@id")[0]
                print(tag_id)
                first_title = section.xpath("./div[1]/div[1]/text()")[0].strip()
                print("first_title", first_title)
            except Exception:
                # Top-level <div>s without an id or a title are not tag
                # sections; skip them.
                continue
            # Only sections whose id contains 'product100' are crawled.
            if 'product100' not in tag_id:
                continue
            menu_anchors = section.xpath("./div[1]/div[2]/div[1]/div[1]/a")
            menu_panels = section.xpath("./div[2]/div")
            # One anchor is consumed per panel. zip stops at the shorter list
            # instead of raising IndexError when there are fewer anchors than
            # panels.
            for anchor, _panel in zip(menu_anchors, menu_panels):
                second_title = anchor.xpath("./text()")[0].strip()
                second_id = anchor.xpath("./@data-id")[0].strip()
                print("second_title", second_title)
                for item in self.get_third_tag_list(second_id):
                    third_name = item.get("name")
                    third_name_info = item.get("one_feature")
                    third_name_des = item.get("summary")
                    try:
                        third_name_url = "https://www.soyoung.com/itemk/%s/" % item.get("seo").get("pinyin")
                    except (AttributeError, TypeError):
                        # No usable "seo" entry: leave the URL empty. The
                        # detail lookup below then produces one empty pair.
                        third_name_url = ""
                    print(first_title, second_title, third_name)
                    for qa_title, qa_answer in self.parse_single_data(third_name_url):
                        yield {
                            "first_title": first_title,
                            "second_title": second_title,
                            "third_name": third_name,
                            "third_name_info": third_name_info,
                            "third_name_des": third_name_des,
                            "third_name_url": third_name_url,
                            "qa_title": qa_title,
                            "qa_answer": qa_answer,
                        }

    def parse_single_data(self, data_url):
        """Yield ``(question, answer)`` text pairs from one item page.

        Args:
            data_url: URL of the item page to fetch.

        Yields:
            Tuples of two stripped strings, one per child ``<div>`` of the
            ``<section id='qa'>`` element. If fetching or parsing fails at any
            point, a single ``("", "")`` pair is yielded and iteration ends.
        """
        try:
            response = requests.get(data_url, headers=self.headers,
                                    allow_redirects=False, timeout=5)
            page = etree.HTML(response.text)
            for qa_node in page.xpath("//section[@id='qa']/div"):
                qa_title = qa_node.xpath("./div[1]/p[1]/text()")[0].strip()
                qa_answer = qa_node.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
                yield qa_title, qa_answer
        except Exception:
            # Exception (not a bare except) so that GeneratorExit raised by
            # closing this generator at the yield above is not intercepted;
            # yielding after GeneratorExit would raise RuntimeError.
            yield "", ""

    def get_third_tag_list(self, menu_id):
        """Fetch the third-level items of a second-level menu entry.

        Args:
            menu_id: value interpolated into the ``menu_id`` query parameter.

        Returns:
            The decoded JSON body of the response. ``list_page`` iterates it
            and calls ``.get`` on each element, so it is presumably a list of
            dicts -- confirm against the live endpoint.
        """
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
        response = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
        return response.json()
if __name__ == "__main__":
    # Script entry point: crawl every question/answer row and dump the rows
    # to CSV. "result.csv" is always written with whatever was collected;
    # "wrong.csv" is written as well when the crawl stops on an error.
    import traceback

    import pandas as pd

    data_list = []
    Crawler_xinyang = Crawler_main()
    try:
        for data in Crawler_xinyang.list_page():
            data_list.append(data)
    except Exception:
        # Report why the crawl stopped instead of swallowing it silently,
        # then keep the partial rows. Ctrl-C is no longer intercepted here;
        # the finally clause still saves the rows in that case.
        traceback.print_exc()
        res = pd.DataFrame(data_list)
        res.to_csv("wrong.csv", encoding="gb18030")
    finally:
        res = pd.DataFrame(data_list)
        res.to_csv("result.csv", encoding="gb18030")
    # revise_data()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment