Commit 4d1d6e83 authored by 李小芳's avatar 李小芳

update

parent 626b43f5
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4"> <module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager"> <component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" /> <content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" /> <orderEntry type="jdk" jdkName="Python 3.6 (doris_env)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="PyDocumentationSettings"> <component name="PyDocumentationSettings">
......
...@@ -3,6 +3,41 @@ ...@@ -3,6 +3,41 @@
<component name="CsvFileAttributes"> <component name="CsvFileAttributes">
<option name="attributeMap"> <option name="attributeMap">
<map> <map>
<entry key="/crawler_sys/tools/video_num_count/crawler/haokan_青春旅社.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_1.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_cika.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/dev/xinyang_ask_tag/soyoung_service_write_cika.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/tasks/yangjingshu.csv"> <entry key="/tasks/yangjingshu.csv">
<value> <value>
<Attribute> <Attribute>
......
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (doris_env)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
</project> </project>
\ No newline at end of file
...@@ -60,7 +60,8 @@ def get_url(): ...@@ -60,7 +60,8 @@ def get_url():
if cat <= 5 and source < 14: if cat <= 5 and source < 14:
continue continue
yield cat,source,year,sort yield cat, source, year, sort
def revise_data(): def revise_data():
scan_re = rds_list.scan_iter() scan_re = rds_list.scan_iter()
...@@ -87,22 +88,22 @@ class Crawler_main(object): ...@@ -87,22 +88,22 @@ class Crawler_main(object):
self.chrome_options.add_experimental_option("prefs", prefs) self.chrome_options.add_experimental_option("prefs", prefs)
# self.driver = webdriver.Chrome(options=self.chrome_options) # self.driver = webdriver.Chrome(options=self.chrome_options)
self.one_video_dic = { self.one_video_dic = {
"platform": "douban", "platform": "douban",
"ID": "", "ID": "",
"title": "", "title": "",
"url": "", "url": "",
"directors": "", "directors": "",
"screenwriter": "", "screenwriter": "",
"casts": "", "casts": "",
"describe": "", "describe": "",
"year": "", "year": "",
"provider": "", "provider": "",
"style_tags": "", "style_tags": "",
"project_tags": "", "project_tags": "",
"language": "", "language": "",
"area": "", "area": "",
"rate": "", "rate": "",
"comment_count": "" "comment_count": ""
} }
...@@ -125,19 +126,19 @@ class Crawler_main(object): ...@@ -125,19 +126,19 @@ class Crawler_main(object):
): ):
offset = 30 offset = 30
headers = { headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate, br", "Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh,zh-CN;q=0.9", "Accept-Language": "zh,zh-CN;q=0.9",
"Connection": "keep-alive", "Connection": "keep-alive",
# "Cookie": '__mta=150368905.1577424190198.1577424190198.1577433956085.2; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; mojo-session-id={"id":"f35010c2739ba6f036e332417fe21f84","time":1577433601641}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577434032; __mta=150368905.1577424190198.1577433956085.1577434032548.3; mojo-trace-id=57; _lxsdk_s=16f465e962c-545-9da-13a%7C%7C64', # "Cookie": '__mta=150368905.1577424190198.1577424190198.1577433956085.2; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; mojo-session-id={"id":"f35010c2739ba6f036e332417fe21f84","time":1577433601641}; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577434032; __mta=150368905.1577424190198.1577433956085.1577434032548.3; mojo-trace-id=57; _lxsdk_s=16f465e962c-545-9da-13a%7C%7C64',
"Cookie": '__mta=150368905.1577424190198.1577931921073.1577933054583.8; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; lt=dwim2AyVn0Nr4tMQ1qCHf87HvVwAAAAAsQkAAGKVo4UF5isSHZyJ2F-6Yypd0YqL-FIGGMTWixcuMN23AhelN_OPNDA2hAk5IuCtNg; lt.sig=0AWWI8aMHZfmuLzGDO9hoKoZqT8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110; mojo-session-id={"id":"8d8eb79ab4cbaf8082e721ba64b73f3a","time":1577935255982}; mojo-trace-id=1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577935256; __mta=150368905.1577424190198.1577933054583.1577935256193.9; _lxsdk_s=16f64452341-fac-102-6a1%7C265018624%7C3', "Cookie": '__mta=150368905.1577424190198.1577931921073.1577933054583.8; uuid_n_v=v1; uuid=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; _csrf=c8be65f46b2c830502aa6a49c2f1aacb1660ffb3e3a6c4ae3623084677b66d7c; _lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; mojo-uuid=396ea3294dbf9178fa564b08543aed72; lt=dwim2AyVn0Nr4tMQ1qCHf87HvVwAAAAAsQkAAGKVo4UF5isSHZyJ2F-6Yypd0YqL-FIGGMTWixcuMN23AhelN_OPNDA2hAk5IuCtNg; lt.sig=0AWWI8aMHZfmuLzGDO9hoKoZqT8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110; mojo-session-id={"id":"8d8eb79ab4cbaf8082e721ba64b73f3a","time":1577935255982}; mojo-trace-id=1; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1577935256; __mta=150368905.1577424190198.1577933054583.1577935256193.9; _lxsdk_s=16f64452341-fac-102-6a1%7C265018624%7C3',
"Host": "maoyan.com", "Host": "maoyan.com",
"Referer": "https://maoyan.com/films?showType=3&offset=30", "Referer": "https://maoyan.com/films?showType=3&offset=30",
"Sec-Fetch-Mode": "navigate", "Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin", "Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1", "Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
} }
count_false = 0 count_false = 0
if args.max_page: if args.max_page:
...@@ -147,7 +148,7 @@ class Crawler_main(object): ...@@ -147,7 +148,7 @@ class Crawler_main(object):
time.sleep(0.5) time.sleep(0.5)
print("page ", offset) print("page ", offset)
url = "https://maoyan.com/films?showType=3&offset={0}".format( url = "https://maoyan.com/films?showType=3&offset={0}".format(
str(offset)) str(offset))
proxies = get_proxy(4) proxies = get_proxy(4)
requests_res = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False) requests_res = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False)
html = etree.HTML(requests_res.text) html = etree.HTML(requests_res.text)
...@@ -164,9 +165,9 @@ class Crawler_main(object): ...@@ -164,9 +165,9 @@ class Crawler_main(object):
for rate in rate_list: for rate in rate_list:
rate_str += rate rate_str += rate
data_dic = { data_dic = {
"url": url, "url": url,
"title": title, "title": title,
"rate": rate_str, "rate": rate_str,
} }
if style_tags: if style_tags:
...@@ -330,18 +331,18 @@ class Crawler_main(object): ...@@ -330,18 +331,18 @@ class Crawler_main(object):
# for handle in self.driver.window_handles: # for handle in self.driver.window_handles:
# self.driver.switch_to.window(handle) # self.driver.switch_to.window(handle)
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"accept-encoding": "gzip, deflate, br", "accept-encoding": "gzip, deflate, br",
"accept-language": "zh,zh-CN;q=0.9", "accept-language": "zh,zh-CN;q=0.9",
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132", # "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
# "referer": "https://v.qq.com/x/cover/%s.html" % url_id, # "referer": "https://v.qq.com/x/cover/%s.html" % url_id,
"sec-fetch-mode": "navigate", "sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin", "sec-fetch-site": "same-origin",
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
"sec-fetch-user": "?1", "sec-fetch-user": "?1",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"if-modified-since": "Thu, 19 Dec 2019 02:30:00 GMT" "if-modified-since": "Thu, 19 Dec 2019 02:30:00 GMT"
} }
try: try:
page_data = requests.get(res["url"], headers=headers, timeout=5) page_data = requests.get(res["url"], headers=headers, timeout=5)
...@@ -447,13 +448,13 @@ class Crawler_main(object): ...@@ -447,13 +448,13 @@ class Crawler_main(object):
except: except:
comment_count = 0 comment_count = 0
dic = { dic = {
"play_count_sum": play_count_sum, "play_count_sum": play_count_sum,
"duration": duration, "duration": duration,
"project_tags": project_tags, "project_tags": project_tags,
"title": title, "title": title,
"year": year, "year": year,
"video_count": 1, "video_count": 1,
"comment_count": comment_count "comment_count": comment_count
} }
project_name = keys project_name = keys
self.parse_data(dic, project_name) self.parse_data(dic, project_name)
...@@ -462,7 +463,7 @@ class Crawler_main(object): ...@@ -462,7 +463,7 @@ class Crawler_main(object):
dic["video_id"] = res["title"] dic["video_id"] = res["title"]
dic["play_count"] = res["play_count_sum"] dic["play_count"] = res["play_count_sum"]
one_video_dic = { one_video_dic = {
title: dic title: dic
} }
# print(one_video_dic) # print(one_video_dic)
self.one_video_page(title, one_video_dic, type="single") self.one_video_page(title, one_video_dic, type="single")
...@@ -473,15 +474,15 @@ class Crawler_main(object): ...@@ -473,15 +474,15 @@ class Crawler_main(object):
if type == "list": if type == "list":
url = one_video_dic[one_video]["url"] url = one_video_dic[one_video]["url"]
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"accept-encoding": "gzip, deflate, br", "accept-encoding": "gzip, deflate, br",
"accept-language": "zh,zh-CN;q=0.9", "accept-language": "zh,zh-CN;q=0.9",
# "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132", # "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
"referer": url, "referer": url,
"sec-fetch-mode": "navigate", "sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin", "sec-fetch-site": "same-origin",
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
} }
# print(url) # print(url)
url_lis = url.split(".", -1) url_lis = url.split(".", -1)
...@@ -511,18 +512,18 @@ class Crawler_main(object): ...@@ -511,18 +512,18 @@ class Crawler_main(object):
if not if_pay: if not if_pay:
if_pay = "" if_pay = ""
dic = { dic = {
"album": title, "album": title,
"video_title": video_title, "video_title": video_title,
"if_pay": if_pay, "if_pay": if_pay,
"comment_count": comment_count, "comment_count": comment_count,
"url": url, "url": url,
"video_url": single_data_url, "video_url": single_data_url,
"video_id": video_id, "video_id": video_id,
"duration": duration, "duration": duration,
"video_count": 1, "video_count": 1,
"play_count": play_count_sum, "play_count": play_count_sum,
"year": year, "year": year,
"project_tags": project_tags, "project_tags": project_tags,
} }
# print(dic) # print(dic)
...@@ -549,11 +550,11 @@ class Crawler_main(object): ...@@ -549,11 +550,11 @@ class Crawler_main(object):
): ):
offset = 0 offset = 0
headers = { headers = {
"Host": "api.maoyan.com", "Host": "api.maoyan.com",
"Connection": "Keep-Alive", "Connection": "Keep-Alive",
"Accept-Encoding": "gzip", "Accept-Encoding": "gzip",
"User-Agent": "AiMovie /Oneplus-6.0.1-oneplus a5010-0x0-0-null-0-000000000000000-null", "User-Agent": "AiMovie /Oneplus-6.0.1-oneplus a5010-0x0-0-null-0-000000000000000-null",
"mtgdid": "AAAAAAAAAAAAACh9V5sO1zmQc71i5gjpKNuww8T-JnDVTQHuVQFINVu2yYO8FhnCWl_Cqj2TMCWI983qEk_Ha5ayk_tXytbMWi4", "mtgdid": "AAAAAAAAAAAAACh9V5sO1zmQc71i5gjpKNuww8T-JnDVTQHuVQFINVu2yYO8FhnCWl_Cqj2TMCWI983qEk_Ha5ayk_tXytbMWi4",
} }
count_false = 0 count_false = 0
print(cat, source, year, sort) print(cat, source, year, sort)
...@@ -566,7 +567,7 @@ class Crawler_main(object): ...@@ -566,7 +567,7 @@ class Crawler_main(object):
time.sleep(0.1) time.sleep(0.1)
print("page ", offset) print("page ", offset)
url = "http://api.maoyan.com/mmdb/search/movie/tag/list.json?cityId=1&limit=100&offset={0}&catId={1}&sourceId={2}&yearId={3}&sortId={4}&token=7SJTJRCOW4fNMlp_xZDfgeI8qL0AAAAAsAkAADq-Y4OtjaaVeiysSdZtMsWTuGb0liEIqBPrkrC5QNJ0xOlFWRhf__Rj4D5cDS9L9g&utm_campaign=AmovieBmovieCD-1&movieBundleVersion=8012031&utm_source=meituan&utm_medium=android&utm_term=8.12.3&utm_content=440000000189785&ci=1&net=1&dModel=oneplus%20a5010&uuid=0000000000000A10631E76CD844099D6694316F7616BBA157797426456628307&channelId=1&lat=0.0&lng=0.0&refer=c_boybi6x4&version_name=8.12.3&machine_type=0".format( url = "http://api.maoyan.com/mmdb/search/movie/tag/list.json?cityId=1&limit=100&offset={0}&catId={1}&sourceId={2}&yearId={3}&sortId={4}&token=7SJTJRCOW4fNMlp_xZDfgeI8qL0AAAAAsAkAADq-Y4OtjaaVeiysSdZtMsWTuGb0liEIqBPrkrC5QNJ0xOlFWRhf__Rj4D5cDS9L9g&utm_campaign=AmovieBmovieCD-1&movieBundleVersion=8012031&utm_source=meituan&utm_medium=android&utm_term=8.12.3&utm_content=440000000189785&ci=1&net=1&dModel=oneplus%20a5010&uuid=0000000000000A10631E76CD844099D6694316F7616BBA157797426456628307&channelId=1&lat=0.0&lng=0.0&refer=c_boybi6x4&version_name=8.12.3&machine_type=0".format(
str(offset),cat,source,year,sort) str(offset), cat, source, year, sort)
proxies = get_proxy(4) proxies = get_proxy(4)
requests_res = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False) requests_res = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False)
dev_list = requests_res.json() dev_list = requests_res.json()
...@@ -612,8 +613,8 @@ class Crawler_main(object): ...@@ -612,8 +613,8 @@ class Crawler_main(object):
try: try:
if int(res["rt"][:4]) < 2010: if int(res["rt"][:4]) < 2010:
dic = { dic = {
"box_office": "", "box_office": "",
"url": "https://maoyan.com/films/%s" % keys "url": "https://maoyan.com/films/%s" % keys
} }
self.parse_data(dic, keys) self.parse_data(dic, keys)
rds_get.delete(keys) rds_get.delete(keys)
...@@ -622,21 +623,21 @@ class Crawler_main(object): ...@@ -622,21 +623,21 @@ class Crawler_main(object):
pass pass
headers = { headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate", "Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh,zh-CN;q=0.9", "Accept-Language": "zh,zh-CN;q=0.9",
"Connection": "keep-alive", "Connection": "keep-alive",
# "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104", # "Cookie": "_lxsdk_cuid=16f45cef868c8-03522a76dfe0d8-5f4e2917-161012-16f45cef868c8; _lxsdk=F6EC1BC0286811EAA13C754DA9FC705E01959D18445546A1A0F7A8FE8311D8BD; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1577424189,1577683110,1577942292; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __utma=17099173.1331545914.1577942309.1577942309.1577942309.1; __utmc=17099173; __utmz=17099173.1577942309.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __mta=150368905.1577424190198.1578028660590.1578044222257.17; _lxsdk_s=16f6ac3e790-0de-cab-b27%7C265018624%7C6; uuid_n_v=v1; iuuid=9C5AAEF02E0E11EAB981AB68C7AB1D51622E552FC52545AE9F3D31A0EE1F6A4F; webp=true; selectci=; ci=1%2C%E5%8C%97%E4%BA%AC; theme=maoyan; _last_page=undefined; latlng=39.908589%2C116.397316%2C1578045092790; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1578045104",
"Host": "m.maoyan.com", "Host": "m.maoyan.com",
"Upgrade-Insecure-Requests": "1", "Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
} }
# keys = 1249366 # keys = 1249366
proxies = get_proxy(4) proxies = get_proxy(4)
url = "http://m.maoyan.com/movie/{0}/box?_v_=yes&utm_campaign=AmovieBmovieD100&f=android&userid={1}".format( url = "http://m.maoyan.com/movie/{0}/box?_v_=yes&utm_campaign=AmovieBmovieD100&f=android&userid={1}".format(
keys,random.randint(265011000,265031000)) keys, random.randint(265011000, 265031000))
page_source = requests.get(url, headers=headers, proxies=proxies, timeout=5,allow_redirects=False) page_source = requests.get(url, headers=headers, proxies=proxies, timeout=5, allow_redirects=False)
# print(page_source.text) # print(page_source.text)
try: try:
page_json = re.findall('AppData = (.*?);</script>', page_source.text)[0] page_json = re.findall('AppData = (.*?);</script>', page_source.text)[0]
...@@ -648,8 +649,8 @@ class Crawler_main(object): ...@@ -648,8 +649,8 @@ class Crawler_main(object):
# name = res_json.get("name") # name = res_json.get("name")
box_office = res_json.get("summary").get("mbox").get("sumBox") box_office = res_json.get("summary").get("mbox").get("sumBox")
dic = { dic = {
"box_office": box_office, "box_office": box_office,
"url":"https://maoyan.com/films/%s" % keys "url": "https://maoyan.com/films/%s" % keys
} }
print(dic) print(dic)
self.parse_data(dic, keys) self.parse_data(dic, keys)
...@@ -662,7 +663,7 @@ class Crawler_main(object): ...@@ -662,7 +663,7 @@ class Crawler_main(object):
if __name__ == "__main__": if __name__ == "__main__":
if args.style_tag or args.countries: if args.style_tag or args.countries:
Crawler_douban = Crawler_main() Crawler_douban = Crawler_main()
Crawler_douban.list_page(style_tags=args.style_tag,countries=args.countries) Crawler_douban.list_page(style_tags=args.style_tag, countries=args.countries)
else: else:
executor = ProcessPoolExecutor(max_workers=8) executor = ProcessPoolExecutor(max_workers=8)
futures = [] futures = []
......
# -*- coding:utf-8 -*-
# @Time : 2019/12/27 15:49
# @Author : litao
"""
新氧https://www.soyoung.com/itemk// 页下各标签的问答
"""
import numpy as np
import random
import argparse
import json, redis, re, requests
from selenium.webdriver import ActionChains
import time, datetime, copy
from selenium import webdriver
# from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
# import cv2
from fontTools.ttLib import *
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from concurrent.futures import ProcessPoolExecutor
from lxml import etree
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from bs4 import BeautifulSoup
# Redis connections shared by the whole module.  decode_responses=True makes
# every reply come back as str instead of bytes.
# To run against a local Redis instance, use these 127.0.0.1 variants instead:
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)

# Command-line options.  They are parsed at import time, so importing this
# module from another program also consumes that program's sys.argv.
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--max_page', default=0, type=int,
                    help='The max page numbers')
parser.add_argument('-t', '--style_tag', default="", type=str,
                    help='style_tag')
# The help text used to be a copy of the --style_tag one.
parser.add_argument('-c', '--countries', default="", type=str,
                    help='countries')
args = parser.parse_args()
def revise_data():
    """Copy incomplete records from ``rds_list`` into ``rds_get``.

    Every key found by scanning ``rds_list`` is read as a hash.  A hash whose
    ``directors`` field is missing or empty is written under the same key
    into ``rds_get``; all other hashes are left alone.
    """
    for redis_key in rds_list.scan_iter():
        record = rds_list.hgetall(redis_key)
        if record.get("directors"):
            # Already has director information: nothing to copy.
            continue
        rds_get.hmset(redis_key, record)
class Crawler_main(object):
    """Crawler for the Q&A sections linked from https://www.soyoung.com/itemk//.

    ``list_page`` walks the catalogue page, asks ``get_third_tag_list`` for
    the items of every second-level tag and uses ``parse_single_data`` to
    pull the question/answer pairs out of each item page.

    The object can be used as a context manager; entering and leaving the
    ``with`` block currently has no side effects.
    """

    def __init__(self):
        # A Selenium/Chrome driver used to be configured here; it is disabled
        # and all pages are fetched with plain HTTP requests instead.

        # Browser-like headers sent with every HTML page request.
        # NOTE(review): the cookie is a hard-coded captured session and will
        # presumably expire - confirm whether the site still accepts it.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "_ga=GA1.2.193275787.1596185563; cookie_id=1605149277057691084; xysource=15154; PHPSESSID=e0ae5890a52041aa000765f7ddd6488b; __usersign__=1605149277224950668; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bi%3A0%3Bs%3A8%3A%22cityName%22%3Bs%3A0%3A%22%22%3Bs%3A8%3A%22cityCode%22%3Bi%3A0%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1605149278; _gid=GA1.2.1287643971.1605149278; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1605150670",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        # Template record; not read by any method of this class.
        # NOTE(review): "platform" is "douban" although this crawler targets
        # soyoung.com - looks carried over from another crawler, confirm.
        self.one_video_dic = {
            "platform": "douban",
            "title": "",
            "url": "",
            "describe": "",
        }

    def __enter__(self):
        """Return the crawler itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Leave a ``with`` block; there is nothing to release.

        The three parameters are the standard context-manager arguments and
        default to ``None`` so that a direct ``obj.__exit__()`` call keeps
        working.  Returning ``None`` lets any exception propagate.
        """
        # self.driver.close()
        return None

    def list_page(self, releaserUrl="https://www.soyoung.com/itemk//",
                  tag_list_xpath=None,
                  ):
        """Yield one dict per question/answer pair found under the catalogue.

        :param releaserUrl: URL of the catalogue page to start from.
        :param tag_list_xpath: accepted for interface compatibility; unused.
        :return: generator of dicts with the keys ``first_title``,
            ``second_title``, ``third_name``, ``third_name_info``,
            ``third_name_des``, ``third_name_url``, ``qa_title`` and
            ``qa_answer``.

        Only top-level sections whose ``id`` attribute contains
        ``product100`` are crawled.  Errors raised by the HTTP request for
        the catalogue page or by ``get_third_tag_list`` propagate to the
        caller.
        """
        # NOTE(review): the proxy is fetched but never passed to
        # requests.get below - confirm whether it should be used or dropped.
        proxies = get_proxy(0)
        requests_res = requests.get(releaserUrl, headers=self.headers, allow_redirects=False, timeout=5)
        page_obj = etree.HTML(requests_res.text)
        for first_title_obj in page_obj.xpath("/html[1]/body[1]/div"):
            try:
                tag_id = first_title_obj.xpath("./@id")[0]
                print(tag_id)
                first_title = first_title_obj.xpath("./div[1]/div[1]/text()")[0].strip()
                print("first_title", first_title)
            except Exception:
                # <div> without an id or a title text: not a catalogue section.
                continue
            if 'product100' not in tag_id:
                continue
            second_title_str_obj_list = first_title_obj.xpath("./div[1]/div[2]/div[1]/div[1]/a")
            second_title_obj_list = first_title_obj.xpath("./div[2]/div")
            # One <a> per content panel; the panel only bounds the iteration.
            # zip() stops at the shorter list instead of raising IndexError
            # when the two lists differ in length.
            for second_title_link, _panel in zip(second_title_str_obj_list, second_title_obj_list):
                second_title = second_title_link.xpath("./text()")[0].strip()
                second_id = second_title_link.xpath("./@data-id")[0].strip()
                print("second_title", second_title)
                for third_title_obj_product in self.get_third_tag_list(second_id):
                    third_name = third_title_obj_product.get("name")
                    third_name_info = third_title_obj_product.get("one_feature")
                    third_name_des = third_title_obj_product.get("summary")
                    try:
                        third_name_url = "https://www.soyoung.com/itemk/%s/" % third_title_obj_product.get(
                            "seo").get("pinyin")
                    except Exception:
                        # No usable "seo" entry: there is no detail page to visit.
                        third_name_url = ""
                    print(first_title, second_title, third_name)
                    for qa_title, qa_answer in self.parse_single_data(third_name_url):
                        yield {
                            "first_title": first_title,
                            "second_title": second_title,
                            "third_name": third_name,
                            "third_name_info": third_name_info,
                            "third_name_des": third_name_des,
                            "third_name_url": third_name_url,
                            "qa_title": qa_title,
                            "qa_answer": qa_answer,
                        }

    def parse_single_data(self, data_url):
        """Yield ``(question, answer)`` pairs from the Q&A section of a page.

        :param data_url: URL of the item page; may be an empty string.
        :return: generator of 2-tuples of str.  A single ``("", "")`` pair is
            yielded when ``data_url`` is empty, and one is yielded after any
            pairs already produced when fetching or parsing fails.
        """
        if not data_url:
            # Nothing to fetch; same placeholder as for a failed request.
            yield "", ""
            return
        try:
            requests_res = requests.get(data_url, headers=self.headers, allow_redirects=False, timeout=5)
            page_obj = etree.HTML(requests_res.text)
            for qa_obj in page_obj.xpath("//section[@id='qa']/div"):
                qa_title = qa_obj.xpath("./div[1]/p[1]/text()")[0].strip()
                qa_answer = qa_obj.xpath("./div[2]/p[1]/span[1]/text()")[0].strip()
                yield qa_title, qa_answer
        except Exception:
            # Exception (not a bare except) so that closing the generator or
            # pressing Ctrl-C is not swallowed and answered with another yield.
            yield "", ""

    def get_third_tag_list(self, menu_id):
        """Fetch the item list of one second-level tag.

        :param menu_id: value substituted into the ``menu_id`` query
            parameter of the item-list endpoint.
        :return: the decoded JSON body of the response.  ``list_page``
            iterates over it and calls ``.get`` on every element, so it is
            presumably a list of dicts - verify against the live endpoint.
        :raises: whatever ``requests.get`` or ``.json()`` raise; nothing is
            caught here.
        """
        # AJAX-style headers; no cookie is sent with this request.
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "referer": "https://www.soyoung.com/itemk//",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=%s" % menu_id
        requests_res = requests.get(url, headers=headers, allow_redirects=False, timeout=5)
        return requests_res.json()
if __name__ == "__main__":
    # Script entry point: crawl every Q&A pair and dump the rows to CSV.
    import traceback

    import pandas as pd

    data_list = []
    Crawler_xinyang = Crawler_main()
    try:
        # Append row by row so that everything collected before a failure
        # is still available to the handlers below.
        for data in Crawler_xinyang.list_page():
            data_list.append(data)
    except Exception:
        # Report what went wrong instead of discarding the error, then keep
        # the partial rows in a separate file.  Exception (not a bare except)
        # lets Ctrl-C terminate the script after the finally block has run.
        traceback.print_exc()
        pd.DataFrame(data_list).to_csv("wrong.csv", encoding="gb18030")
    finally:
        # Always written, with complete or partial data.
        pd.DataFrame(data_list).to_csv("result.csv", encoding="gb18030")
    # revise_data() is intentionally not run from here.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment