Commit 6a26b3f6 authored by litaolemo

update

parent e2e6ec48
...@@ -767,7 +767,7 @@ class Crawler_toutiao(): ...@@ -767,7 +767,7 @@ class Crawler_toutiao():
return video_image_url return video_image_url
def get_web_article_info(self,article_id): def get_web_article_info(self,article_id,proxies_num=0):
# headers = { # headers = {
# "Accept": "*/*", # "Accept": "*/*",
# "Accept-Encoding": "gzip, deflate", # "Accept-Encoding": "gzip, deflate",
...@@ -799,7 +799,7 @@ class Crawler_toutiao(): ...@@ -799,7 +799,7 @@ class Crawler_toutiao():
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
} }
url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id,article_id) url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id,article_id)
requests_res = retry_get_url(url,headers=headers,proxies=0) requests_res = retry_get_url(url,headers=headers,proxies=proxies_num)
res_json = requests_res.json() res_json = requests_res.json()
res_dic = { res_dic = {
"title":res_json["data"].get("title").replace("\r","").replace("\n",""), "title":res_json["data"].get("title").replace("\r","").replace("\n",""),
...@@ -849,10 +849,6 @@ class Crawler_toutiao(): ...@@ -849,10 +849,6 @@ class Crawler_toutiao():
try: try:
proxies = get_proxy(proxies_num) proxies = get_proxy(proxies_num)
if proxies: if proxies:
# proxies = {
# "http": "http://127.0.0.1:80",
# "https": "http://127.0.0.1:443"
# }
get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10) get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10)
else: else:
get_page = requests.get(url, headers=self.headers, timeout=10) get_page = requests.get(url, headers=self.headers, timeout=10)
...@@ -927,7 +923,7 @@ class Crawler_toutiao(): ...@@ -927,7 +923,7 @@ class Crawler_toutiao():
video_dic['id'] = cal_doc_id(video_dic["platform"], url=video_dic["url"], doc_id_type='all-time-url', data_dict=video_dic) video_dic['id'] = cal_doc_id(video_dic["platform"], url=video_dic["url"], doc_id_type='all-time-url', data_dict=video_dic)
try: try:
article_info = self.get_web_article_info(info_dic.get('tag_id')) article_info = self.get_web_article_info(info_dic.get('tag_id'),proxies_num=proxies_num)
video_dic.update(article_info) video_dic.update(article_info)
except Exception as e: except Exception as e:
print("method get_web_article_info error %s" %e) print("method get_web_article_info error %s" %e)
...@@ -1882,7 +1878,7 @@ if __name__ == '__main__': ...@@ -1882,7 +1878,7 @@ if __name__ == '__main__':
for url in data_lis: for url in data_lis:
test.releaser_page_by_time(1595088000000, 1595319362610, url, output_to_es_raw=True, test.releaser_page_by_time(1595088000000, 1595319362610, url, output_to_es_raw=True,
es_index='crawler-data-raw', releaser_page_num_max=2, es_index='crawler-data-raw', releaser_page_num_max=2,
proxies_num=2 proxies_num=1
) )
# test.get_releaser_follower_num(url) # test.get_releaser_follower_num(url)
# test.get_releaser_image(releaserUrl=data_lis[0]) # test.get_releaser_image(releaserUrl=data_lis[0])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment