Commit 209fc8b6 authored by litaolemo

update

parent 89892252
...@@ -325,6 +325,10 @@ class Crawler_toutiao(): ...@@ -325,6 +325,10 @@ class Crawler_toutiao():
url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id) url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id)
requests_res = retry_get_url(url, headers=headers, proxies=proxies_num) requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
res_json = requests_res.json() res_json = requests_res.json()
try:
content = res_json["data"].get("content").replace("\r", "").replace("\n", "")
except:
content = ""
res_dic = { res_dic = {
"title": res_json["data"].get("title").replace("\r", "").replace("\n", ""), "title": res_json["data"].get("title").replace("\r", "").replace("\n", ""),
'high_quality_flag': int(res_json["data"].get('high_quality_flag')), 'high_quality_flag': int(res_json["data"].get('high_quality_flag')),
...@@ -334,7 +338,7 @@ class Crawler_toutiao(): ...@@ -334,7 +338,7 @@ class Crawler_toutiao():
"favorite_count": res_json["data"].get("digg_count"), "favorite_count": res_json["data"].get("digg_count"),
'releaser_followers_count': res_json["data"].get("follower_count"), 'releaser_followers_count': res_json["data"].get("follower_count"),
'release_time': int(res_json["data"].get('publish_time') * 1e3), 'release_time': int(res_json["data"].get('publish_time') * 1e3),
"content": res_json["data"].get("content").replace("\r", "").replace("\n", ""), "content": content,
"img_list": re.findall('img src=".*?"', res_json["data"].get("content")) "img_list": re.findall('img src=".*?"', res_json["data"].get("content"))
} }
return res_dic return res_dic
...@@ -438,16 +442,14 @@ class Crawler_toutiao(): ...@@ -438,16 +442,14 @@ class Crawler_toutiao():
# The search api just return something seems related to search # The search api just return something seems related to search
print(e) print(e)
continue continue
else: if len(toutiao_Lst) >= 100:
break output_result(result_Lst=toutiao_Lst,
if len(toutiao_Lst) >= 100: platform=self.platform,
output_result(result_Lst=toutiao_Lst, output_to_es_raw=output_to_es_raw,
platform=self.platform, output_to_es_register=output_to_es_register,
output_to_es_raw=output_to_es_raw, es_index=es_index,
output_to_es_register=output_to_es_register, )
es_index=es_index, toutiao_Lst.clear()
)
toutiao_Lst.clear()
if toutiao_Lst != []: if toutiao_Lst != []:
output_result(result_Lst=toutiao_Lst, output_result(result_Lst=toutiao_Lst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment