Commit 209fc8b6 authored by litaolemo

update

parent 89892252
......@@ -325,6 +325,10 @@ class Crawler_toutiao():
url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id)
requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
res_json = requests_res.json()
try:
content = res_json["data"].get("content").replace("\r", "").replace("\n", "")
except:
content = ""
res_dic = {
"title": res_json["data"].get("title").replace("\r", "").replace("\n", ""),
'high_quality_flag': int(res_json["data"].get('high_quality_flag')),
......@@ -334,7 +338,7 @@ class Crawler_toutiao():
"favorite_count": res_json["data"].get("digg_count"),
'releaser_followers_count': res_json["data"].get("follower_count"),
'release_time': int(res_json["data"].get('publish_time') * 1e3),
"content": res_json["data"].get("content").replace("\r", "").replace("\n", ""),
"content": content,
"img_list": re.findall('img src=".*?"', res_json["data"].get("content"))
}
return res_dic
......@@ -438,16 +442,14 @@ class Crawler_toutiao():
# The search API sometimes just returns something that only seems related to the search
print(e)
continue
else:
break
if len(toutiao_Lst) >= 100:
output_result(result_Lst=toutiao_Lst,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
)
toutiao_Lst.clear()
if len(toutiao_Lst) >= 100:
output_result(result_Lst=toutiao_Lst,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
)
toutiao_Lst.clear()
if toutiao_Lst != []:
output_result(result_Lst=toutiao_Lst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment