Commit 2838ff4e authored by litaolemo

update

parent 9f62ee57
...@@ -144,8 +144,7 @@ class CrawlerDouban(): ...@@ -144,8 +144,7 @@ class CrawlerDouban():
else: else:
get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout) get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout)
except: except:
get_page = None continue
has_more = False
if get_page and get_page.status_code == 200: if get_page and get_page.status_code == 200:
try: try:
page_json = get_page.json() page_json = get_page.json()
...@@ -157,35 +156,39 @@ class CrawlerDouban(): ...@@ -157,35 +156,39 @@ class CrawlerDouban():
except Exception as e: except Exception as e:
print("load data error %s" % e) print("load data error %s" % e)
continue continue
if page_dic: if page_dic:
for one in page_dic: for one in page_dic:
releaser_id = one["author"]["id"] try:
mid = one["id"] releaser_id = one["author"]["id"]
if True: mid = one["id"]
# try: if True:
res_dic = { # try:
"release_time": trans_strtime_to_timestamp(one["create_time"]), res_dic = {
"fetch_time": int(datetime.datetime.now().timestamp()*1e3), "release_time": trans_strtime_to_timestamp(one["create_time"]),
"url": one["url"], "fetch_time": int(datetime.datetime.now().timestamp()*1e3),
"releaser": one["author"]["name"], "url": one["url"],
"repost_count": None, "releaser": one["author"]["name"],
"comment_count": trans_play_count(one["comments_count"]), "repost_count": None,
"favorite_count": None, "comment_count": trans_play_count(one["comments_count"]),
"title": one["title"], "favorite_count": None,
"releaserUrl": "https://www.douban.com/people/%s" % releaser_id, "title": one["title"],
"releaser_id_str": "douban_%s" % releaser_id, "releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
'video_img':one["cover_url"], "releaser_id_str": "douban_%s" % releaser_id,
"mid":mid, 'video_img':one["cover_url"],
"platform":"douban", "mid":mid,
# "doc_id":doc_id "platform":"douban",
} # "doc_id":doc_id
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic, }
doc_id_type="all-time-url") doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
res_dic["doc_id"] = doc_id doc_id_type="all-time-url")
res_dic.update(self.get_single_page(mid,proxies_num)) res_dic["doc_id"] = doc_id
# print(res_dic) res_dic.update(self.get_single_page(mid,proxies_num))
# print(res_dic)
yield res_dic
yield res_dic
except Exception as e:
print("single data parse error %s " %e)
# except Exception as e: # except Exception as e:
# print(one) # print(one)
# print("row formate error %s" % e) # print("row formate error %s" % e)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment