update

2838ff4e · litaolemo · 9f62ee57 · 2838ff4e
Commit 2838ff4e authored Jul 28, 2020 by litaolemo
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 29 deletions

crawler_douban.py crawler_sys/site_crawler_by_redis/crawler_douban.py +32 -29

No files found.
--- a/crawler_sys/site_crawler_by_redis/crawler_douban.py
+++ b/crawler_sys/site_crawler_by_redis/crawler_douban.py
@@ -144,8 +144,7 @@ class CrawlerDouban():
                else:
                    get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout)
            except:
-                get_page = None
-                has_more = False
+                continue
            if get_page and get_page.status_code == 200:
                try:
                    page_json = get_page.json()
@@ -157,35 +156,39 @@ class CrawlerDouban():
                except Exception as e:
                    print("load data error %s" % e)
                    continue
+
                if page_dic:
                    for one in page_dic:
-                        releaser_id = one["author"]["id"]
-                        mid = one["id"]
-                        if True:
-                        # try:
-                            res_dic = {
-                                "release_time": trans_strtime_to_timestamp(one["create_time"]),
-                                "fetch_time": int(datetime.datetime.now().timestamp()*1e3),
-                                "url": one["url"],
-                                "releaser": one["author"]["name"],
-                                "repost_count": None,
-                                "comment_count": trans_play_count(one["comments_count"]),
-                                "favorite_count": None,
-                                "title": one["title"],
-                                "releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
-                                "releaser_id_str": "douban_%s" % releaser_id,
-                                'video_img':one["cover_url"],
-                                "mid":mid,
-                                "platform":"douban",
-                                # "doc_id":doc_id
-                            }
-                            doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
-                                                doc_id_type="all-time-url")
-                            res_dic["doc_id"] = doc_id
-                            res_dic.update(self.get_single_page(mid,proxies_num))
-                            # print(res_dic)
-
-                            yield res_dic
+                        try:
+                            releaser_id = one["author"]["id"]
+                            mid = one["id"]
+                            if True:
+                            # try:
+                                res_dic = {
+                                    "release_time": trans_strtime_to_timestamp(one["create_time"]),
+                                    "fetch_time": int(datetime.datetime.now().timestamp()*1e3),
+                                    "url": one["url"],
+                                    "releaser": one["author"]["name"],
+                                    "repost_count": None,
+                                    "comment_count": trans_play_count(one["comments_count"]),
+                                    "favorite_count": None,
+                                    "title": one["title"],
+                                    "releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
+                                    "releaser_id_str": "douban_%s" % releaser_id,
+                                    'video_img':one["cover_url"],
+                                    "mid":mid,
+                                    "platform":"douban",
+                                    # "doc_id":doc_id
+                                }
+                                doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
+                                                    doc_id_type="all-time-url")
+                                res_dic["doc_id"] = doc_id
+                                res_dic.update(self.get_single_page(mid,proxies_num))
+                                # print(res_dic)
+    
+                                yield res_dic
+                        except Exception as e:
+                            print("single data parse error %s " %e)
                        # except Exception as e:
                        #     print(one)
                        #     print("row formate error %s" % e)