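update: tag crawled douban/weibo items with article_type, skip non-"article" items in scan_from_redis, and insert api_tractate rows with is_online=0 and pgc_type=0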

d540d564 · litaolemo · fea8a745 · d540d564 · d540d564 · d540d564
Commit d540d564 authored Aug 04, 2020 by litaolemo
4 changed files
--- a/crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+++ b/crawler_sys/scheduler/cal_ni_and_put_to_backend.py
@@ -133,6 +133,8 @@ def scan_from_redis(push_rule_class_list) -> Dict:
                    comment_count = one_data.get("comment_count")
                    favorite_count = one_data.get("favorite_count")
                    continue
+                if one_data.get("article_type") != "article":
+                    continue
                for push_bool in push_rule_class_list:
                    bool_res = push_bool.parse_data(fetch_time_last=fetch_time, repost_count_last=repost_count,
                                                    comment_count_last=comment_count,
@@ -315,11 +317,11 @@ def write_data_into_mysql(res_data):
        sql_query = """insert into api_tractate 
            (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title) 
            values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
-            user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=15,
+            user_id=random.choice(user_id_list), content=data["content"], is_online=0, status=2, platform=15,
            content_level=data["level"],
            is_excellent=0, create_time=now_str,
            last_modified=now_str, user_del=0,
-            low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"])
+            low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=0, title=data["title"])
        res = cur.execute(sql_query)
        tractate_id = int(conn.insert_id())
        if res:

--- a/crawler_sys/site_crawler_by_redis/crawler_douban.py
+++ b/crawler_sys/site_crawler_by_redis/crawler_douban.py
@@ -178,6 +178,7 @@ class CrawlerDouban():
                                    'video_img':one["cover_url"],
                                    "mid":mid,
                                    "platform":"douban",
+                                    "article_type": "article"
                                    # "doc_id":doc_id
                                }
                                doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,

--- a/crawler_sys/site_crawler_by_redis/crawler_weibo.py
+++ b/crawler_sys/site_crawler_by_redis/crawler_weibo.py
@@ -159,6 +159,11 @@ class Crawler_weibo():
                                    text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]
+
+                                if mblog.get("page_info"):
+                                    article_type = mblog.get("page_info").get("type")
+                                else:
+                                    article_type = "article"
                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "fetch_time": int(datetime.datetime.now().timestamp()*1e3),
@@ -176,6 +181,7 @@ class Crawler_weibo():
                                    "releaser_id_str": "weibo_%s" % releaser_id,
                                    "img_list":self.get_img(mblog),
                                    "platform":"weibo",
+                                    "article_type": article_type
                                    # "doc_id":doc_id
                                }
                                res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,

--- a/crawler_sys/site_crawler_test/crawler_weibo.py
+++ b/crawler_sys/site_crawler_test/crawler_weibo.py
@@ -179,6 +179,10 @@ class Crawler_weibo():
                                    text,repost_count,comment_count,favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]
+
+                                if mblog.get("page_info"):
+                                    article_type = mblog.get("page_info").get("type")
+
                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "url": one["scheme"],
@@ -193,7 +197,8 @@ class Crawler_weibo():
                                    "mid":mid,
                                    "releaserUrl":"https://www.weibo.com/u/%s" % releaser_id,
                                    "releaser_id_str":"weibo_%s" % releaser_id,
-                                    "platform":"weibo"
+                                    "platform":"weibo",
+                                    "article_type":article_type
                                }
                                # from write_data_into_es.func_cal_doc_id import cal_doc_id
                                # id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,