Commit d540d564 authored by litaolemo's avatar litaolemo

update

parent fea8a745
......@@ -133,6 +133,8 @@ def scan_from_redis(push_rule_class_list) -> Dict:
comment_count = one_data.get("comment_count")
favorite_count = one_data.get("favorite_count")
continue
if one_data.get("article_type") != "article":
continue
for push_bool in push_rule_class_list:
bool_res = push_bool.parse_data(fetch_time_last=fetch_time, repost_count_last=repost_count,
comment_count_last=comment_count,
......@@ -315,11 +317,11 @@ def write_data_into_mysql(res_data):
sql_query = """insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=15,
user_id=random.choice(user_id_list), content=data["content"], is_online=0, status=2, platform=15,
content_level=data["level"],
is_excellent=0, create_time=now_str,
last_modified=now_str, user_del=0,
low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"])
low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=0, title=data["title"])
res = cur.execute(sql_query)
tractate_id = int(conn.insert_id())
if res:
......
......@@ -178,6 +178,7 @@ class CrawlerDouban():
'video_img':one["cover_url"],
"mid":mid,
"platform":"douban",
"article_type": "article"
# "doc_id":doc_id
}
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
......
......@@ -159,6 +159,11 @@ class Crawler_weibo():
text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
else:
text = mblog["raw_text"]
if mblog.get("page_info"):
article_type = mblog.get("page_info").get("type")
else:
article_type = "article"
res_dic = {
"release_time": trans_strtime_to_timestamp(mblog["created_at"]),
"fetch_time": int(datetime.datetime.now().timestamp()*1e3),
......@@ -176,6 +181,7 @@ class Crawler_weibo():
"releaser_id_str": "weibo_%s" % releaser_id,
"img_list":self.get_img(mblog),
"platform":"weibo",
"article_type": article_type
# "doc_id":doc_id
}
res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,
......
......@@ -179,6 +179,10 @@ class Crawler_weibo():
text,repost_count,comment_count,favorite_count = self.get_single_page(mid)
else:
text = mblog["raw_text"]
if mblog.get("page_info"):
article_type = mblog.get("page_info").get("type")
res_dic = {
"release_time": trans_strtime_to_timestamp(mblog["created_at"]),
"url": one["scheme"],
......@@ -193,7 +197,8 @@ class Crawler_weibo():
"mid":mid,
"releaserUrl":"https://www.weibo.com/u/%s" % releaser_id,
"releaser_id_str":"weibo_%s" % releaser_id,
"platform":"weibo"
"platform":"weibo",
"article_type":article_type
}
# from write_data_into_es.func_cal_doc_id import cal_doc_id
# id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment