Commit 9df8c75a authored by zhongshangwu

微博内容入库增加一级评论限制

parent 73481de5
...@@ -17,11 +17,6 @@ from engine.logger import info_logger, error_logger, logging_exception ...@@ -17,11 +17,6 @@ from engine.logger import info_logger, error_logger, logging_exception
IMAGE_SUFFIX = '-w' IMAGE_SUFFIX = '-w'
FILE_PATH = '/Users/zhongshangwu/workspace/gengmei/like/saturn/weibo/' FILE_PATH = '/Users/zhongshangwu/workspace/gengmei/like/saturn/weibo/'
# TODO
# 1. 图片裁剪上传
# 2. 过滤二级带图评论
# 3. 其他的过滤规则
#
class Command(BaseCommand): class Command(BaseCommand):
user_id_start = 241757306 # end 241806255 user_id_start = 241757306 # end 241806255
...@@ -37,7 +32,6 @@ class Command(BaseCommand): ...@@ -37,7 +32,6 @@ class Command(BaseCommand):
def get_random_user_id(self): def get_random_user_id(self):
# 随机获取马甲用户ID # 随机获取马甲用户ID
# return 241759142
while True: while True:
index = randint(0, 5000) index = randint(0, 5000)
user_id = self.user_id_start + index user_id = self.user_id_start + index
...@@ -171,7 +165,6 @@ class Command(BaseCommand): ...@@ -171,7 +165,6 @@ class Command(BaseCommand):
weibo_user_id=self.get_weibo_id(pictorial), weibo_user_id=self.get_weibo_id(pictorial),
platform=platform platform=platform
) )
# print("Pictorial user id:", pictorial['user_id'])
# 榜单名称取爬取内容的前20字符 # 榜单名称取爬取内容的前20字符
index_end = 20 index_end = 20
if len(pictorial.get('content')) < index_end: if len(pictorial.get('content')) < index_end:
...@@ -181,16 +174,13 @@ class Command(BaseCommand): ...@@ -181,16 +174,13 @@ class Command(BaseCommand):
pictorial['description'] = pictorial.get('content') pictorial['description'] = pictorial.get('content')
weibo_comments = pictorial.pop('comments', None) # --> 微博评论 weibo_comments = pictorial.pop('comments', None) # --> 微博评论
topics = [] # 一级带图评论 转化为内部的帖子
topic_count = 0
pictorial_comments = [] # 一级无图评论 转化为榜单的评论
first_pictorial_commennts = 0
# RPC 调用创建榜单 # RPC 调用创建榜单
pictorial_obj = rpc_invoker['venus/community/crawl/pictorial'](data=pictorial, platform=platform).unwrap() pictorial_obj = rpc_invoker['venus/community/crawl/pictorial'](data=pictorial, platform=platform).unwrap()
if not pictorial_obj: if not pictorial_obj:
self.create_faild_pictorial_list.append(pictorial) self.create_faild_pictorial_list.append(pictorial)
return None return None
pictorial_id = pictorial_obj.get('id') pictorial_id = pictorial_obj.get('id')
self.stats[weibo_id] = { self.stats[weibo_id] = {
"topics": {}, "topics": {},
...@@ -225,8 +215,6 @@ class Command(BaseCommand): ...@@ -225,8 +215,6 @@ class Command(BaseCommand):
self.create_faild_topic_list.append(topic.get('id')) self.create_faild_topic_list.append(topic.get('id'))
else: else:
# 创建帖子评论 # 创建帖子评论
# for topic_coment in replies:
# topic_coment["topic_id"] = topic_obj.get("id")
self.stats[weibo_id]["topics"][comment['id']] = { self.stats[weibo_id]["topics"][comment['id']] = {
"reply": [] "reply": []
} }
...@@ -240,6 +228,9 @@ class Command(BaseCommand): ...@@ -240,6 +228,9 @@ class Command(BaseCommand):
} }
else: # -> to pictorial comment else: # -> to pictorial comment
if len(self.stats[weibo_id]["first_comments"]) > 50:
continue
top_comments_obj = rpc_invoker['venus/community/crawl/replys'](data=[comment], platform=platform, pictorial_id=pictorial_id).unwrap() top_comments_obj = rpc_invoker['venus/community/crawl/replys'](data=[comment], platform=platform, pictorial_id=pictorial_id).unwrap()
if not top_comments_obj.get("reply_ids"): if not top_comments_obj.get("reply_ids"):
self.top_pictorial_error_comments.append(comment) self.top_pictorial_error_comments.append(comment)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment