Commit dd262629 authored by zhongshangwu

修复小红书入库马甲用户逻辑

parent 3bffffdb
Pipeline #1978 passed with stage
......@@ -3,6 +3,7 @@ import os
from datetime import datetime
import json
import requests
import random
from random import randint
from django.core.management import BaseCommand
from engine.rpc import rpc_invoker
......@@ -16,7 +17,7 @@ from engine.logger import info_logger, error_logger, logging_exception
# Image-name suffix; where it is applied is not visible in this part of the file.
IMAGE_SUFFIX = '-w'
# Presumably the directory scanned for crawled JSON files (confirm against the
# directory-reading helper). Only one assignment is kept: an earlier value was
# immediately overwritten and therefore never used.
# NOTE(review): this is a developer-local absolute path; consider reading it
# from settings or a command option.
FILE_PATH = '/Users/zhongshangwu/workspace/gengmei/like/saturn/xiaohongshu/'
class Command(BaseCommand):
......@@ -27,7 +28,7 @@ class Command(BaseCommand):
# Cache keys collected during a run; del_cache() deletes each one from ins_cache.
del_cache_keys = []
# Accumulators for items whose creation failed (names suggest topics, pictorials
# and top-level comments; how they are filled is not visible here).
# NOTE(review): these are class-level mutable lists, so they are shared by every
# instance unless rebound. Confirm that is intended.
create_faild_topic_list = []
create_faild_pictorial_list = []
top_comment_error_create = []
# Extended with comment dicts while topic comments are processed elsewhere in this class.
second_topic_comments = []
......@@ -37,23 +38,32 @@ class Command(BaseCommand):
# Error accumulators for second-level pictorial comments and replies
# (presumably filled on failed creation; TODO confirm against the callers).
second_pictorial_error_comments = []
second_reply_error_create = []
# Shadow-user ids; load_shadow_users() rebinds this on the instance with the RPC result.
shadow_user_ids = []
def del_cache(self):
    """Remove every key listed in ``self.del_cache_keys`` from ``ins_cache``."""
    for cache_key in self.del_cache_keys:
        ins_cache.delete(cache_key)
def load_shadow_users(self):
    """Fetch the shadow users for this run and keep them on the instance.

    Calls the ``venus/community/crawl/load_shawdow_user`` RPC with
    ``start_user_id=self.user_id_start`` and
    ``end_user_id=self.user_id_start + 5000``, prints the unwrapped result,
    and stores it in ``self.shadow_user_ids``.
    """
    fetch_shadow_users = rpc_invoker['venus/community/crawl/load_shawdow_user']
    first_id = self.user_id_start
    last_id = self.user_id_start + 5000
    shadow_users = fetch_shadow_users(
        start_user_id=first_id,
        end_user_id=last_id,
    ).unwrap()
    print(shadow_users, ">>>>>")
    self.shadow_user_ids = shadow_users
def get_random_user_id(self):
    """Return a randomly chosen id from ``self.shadow_user_ids``.

    The ids are expected to have been loaded beforehand by
    ``load_shadow_users()``. Picking from that preloaded collection replaces
    the earlier approach of probing random ids with one RPC per attempt in an
    unbounded loop, which also left this return unreachable.

    Raises:
        IndexError: if no shadow-user ids have been loaded.
    """
    candidates = self.shadow_user_ids
    if not candidates:
        # Same exception type random.choice would raise, with an actionable message.
        raise IndexError('shadow_user_ids is empty; call load_shadow_users() first')
    return random.choice(candidates)
def get_user_id(self, id_, platform):
# 获取用户ID 缓存记录保留用户关系
......@@ -85,12 +95,12 @@ class Command(BaseCommand):
filenames = []
for _, _, names in os.walk(file_path):
filenames = names
for filename in filenames:
ret.append(self.get_file_json_data(file_path + filename))
return ret
def get_file_json_data(self, file):
# 获取文件数据
data = None
......@@ -98,9 +108,9 @@ class Command(BaseCommand):
content = f.read()
if content.startswith(u'\ufeff'):
content = content.encode('utf8')[3:].decode('utf8')
data = json.loads(content)
return data
def get_image_size(self, image_url):
......@@ -142,7 +152,7 @@ class Command(BaseCommand):
for info in reply:
info['reply_id'] = comment.get('id')
info['type'] = comment.get('type')
ret.append(info)
return comment, ret
......@@ -187,7 +197,7 @@ class Command(BaseCommand):
obj['user_id'] = self.get_user_id(id_=obj.get('user').get('id'), platform=platform)
obj['top_id'] = top_id
obj.pop('user')
# rpc_invoker['venus/community/crawl/replys'](data=comments, platform=platform, topic_id=topic_id, pictorial_id=pictorial_id, top_id=top_id).unwrap()
self.second_topic_comments.extend(comments)
# print('-------- topic_comments:', self.second_topic_comments)
......@@ -211,7 +221,7 @@ class Command(BaseCommand):
'create_time': pictorial.get('create_time'),
'user_id': self.get_user_id(id_=obj.get('url'), platform=platform)
})
pictorial['user_id'] = self.get_user_id(id_=pictorial.get('id'), platform=platform)
pictorial['description'] = pictorial.get('content')
......@@ -248,14 +258,14 @@ class Command(BaseCommand):
obj['user_id'] = self.get_user_id(id_=obj.get('user').get('id'), platform=platform)
obj['top_id'] = top_id
obj.pop('user')
# rpc_invoker['venus/community/crawl/replys'](data=comments, platform=platform, topic_id=topic_id, pictorial_id=pictorial_id, top_id=top_id).unwrap()
self.second_pictorial_comments.extend(comments)
return None, None
def handle(self, *args, **options):
self.load_shadow_users()
platform = 4
# 帖子
print('----- start deal topic at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
......@@ -264,7 +274,7 @@ class Command(BaseCommand):
print('-------- create_faild_topic_list:', len(self.create_faild_topic_list))
print('-------- second_topic_comments:', len(self.second_topic_comments))
print('----- end deal topic at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
# 榜单
print('----- start deal pictorial at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
pictorial_data = self.get_json_data_from_dir(is_pictorial=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment