Commit 1137e1c5 authored by haowang's avatar haowang

fix

parent 4155d54d
# Read the JSON files, call the API, and import the data into the database
import os
from datetime import datetime
import json
import requests
from random import randint
from api.views.base_view import BaseView
from api.utils.sensitive import Sensitive
from api.utils.upload import upload_image
from api.cache.cache import ins_cache
from libs.user import get_user_by_ids
from alpha_types.venus import ERROR
from alpha_types.venus import GRAP_PLATFORM
from engine.logger import info_logger, error_logger, logging_exception
# Root directory of the crawled JSON dumps; get_json_data_from_dir reads its
# 'topic/' and 'pictorial/' subdirectories.
# NOTE(review): this is an absolute path on one developer's machine — confirm
# before running anywhere else.
FILE_PATH = '/Users/haowei/Desktop/xhs/'
# Pictorial-creation endpoints (production, test, local loopback). None of
# them is referenced by the code visible in this file.
API_URL_PROD = 'http://saturn.iyanzhi.com/api/v1/pictorial/create'
API_URL_TEST = 'http://saturn.gmapp.env/api/v1/pictorial/create'
API_URL_BENDI = 'http://127.0.0.1:8004/api/v1/pictorial/create'
class ReplyBatchCreate(BaseView):
    """
    Comment crawling.

    Reads crawled JSON dumps from disk and pushes their content to the
    community service through ``self.call_rpc``.
    """
    # Base of the user-id range; get_random_user_id adds a random offset to it.
    user_id_start = 241757306 # end 241806255
    # Cache keys recorded by get_user_id and deleted again by del_cache.
    # NOTE(review): every list below is a class attribute, so its contents are
    # shared by all instances — confirm that this is intended.
    del_cache_keys = []
    # Not referenced by any method visible in this file.
    first_replys = []
    second_replys = []
    # Source ids of topics whose creation RPC returned an error (create_topic).
    create_faild_topic_list = []
    # Comment lists collected by create_topic / create_pictorial.
    total_topic_comments = []
    total_pictorial_comments = []
def del_cache(self):
    """Delete every cache entry whose key is recorded in ``del_cache_keys``.

    The list itself is left untouched; only the cache entries are removed.
    """
    for recorded_key in self.del_cache_keys:
        ins_cache.delete(recorded_key)
def get_random_user_id(self, max_attempts=1000):
    """Pick a random user id that the ``is_shadow`` RPC confirms.

    Each attempt draws ``user_id_start + randint(0, 5000)`` and asks
    ``venus/community/user/is_shadow`` about it. The id is accepted as soon
    as the response payload carries a truthy ``user_id``.

    Args:
        max_attempts: upper bound on the number of candidates tried. The
            original loop was unbounded and could spin forever.

    Returns:
        The first candidate id the RPC confirmed.

    Raises:
        RuntimeError: if no candidate was confirmed within ``max_attempts``.
    """
    for _ in range(max_attempts):
        user_id = self.user_id_start + randint(0, 5000)
        _error, data = self.call_rpc('venus/community/user/is_shadow', user_id=user_id)
        # A failed call may come back without a payload; the original called
        # ``.get`` on it unconditionally and crashed with AttributeError.
        if data and data.get('user_id'):
            return user_id
    raise RuntimeError(
        'no user id confirmed by is_shadow after {} attempts'.format(max_attempts)
    )
def get_user_id(self, id_, platform):
    """Map a crawled identifier to a user id, remembering the mapping in cache.

    The mapping is stored under ``grap:<platform>:<id_>``. On a cache miss a
    random user id is drawn until one is found that has no reservation entry
    under ``grap:<platform>:<user_id>``. That id is then reserved and the
    mapping is written.

    Args:
        id_: identifier taken from the crawled data.
        platform: platform code used as the key namespace.

    Returns:
        int: the user id mapped to ``id_``.
    """
    cache_key = 'grap:{}:{}'.format(platform, id_)
    cached = ins_cache.get(cache_key)
    if cached:
        return int(cached)

    # NOTE(review): mapping keys and reservation keys share the same
    # 'grap:<platform>:<x>' namespace, so a crawled id that equals a user id
    # would collide — confirm this cannot happen.
    while True:
        user_id = self.get_random_user_id()
        reservation_key = 'grap:{}:{}'.format(platform, user_id)
        if not ins_cache.get(reservation_key):
            ins_cache.set(reservation_key, id_)
            # Registered once; the original appended this same key a second
            # time right after the loop.
            self.del_cache_keys.append(reservation_key)
            break
    ins_cache.set(cache_key, user_id)
    # NOTE(review): ``cache_key`` is never registered for cleanup. The
    # duplicated append in the original may have been meant for it — confirm
    # whether the mapping is supposed to outlive del_cache().
    return user_id
def get_json_data_from_dir(self, is_topic=None, is_pictorial=None, base_path=None):
    """Load every JSON file found directly inside the topic or pictorial directory.

    Args:
        is_topic: truthy to read ``<base>/topic/``.
        is_pictorial: truthy to read ``<base>/pictorial/``. It wins when both
            flags are set, as in the original.
        base_path: root directory of the dumps; defaults to ``FILE_PATH``.

    Returns:
        list: one parsed document per file, ordered by file name. Empty when
        the directory does not exist.

    Raises:
        ValueError: if neither flag is set. The original failed with an
            ``UnboundLocalError`` in that case.
    """
    if is_pictorial:
        sub_dir = 'pictorial'
    elif is_topic:
        sub_dir = 'topic'
    else:
        raise ValueError('either is_topic or is_pictorial must be set')

    root = FILE_PATH if base_path is None else base_path
    dir_path = os.path.join(root, sub_dir)
    if not os.path.isdir(dir_path):
        return []

    ret = []
    # Only top-level files are read. The original os.walk loop kept the names
    # of the last directory visited and joined them to the root, which yields
    # wrong paths as soon as a subdirectory exists.
    for filename in sorted(os.listdir(dir_path)):
        full_path = os.path.join(dir_path, filename)
        # Hidden entries (e.g. .DS_Store) are not JSON dumps.
        if filename.startswith('.') or not os.path.isfile(full_path):
            continue
        ret.append(self.get_file_json_data(full_path))
    return ret
def get_file_json_data(self, file):
    """Parse one JSON file and return the decoded document.

    The file is decoded as UTF-8 and a leading byte-order mark, if present,
    is dropped by the ``utf-8-sig`` codec. The original relied on the
    platform's default encoding and removed the BOM with an
    encode/slice/decode round trip.

    Args:
        file: path of the JSON file.

    Returns:
        The object produced by ``json.loads``.
    """
    # Function-scope import: ``io`` is not among the module's imports.
    import io

    with io.open(file, 'r', encoding='utf-8-sig') as f:
        return json.loads(f.read())
def get_image_size(self, image_url):
    """Query the ``?imageInfo`` endpoint of an image URL for its dimensions.

    Args:
        image_url: URL of the image.

    Returns:
        tuple: ``(width, height)`` as reported by the endpoint, or
        ``(None, None)`` when the lookup fails. The original returned a bare
        ``None`` on failure, which broke the two-value unpacking in
        ``image_info``.
    """
    try:
        # NOTE(review): IMAGE_SUFFIX is neither defined nor imported in the
        # visible part of this file. If it is really missing, this line
        # raises NameError and every lookup ends in the except branch.
        url = image_url + IMAGE_SUFFIX + '?imageInfo'
        # A timeout keeps a stalled connection from blocking the import.
        response = requests.request("GET", url, timeout=10)
        info = response.json()
        return info.get('width'), info.get('height')
    except Exception:
        logging_exception()
        return None, None
def image_info(self, urls, max_retries=5):
    """Upload each image and collect its stored URL and dimensions.

    Args:
        urls: iterable of source image URLs.
        max_retries: how many times the upload and the size lookup are each
            attempted per image. The original retried both without limit, so
            one permanently failing image hung the whole run.

    Returns:
        list: one ``{'url', 'height', 'width'}`` dict per image that could be
        uploaded and measured. ``url`` has the
        ``http://alpha.gmeiapp.com/`` prefix removed. Images that still fail
        after ``max_retries`` attempts are reported and skipped.
    """
    ret = []
    for url in urls:
        image_url = None
        for _ in range(max_retries):
            image_url = upload_image(url)
            if image_url:
                break
        if not image_url:
            print('upload image failed', url)
            continue

        width = height = None
        for _ in range(max_retries):
            # Tolerate a bare None from the size lookup instead of failing on
            # the tuple unpacking.
            width, height = self.get_image_size(image_url) or (None, None)
            if width or height:
                break
        if not width and not height:
            print('get image size failed', image_url)
            continue

        ret.append(
            {
                'url': image_url.replace('http://alpha.gmeiapp.com/', ''),
                'height': height,
                'width': width,
            }
        )
    return ret
def revise_comments(self, comment, from_id):
    """Split a crawled comment into the top-level comment and its replies.

    The comment dict is modified in place: ``from_id`` is set, ``content`` is
    copied from ``comment`` and the ``reply`` list is removed. Each reply dict
    is tagged in place with the parent's ``from_id``, with ``reply_id`` (the
    parent's ``id``) and with the parent's ``type``.

    Args:
        comment: crawled comment dict.
        from_id: source identifier to stamp on the comment and its replies.

    Returns:
        tuple: ``(comment, replies)``; ``replies`` is an empty list when the
        comment has none.
    """
    comment['from_id'] = from_id
    comment['content'] = comment.get('comment')
    children = comment.pop('reply', None)

    tagged = []
    if children:
        for child in children:
            child.update(
                from_id=comment.get('from_id'),
                reply_id=comment.get('id'),
                type=comment.get('type'),
            )
            tagged.append(child)
    return comment, tagged
def create_comment(self, comment_list, from_id, platform, topic_id=None, pictorial_id=None):
    """Create a top-level comment and then its replies through the crawl RPC.

    Args:
        comment_list: crawled comment dict, optionally holding a ``reply`` list.
        from_id: source identifier stamped on the comment and its replies.
        platform: platform code forwarded to the RPC and to ``get_user_id``.
        topic_id: target topic id, forwarded to the RPC.
        pictorial_id: target pictorial id, forwarded to the RPC.

    Returns:
        tuple: ``(error, payload)`` when the first RPC fails, ``(error, None)``
        when the second one fails, ``(None, None)`` on success.
    """
    top_comment, replies = self.revise_comments(comment_list, from_id)
    top_comment['user_id'] = self.get_user_id(id_=top_comment.get('user').get('id'), platform=platform)
    top_comment.pop('user')
    error, ret = self.call_rpc(
        'venus/community/crawl/replys',
        data=[top_comment],
        platform=platform,
        topic_id=topic_id,
        pictorial_id=pictorial_id,
    )
    if error:
        return error, ret

    # Nothing left to do without second-level replies. The original still
    # indexed ``reply_ids`` and sent a second RPC with an empty payload.
    if not replies:
        return None, None

    top_id = ret.get('reply_ids')[0]
    for reply in replies:
        reply['user_id'] = self.get_user_id(id_=reply.get('user').get('id'), platform=platform)
        reply.pop('user')
    error, _ = self.call_rpc(
        'venus/community/crawl/replys',
        data=replies,
        platform=platform,
        topic_id=topic_id,
        pictorial_id=pictorial_id,
        top_id=top_id,
    )
    if error:
        return error, None
    return None, None
def create_topic(self, topics, platform):
    """Create one topic per crawled entry and collect the entries' comments.

    For every topic the images are uploaded, a user id is assigned from the
    topic's ``id`` and ``venus/community/crawl/topic`` is called. A failed
    topic is printed, recorded in ``create_faild_topic_list`` and skipped.
    Comments of created topics are stamped with the new topic id and gathered
    in ``total_topic_comments``.

    Args:
        topics: iterable of crawled topic dicts; they are modified in place.
        platform: platform code forwarded to the RPC and to ``get_user_id``.

    Returns:
        tuple: always ``(None, None)``.
    """
    for topic in topics:
        topic_comments = topic.pop('comments', None)
        # A topic without an 'image' entry is created with no images. The
        # original raised KeyError here and aborted the rest of the batch.
        topic['images'] = self.image_info(topic.pop('image', None) or [])
        topic['user_id'] = self.get_user_id(id_=topic.get('id'), platform=platform)
        error, topic_obj = self.call_rpc(
            'venus/community/crawl/topic', data=topic, platform=platform, pictorial_id=None
        )
        if error:
            print(error, topic.get('id'))
            self.create_faild_topic_list.append(topic.get('id'))
            continue
        if not topic_comments:
            continue
        # NOTE(review): the scraped source lost its indentation; the append is
        # assumed to sit inside the platform branch — confirm against the
        # repository.
        if platform == GRAP_PLATFORM.XIAOHONGSHU:
            created_id = topic_obj.get('id')
            for topic_comment in topic_comments:
                topic_comment['topic_id'] = created_id
            self.total_topic_comments.append(topic_comments)
    return None, None
def create_pictorial(self, pictorial, platform):
    """Create a pictorial and one topic per image that belongs to it.

    Args:
        pictorial: crawled pictorial dict; it is modified in place. A falsy
            value is a no-op.
        platform: platform code forwarded to the RPCs and to ``get_user_id``.

    Returns:
        tuple: ``(error, None)`` as soon as an RPC fails, otherwise
        ``(None, None)``.
    """
    if not pictorial:
        return None, None

    pictorial_comments = pictorial.pop('comments', None)
    # Tolerate a pictorial without an 'image' entry instead of raising KeyError.
    images = self.image_info(pictorial.pop('image', None) or [])
    source_id = pictorial.get('id')
    topics = [
        {
            # format() also works for integer ids; the original
            # ``id + str(index)`` raised TypeError for them.
            'id': '{}{}'.format(source_id, index),
            'content': pictorial.get('content'),
            'images': [image],
            'create_time': pictorial.get('create_time'),
            'user_id': self.get_user_id(id_=image.get('url'), platform=platform),
        }
        for index, image in enumerate(images, 1)
    ]

    pictorial['user_id'] = self.get_user_id(id_=source_id, platform=platform)
    pictorial['description'] = pictorial.get('content')
    # The pictorial name is the first 20 characters of the crawled content.
    # Slicing already copes with shorter text; the original cut at ``len - 1``
    # for short content and so dropped its last character.
    pictorial['name'] = (pictorial.get('content') or '')[:20]

    error, pictorial_obj = self.call_rpc(
        'venus/community/crawl/pictorial', data=pictorial, platform=platform
    )
    if error:
        return error, None
    pictorial_id = pictorial_obj.get('id')

    for topic in topics:
        error, _ = self.call_rpc(
            'venus/community/crawl/topic', data=topic, platform=platform, pictorial_id=pictorial_id
        )
        if error:
            return error, None

    # NOTE(review): the scraped source lost its indentation; the append is
    # assumed to sit inside the platform branch — confirm against the
    # repository.
    if pictorial_comments and platform == GRAP_PLATFORM.XIAOHONGSHU:
        for pictorial_comment in pictorial_comments:
            pictorial_comment['pictorial_id'] = pictorial_id
        self.total_pictorial_comments.append(pictorial_comments)
    return None, None
def process(self):
    """Entry point: load the crawled topic files, print them, then clean up.

    The calls that would actually create topics and pictorials are disabled
    (kept below as comments), so this currently only prints what was read.

    Returns:
        Whatever ``self.ok()`` returns.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S %f'
    platform = 4  # only used by the disabled import calls below

    # Topics
    started_at = datetime.now().strftime(stamp_format)
    print('----- start deal topic at {} -----'.format(started_at))
    for topic_batch in self.get_json_data_from_dir(is_topic=1):
        print(topic_batch)
        # self.create_topic(topics=topic_batch, platform=platform)
    # self.del_cache()
    finished_at = datetime.now().strftime(stamp_format)
    print('----- end deal topic at {} -----'.format(finished_at))

    # Pictorials (disabled)
    # print('----- start deal pictorial at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
    # pictorial_data = self.get_json_data_from_dir(is_pictorial=1)
    # print('----- end deal pictorial at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
    # if is_pictorial:
    #     for obj in topics:
    #         error, _ = self.create_pictorial(pictorial=obj, platform=platform)
    #         if error:
    #             self.del_cache()
    #             return self.error(error=error)

    self.del_cache()
    return self.ok()
if __name__ == "__main__":
    # Script entry point: build the view and run the import once.
    a = ReplyBatchCreate()
    a.process()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment