Commit 9343b777 authored by zhongshangwu's avatar zhongshangwu

微博数据入库

parent f6a5d5d3
""" 微博帖子入库榜单脚本 """
import time
import os
import re
from datetime import datetime
import json
import requests
from random import randint
from django.core.management import BaseCommand
from engine.rpc import rpc_invoker
from api.utils.upload import upload_weibo_image
from api.cache.cache import ins_cache
from engine.logger import info_logger, error_logger, logging_exception
IMAGE_SUFFIX = '-w'
FILE_PATH = '/Users/zhongshangwu/workspace/gengmei/like/saturn/api/management/commands/'
# TODO
# 1. 图片裁剪上传
# 2. 过滤二级带图评论
# 3. 其他的过滤规则
#
class Command(BaseCommand):
user_id_start = 241757306 # end 241806255
del_cache_keys = []
create_faild_topic_list = []
create_faild_pictorial_list = []
top_pictorial_error_comments = []
second_pictorial_error_comments = []
topic_error_comments = []
stats = {}
def get_random_user_id(self):
# 随机获取马甲用户ID
# return 241759142
while True:
index = randint(0, 5000)
user_id = self.user_id_start + index
data = rpc_invoker['venus/community/user/is_shadow'](user_id=user_id).unwrap()
if not data:
continue
ret = data.get('user_id')
if ret:
return user_id
def get_user_id(self, weibo_id, weibo_user_id, platform):
""" 同一条博文下面的 “关系” 需要平移过来
通过 Redis Hash 维持用户关系: key(weibo_user_id) --> value(internal_user_id)
"""
weibo_cache_key = "grap:{0}:{1}".format(platform, weibo_id)
user_id = ins_cache.hget(weibo_cache_key, weibo_user_id)
if not user_id:
user_id = self.get_random_user_id()
ins_cache.hset(weibo_cache_key, weibo_user_id, user_id)
else:
user_id = int(user_id)
return user_id
def get_weibo_id(self, content):
return content.get('user', {}).get('id')
def get_json_data_from_dir(self, is_topic=None, is_pictorial=None):
# 获取目录文件数据
ret = []
if is_topic:
file_path = FILE_PATH + 'topic/'
if is_pictorial:
file_path = FILE_PATH + 'pictorial/'
filenames = []
for _, _, names in os.walk(file_path):
filenames = names
for filename in filenames:
ret.append(self.get_file_json_data(file_path + filename))
return ret
def get_file_json_data(self, file):
# 获取文件数据
data = None
with open(file, 'r') as f:
content = f.read()
if content.startswith(u'\ufeff'):
content = content.encode('utf8')[3:].decode('utf8')
data = json.loads(content)
return data
def get_image_size(self, image_url):
# 获取图片宽高
try:
url = image_url + IMAGE_SUFFIX + '?imageInfo'
response = requests.request("GET", url)
info = response.json()
return info.get('width'), info.get('height')
except Exception as e:
logging_exception()
return None, None
def image_info(self, urls):
# 获取图片信息
ret = []
for url in urls:
image_url = upload_weibo_image(url)
while not image_url:
image_url = upload_weibo_image(url)
width, height = self.get_image_size(image_url)
while not width and not height:
width, height = self.get_image_size(image_url)
print(image_url)
ret.append(
{
'url': image_url.replace('http://alpha.gmeiapp.com/', ''),
'height': height,
'width': width,
}
)
return ret
def revise_comments(self, weibo_id, comment, platform):
""" 拆分 一级和二级评论 """
replies = []
if self.filter_first_comment(comment["content"], comment["images"], comment["reply"]):
return None, []
reply = comment.pop('reply', None)
comment["user_id"] = self.get_user_id(weibo_id, self.get_weibo_id(comment), platform)
comment.pop("user", None)
if not reply:
return comment, replies
images = []
if not comment["images"]:
normal_images = set(reply[0]["images"])
for info in reply[1:]:
info_images = set(info["images"])
normal_images = normal_images & info_images
comment["images"] = list(normal_images)
for info in reply:
if self.filter_second_comment(info["content"], comment["images"], info["images"]):
continue
info['reply_id'] = comment.get('id') # 通过微博平台的被评论ID, 在我们平台创建时找到对应的被评论ID
info['user_id'] = self.get_user_id(weibo_id, self.get_weibo_id(info), platform)
info.pop("user", None)
info.pop("images", None) # 二级评论不需要带图
replies.append(info)
return comment, replies
def create_pictorial(self, pictorial, platform):
""" 创建榜单内容 """
pictorial["content"] = pictorial["content"].replace("#", " ")
if self.filter_weibo(pictorial["content"], pictorial["images"]):
return None
weibo_id = pictorial.get('id')
pictorial['user_id'] = self.get_user_id(
weibo_id=weibo_id,
weibo_user_id=self.get_weibo_id(pictorial),
platform=platform
)
# print("Pictorial user id:", pictorial['user_id'])
# 榜单名称取爬取内容的前20字符
index_end = 20
if len(pictorial.get('content')) < index_end:
index_end = len(pictorial.get('content')) - 1
pictorial['name'] = pictorial.get('content')[:index_end]
pictorial['description'] = pictorial.get('content')
weibo_comments = pictorial.pop('comments', None) # --> 微博评论
topics = [] # 一级带图评论 转化为内部的帖子
topic_count = 0
pictorial_comments = [] # 一级无图评论 转化为榜单的评论
first_pictorial_commennts = 0
# RPC 调用创建榜单
pictorial_obj = rpc_invoker['venus/community/crawl/pictorial'](data=pictorial, platform=platform).unwrap()
if not pictorial_obj:
self.create_faild_pictorial_list.append(pictorial)
return None
pictorial_id = pictorial_obj.get('id')
self.stats[weibo_id] = {
"topics": {},
"first_comments": {}
}
# 处理微博博文评论
for idx, weibo_comment in enumerate(self.filter_duplicate_comments(weibo_comments)):
user_id = self.get_user_id(
weibo_id=weibo_id,
weibo_user_id=self.get_weibo_id(weibo_comment),
platform=platform
)
comment, replies = self.revise_comments(weibo_id, weibo_comment, platform)
if not comment:
continue
if comment["images"]: # -> to topic
images = self.image_info(comment.pop('images'))
topic = {
# 'id': pictorial_id + str(idx),
'id': comment['id'],
'content': comment.get('content', ''),
'images': images,
'create_time': comment.get('create_time'),
'user_id': user_id,
}
topic_obj = rpc_invoker['venus/community/crawl/topic'](data=topic, platform=platform, pictorial_id=pictorial_id).unwrap()
if not topic_obj:
self.create_faild_topic_list.append(topic.get('id'))
else:
# 创建帖子评论
# for topic_coment in replies:
# topic_coment["topic_id"] = topic_obj.get("id")
self.stats[weibo_id]["topics"][comment['id']] = {
"reply": []
}
replies = replies[:50]
ret = rpc_invoker['venus/community/crawl/replys'](data=replies, platform=platform, topic_id=topic_obj.get('id'), pictorial_id=None).unwrap()
if not ret:
self.topic_error_comments.extend(replies)
else:
self.stats[weibo_id]["topics"][comment['id']] = {
"reply": [item["id"] for item in replies]
}
else: # -> to pictorial comment
top_comments_obj = rpc_invoker['venus/community/crawl/replys'](data=[comment], platform=platform, pictorial_id=pictorial_id).unwrap()
if not top_comments_obj.get("reply_ids"):
self.top_pictorial_error_comments.append(comment)
else:
# 创建榜单二级评论
self.stats[weibo_id]["first_comments"][comment['id']] = {
"reply": []
}
replies = replies[:50]
for reply in replies:
reply["top_id"] = top_comments_obj.get("reply_ids")[0]
ret = rpc_invoker['venus/community/crawl/replys'](data=replies, platform=platform, topic_id=None, pictorial_id=pictorial_id).unwrap()
if not ret:
self.second_pictorial_error_comments.extend(replies)
else:
self.stats[weibo_id]["first_comments"][comment['id']] = {
"reply": [item["id"] for item in replies]
}
def filter_duplicate_comments(self, comments):
""" 过滤重复的评论 """
exists_comment = set()
ret = []
for comment in comments[::-1]:
if comment["id"] not in exists_comment:
exists_comment.add(comment["id"])
ret.append(comment)
return ret[::-1]
def filter_weibo(self, content, images):
""" 过滤 """
return (
self.filter_normal(content)
or self.filter_gif_images(images)
)
def filter_first_comment(self, content, images, replies):
""" 过滤一级评论 """
return (
self.filter_normal(content)
or self.filter_gif_images(images)
or not replies
)
def filter_second_comment(self, content, first_images, images):
""" 过滤二级评论 """
return (
self.filter_normal(content)
or self.filter_second_images(first_images, images)
)
def filter_gif_images(self, images):
for image in images:
if ".gif" in image:
return True
return False
def filter_second_images(self, first_images, images):
""" 过滤二级评论的图片 """
first_images = set(first_images or [])
images = set(images or [])
return images - first_images
def filter_normal(self, content):
if "@" in content:
return True
if "榜姐" in content or "渣浪" in content or "微博" in content:
return True
if "收起全文" in content:
return True
if "http://" in content or "https://" in content:
return True
regex = re.compile("#[^#]+#")
return regex.search(content)
def handle(self, *args, **options):
# print(upload_weibo_image("http://wx2.sinaimg.cn/woriginal/bbeff396ly1fxqi1wl2hqj20go0bvabl.jpg"))
platform = 5 # 微博
# 榜单
print('----- start deal weibo blog at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
pictorial_data = self.get_json_data_from_dir(is_pictorial=1)
for idx, pictorial in enumerate(pictorial_data):
print('------- current pictorial idx :', idx)
start = time.time()
ret = self.create_pictorial(pictorial=pictorial, platform=platform)
print("escape:", time.time() - start)
if not ret:
print("create failed weibo blog:", pictorial["id"])
import pprint
pprint.pprint(self.stats)
print('-------- create_faild_topic_list:', len(self.create_faild_topic_list))
print('-------- topic_error_comments:', len(self.topic_error_comments))
print('-------- create_faild_pictorial_list:', len(self.create_faild_pictorial_list))
print('-------- top_pictorial_error_comments:', len(self.top_pictorial_error_comments))
print('-------- second_pictorial_error_comments:', len(self.second_pictorial_error_comments))
\ No newline at end of file
import requests import requests
from gm_upload import upload, upload_file from gm_upload import upload, upload_file
from gm_upload import IMG_TYPE from gm_upload import IMG_TYPE
import io
from PIL import Image
# import numpy as np
def upload_image(url, img_type=IMG_TYPE.TOPIC): def upload_image(url, img_type=IMG_TYPE.TOPIC):
...@@ -10,3 +13,24 @@ def upload_image(url, img_type=IMG_TYPE.TOPIC): ...@@ -10,3 +13,24 @@ def upload_image(url, img_type=IMG_TYPE.TOPIC):
return upload(response.content, img_type=img_type) return upload(response.content, img_type=img_type)
except: except:
return None return None
def upload_weibo_image(url, img_type=IMG_TYPE.TOPIC):
'''非站内图片处理'''
try:
response = requests.get(url)
img = Image.open(io.BytesIO(response.content))
w, h = img.size
img = img.crop((0, 0, w, h-10))
# img = img.convert('RGB')
# content = np.array(img)[..., ::-1]
temp = io.BytesIO()
# content = Image.fromarray(content)
img.save(temp, format="png")
content = temp.getvalue()
return upload(content, img_type=img_type)
except:
return None
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment