Commit 9343b777 authored by zhongshangwu

Import Weibo data into the database

parent f6a5d5d3
""" 微博帖子入库榜单脚本 """
import time
import os
import re
from datetime import datetime
import json
import requests
from random import randint
from django.core.management import BaseCommand
from engine.rpc import rpc_invoker
from api.utils.upload import upload_weibo_image
from api.cache.cache import ins_cache
from engine.logger import info_logger, error_logger, logging_exception
IMAGE_SUFFIX = '-w'
FILE_PATH = '/Users/zhongshangwu/workspace/gengmei/like/saturn/api/management/commands/'
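# NOTE: FILE_PATH is hardcoded to the author's local checkout; point it at your own data directory before running.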
# TODO
# 1. Crop and upload images
# 2. Filter second-level comments that carry images
# 3. Other filtering rules
#
class Command(BaseCommand):
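    # Bookkeeping: the *_faild_* lists collect entities whose RPC creation failed,
    # and `stats` records, per crawled Weibo post, which topics and comments were created.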
user_id_start = 241757306 # end 241806255
del_cache_keys = []
create_faild_topic_list = []
create_faild_pictorial_list = []
top_pictorial_error_comments = []
second_pictorial_error_comments = []
topic_error_comments = []
stats = {}
def get_random_user_id(self):
        # Randomly pick a shadow (sockpuppet) account user ID
# return 241759142
while True:
index = randint(0, 5000)
user_id = self.user_id_start + index
data = rpc_invoker['venus/community/user/is_shadow'](user_id=user_id).unwrap()
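            # 'is_shadow' presumably returns the user record only when the id belongs to a shadow (sockpuppet) account; keep retrying until one is found.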
if not data:
continue
ret = data.get('user_id')
if ret:
return user_id
def get_user_id(self, weibo_id, weibo_user_id, platform):
""" 同一条博文下面的 “关系” 需要平移过来
通过 Redis Hash 维持用户关系: key(weibo_user_id) --> value(internal_user_id)
"""
weibo_cache_key = "grap:{0}:{1}".format(platform, weibo_id)
user_id = ins_cache.hget(weibo_cache_key, weibo_user_id)
if not user_id:
user_id = self.get_random_user_id()
ins_cache.hset(weibo_cache_key, weibo_user_id, user_id)
else:
user_id = int(user_id)
return user_id
def get_weibo_id(self, content):
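        # Despite the name, this returns the Weibo *user* id of the content's author.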
return content.get('user', {}).get('id')
def get_json_data_from_dir(self, is_topic=None, is_pictorial=None):
        # Load JSON data files from the given directory
ret = []
if is_topic:
file_path = FILE_PATH + 'topic/'
if is_pictorial:
file_path = FILE_PATH + 'pictorial/'
filenames = []
for _, _, names in os.walk(file_path):
filenames = names
for filename in filenames:
ret.append(self.get_file_json_data(file_path + filename))
return ret
def get_file_json_data(self, file):
        # Load JSON data from a single file
data = None
with open(file, 'r') as f:
content = f.read()
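            # Some crawled files start with a UTF-8 BOM; strip it so json.loads can parse the content.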
if content.startswith(u'\ufeff'):
content = content.encode('utf8')[3:].decode('utf8')
data = json.loads(content)
return data
def get_image_size(self, image_url):
        # Get the image width and height
try:
url = image_url + IMAGE_SUFFIX + '?imageInfo'
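            # '?imageInfo' asks the image CDN (presumably a Qiniu-style interface) for metadata such as width and height.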
response = requests.request("GET", url)
info = response.json()
return info.get('width'), info.get('height')
except Exception as e:
logging_exception()
return None, None
def image_info(self, urls):
        # Upload each image and collect its URL and dimensions
ret = []
for url in urls:
image_url = upload_weibo_image(url)
while not image_url:
image_url = upload_weibo_image(url)
width, height = self.get_image_size(image_url)
while not width and not height:
width, height = self.get_image_size(image_url)
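            # NOTE: both retry loops above spin until the upload and the size lookup succeed; there is no retry cap or backoff.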
print(image_url)
ret.append(
{
'url': image_url.replace('http://alpha.gmeiapp.com/', ''),
'height': height,
'width': width,
}
)
return ret
def revise_comments(self, weibo_id, comment, platform):
""" 拆分 一级和二级评论 """
replies = []
        if self.filter_first_comment(comment["content"], comment["images"], comment.get("reply")):
return None, []
reply = comment.pop('reply', None)
comment["user_id"] = self.get_user_id(weibo_id, self.get_weibo_id(comment), platform)
comment.pop("user", None)
if not reply:
return comment, replies
images = []
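        # A first-level comment without images inherits the images common to all of its replies.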
if not comment["images"]:
normal_images = set(reply[0]["images"])
for info in reply[1:]:
info_images = set(info["images"])
normal_images = normal_images & info_images
comment["images"] = list(normal_images)
for info in reply:
if self.filter_second_comment(info["content"], comment["images"], info["images"]):
continue
            info['reply_id'] = comment.get('id')  # the Weibo parent-comment id, used to find the corresponding parent comment created on our platform
info['user_id'] = self.get_user_id(weibo_id, self.get_weibo_id(info), platform)
info.pop("user", None)
info.pop("images", None) # 二级评论不需要带图
replies.append(info)
return comment, replies
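    # Expected shape of a crawled post, inferred from the field accesses in this script
    # (the crawler's actual output may carry more keys):
    #   {"id": ..., "content": ..., "images": [...], "user": {"id": ...},
    #    "comments": [{"id": ..., "content": ..., "images": [...], "create_time": ...,
    #                  "user": {"id": ...}, "reply": [...]}, ...]}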
def create_pictorial(self, pictorial, platform):
""" 创建榜单内容 """
pictorial["content"] = pictorial["content"].replace("#", " ")
if self.filter_weibo(pictorial["content"], pictorial["images"]):
return None
weibo_id = pictorial.get('id')
pictorial['user_id'] = self.get_user_id(
weibo_id=weibo_id,
weibo_user_id=self.get_weibo_id(pictorial),
platform=platform
)
# print("Pictorial user id:", pictorial['user_id'])
        # The pictorial name is the first 20 characters of the crawled content
        pictorial['name'] = pictorial.get('content')[:20]
pictorial['description'] = pictorial.get('content')
        weibo_comments = pictorial.pop('comments', None)  # --> the Weibo post's comments
        topics = []  # first-level comments with images become internal topics (posts)
        topic_count = 0
        pictorial_comments = []  # first-level comments without images become pictorial comments
        first_pictorial_comments = 0
        # Create the pictorial via RPC
pictorial_obj = rpc_invoker['venus/community/crawl/pictorial'](data=pictorial, platform=platform).unwrap()
if not pictorial_obj:
self.create_faild_pictorial_list.append(pictorial)
return None
pictorial_id = pictorial_obj.get('id')
self.stats[weibo_id] = {
"topics": {},
"first_comments": {}
}
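        # stats[weibo_id] tracks the reply ids attached to each created topic and each first-level pictorial comment.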
        # Process the Weibo post's comments
for idx, weibo_comment in enumerate(self.filter_duplicate_comments(weibo_comments)):
user_id = self.get_user_id(
weibo_id=weibo_id,
weibo_user_id=self.get_weibo_id(weibo_comment),
platform=platform
)
comment, replies = self.revise_comments(weibo_id, weibo_comment, platform)
if not comment:
continue
if comment["images"]: # -> to topic
images = self.image_info(comment.pop('images'))
topic = {
# 'id': pictorial_id + str(idx),
'id': comment['id'],
'content': comment.get('content', ''),
'images': images,
'create_time': comment.get('create_time'),
'user_id': user_id,
}
topic_obj = rpc_invoker['venus/community/crawl/topic'](data=topic, platform=platform, pictorial_id=pictorial_id).unwrap()
if not topic_obj:
self.create_faild_topic_list.append(topic.get('id'))
else:
                    # Create the topic's comments
# for topic_coment in replies:
# topic_coment["topic_id"] = topic_obj.get("id")
self.stats[weibo_id]["topics"][comment['id']] = {
"reply": []
}
replies = replies[:50]
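                    # Only the first 50 replies are sent, presumably to keep the RPC payload bounded.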
ret = rpc_invoker['venus/community/crawl/replys'](data=replies, platform=platform, topic_id=topic_obj.get('id'), pictorial_id=None).unwrap()
if not ret:
self.topic_error_comments.extend(replies)
else:
self.stats[weibo_id]["topics"][comment['id']] = {
"reply": [item["id"] for item in replies]
}
else: # -> to pictorial comment
top_comments_obj = rpc_invoker['venus/community/crawl/replys'](data=[comment], platform=platform, pictorial_id=pictorial_id).unwrap()
                if not top_comments_obj or not top_comments_obj.get("reply_ids"):
self.top_pictorial_error_comments.append(comment)
else:
                    # Create the pictorial's second-level comments
self.stats[weibo_id]["first_comments"][comment['id']] = {
"reply": []
}
replies = replies[:50]
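                    # Attach each second-level reply to the first-level pictorial comment just created, then send them in one batch.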
for reply in replies:
reply["top_id"] = top_comments_obj.get("reply_ids")[0]
ret = rpc_invoker['venus/community/crawl/replys'](data=replies, platform=platform, topic_id=None, pictorial_id=pictorial_id).unwrap()
if not ret:
self.second_pictorial_error_comments.extend(replies)
else:
self.stats[weibo_id]["first_comments"][comment['id']] = {
"reply": [item["id"] for item in replies]
}
def filter_duplicate_comments(self, comments):
""" 过滤重复的评论 """
exists_comment = set()
ret = []
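        # Walk the list in reverse so the last occurrence of a duplicate id wins, then restore the original order.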
for comment in comments[::-1]:
if comment["id"] not in exists_comment:
exists_comment.add(comment["id"])
ret.append(comment)
return ret[::-1]
def filter_weibo(self, content, images):
""" 过滤 """
return (
self.filter_normal(content)
or self.filter_gif_images(images)
)
def filter_first_comment(self, content, images, replies):
""" 过滤一级评论 """
return (
self.filter_normal(content)
or self.filter_gif_images(images)
or not replies
)
def filter_second_comment(self, content, first_images, images):
""" 过滤二级评论 """
return (
self.filter_normal(content)
or self.filter_second_images(first_images, images)
)
def filter_gif_images(self, images):
for image in images:
if ".gif" in image:
return True
return False
def filter_second_images(self, first_images, images):
""" 过滤二级评论的图片 """
first_images = set(first_images or [])
images = set(images or [])
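        # Truthy (i.e. filtered out) when the reply carries any image that the first-level comment does not have.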
return images - first_images
def filter_normal(self, content):
if "@" in content:
return True
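        # "榜姐" is presumably the Weibo account hosting these threads; "渣浪" is net slang for Sina (Weibo).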
if "榜姐" in content or "渣浪" in content or "微博" in content:
return True
if "收起全文" in content:
return True
if "http://" in content or "https://" in content:
return True
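        # Finally, drop anything that still contains a Weibo hashtag (#...#).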
regex = re.compile("#[^#]+#")
return regex.search(content)
def handle(self, *args, **options):
# print(upload_weibo_image("http://wx2.sinaimg.cn/woriginal/bbeff396ly1fxqi1wl2hqj20go0bvabl.jpg"))
        platform = 5  # Weibo
        # Pictorials
print('----- start deal weibo blog at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
pictorial_data = self.get_json_data_from_dir(is_pictorial=1)
for idx, pictorial in enumerate(pictorial_data):
print('------- current pictorial idx :', idx)
start = time.time()
ret = self.create_pictorial(pictorial=pictorial, platform=platform)
print("escape:", time.time() - start)
if not ret:
print("create failed weibo blog:", pictorial["id"])
import pprint
pprint.pprint(self.stats)
print('-------- create_faild_topic_list:', len(self.create_faild_topic_list))
print('-------- topic_error_comments:', len(self.topic_error_comments))
print('-------- create_faild_pictorial_list:', len(self.create_faild_pictorial_list))
print('-------- top_pictorial_error_comments:', len(self.top_pictorial_error_comments))
print('-------- second_pictorial_error_comments:', len(self.second_pictorial_error_comments))
api/utils/upload.py
import requests
from gm_upload import upload, upload_file
from gm_upload import IMG_TYPE
import io
from PIL import Image
# import numpy as np
def upload_image(url, img_type=IMG_TYPE.TOPIC):
@@ -10,3 +13,24 @@ def upload_image(url, img_type=IMG_TYPE.TOPIC):
return upload(response.content, img_type=img_type)
except:
return None
def upload_weibo_image(url, img_type=IMG_TYPE.TOPIC):
    '''Handle images hosted outside our site'''
try:
response = requests.get(url)
img = Image.open(io.BytesIO(response.content))
w, h = img.size
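        # Crop 10px off the bottom, presumably to remove the Weibo watermark strip, then re-encode as PNG.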
img = img.crop((0, 0, w, h-10))
# img = img.convert('RGB')
# content = np.array(img)[..., ::-1]
temp = io.BytesIO()
# content = Image.fromarray(content)
img.save(temp, format="png")
content = temp.getvalue()
return upload(content, img_type=img_type)
except:
return None