fix

1137e1c5 · haowang · 4155d54d · 4155d54d
Commit 1137e1c5 authored Aug 13, 2019 by haowang
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 256 deletions

reply_crawl.py api/utils/reply_crawl.py +0 -256

No files found.
--- a/api/utils/reply_crawl.py
+++ b/api/utils/reply_crawl.py
-# 读取json文件 调用api 将数据导入库
-import os
-from datetime import datetime
-import json
-import requests
-from random import randint
-from api.views.base_view import BaseView
-from api.utils.sensitive import Sensitive
-from api.utils.upload import upload_image
-from api.cache.cache import ins_cache
-
-from libs.user import get_user_by_ids
-from alpha_types.venus import ERROR
-from alpha_types.venus import GRAP_PLATFORM
-from engine.logger import info_logger, error_logger, logging_exception
-
-
-# Root directory holding the crawled JSON dumps; get_json_data_from_dir
-# appends 'topic/' or 'pictorial/' to it.
-# NOTE(review): this is an absolute path on one developer machine.
-FILE_PATH = '/Users/haowei/Desktop/xhs/'
-# Pictorial-create endpoints. The names suggest prod / test / local targets
-# (presumably "BENDI" is pinyin for "local" - confirm).
-# NOTE(review): none of these three constants is referenced in this file.
-API_URL_PROD = 'http://saturn.iyanzhi.com/api/v1/pictorial/create'
-API_URL_TEST = 'http://saturn.gmapp.env/api/v1/pictorial/create'
-API_URL_BENDI = 'http://127.0.0.1:8004/api/v1/pictorial/create'
-
-
-class ReplyBatchCreate(BaseView):
-    """Batch importer for crawled topics, pictorials and their comments.
-
-    Reads crawled JSON files below ``FILE_PATH``, maps crawled ids onto
-    internal user ids (remembered through ``ins_cache``) and submits the
-    records through ``call_rpc`` to the ``venus/community/crawl/*`` endpoints.
-    """
-
-    # First id of the candidate user-id range (original note: end 241806255).
-    user_id_start = 241757306
-    # Maximum attempts for each retried step in image_info.
-    max_retries = 3
-    # NOTE(review): the lists below are class attributes, so they are shared
-    # by every instance of this class.
-    del_cache_keys = []
-    first_replys = []
-    second_replys = []
-    create_faild_topic_list = []
-    total_topic_comments = []
-    total_pictorial_comments = []
-
-    def del_cache(self):
-        """Delete every cache key recorded in ``del_cache_keys``, then forget them."""
-        for key in self.del_cache_keys:
-            ins_cache.delete(key)
-        # Clear in place so later calls do not delete the same keys again.
-        del self.del_cache_keys[:]
-
-    def get_random_user_id(self):
-        """Return a random user id that the ``is_shadow`` RPC confirms.
-
-        Draws ids from ``[user_id_start, user_id_start + 5000]`` until the
-        RPC response carries a truthy ``user_id``.
-
-        NOTE(review): the RPC error value is ignored; if a failed call
-        returns no payload, ``data.get`` raises AttributeError.
-        """
-        while True:
-            user_id = self.user_id_start + randint(0, 5000)
-            _, data = self.call_rpc('venus/community/user/is_shadow', user_id=user_id)
-            if data.get('user_id'):
-                return user_id
-
-    def get_user_id(self, id_, platform):
-        """Return the internal user id mapped to crawled id ``id_``.
-
-        The mapping ``grap:<platform>:<id_> -> user_id`` is stored in
-        ``ins_cache``. On a miss, a random user id is picked that is not yet
-        claimed on this platform, and the reverse key
-        ``grap:<platform>:<user_id> -> id_`` is written to mark it as taken.
-
-        NOTE(review): forward and reverse keys share one key format, so a
-        crawled id that equals a user id would collide.
-        """
-        cache_key = 'grap:{}:{}'.format(platform, id_)
-        cached = ins_cache.get(cache_key)
-        if cached:
-            return int(cached)
-
-        while True:
-            user_id = self.get_random_user_id()
-            reverse_key = 'grap:{}:{}'.format(platform, user_id)
-            if not ins_cache.get(reverse_key):
-                ins_cache.set(reverse_key, id_)
-                self.del_cache_keys.append(reverse_key)
-                break
-        ins_cache.set(cache_key, user_id)
-        # NOTE(review): the original appended the reverse key a second time
-        # here. The duplicate is dropped; confirm whether ``cache_key`` was
-        # meant to be registered for deletion instead.
-        return user_id
-
-    def get_json_data_from_dir(self, is_topic=None, is_pictorial=None):
-        """Load every file directly inside the topic or pictorial directory.
-
-        :param is_topic: truthy to read ``FILE_PATH + 'topic/'``.
-        :param is_pictorial: truthy to read ``FILE_PATH + 'pictorial/'``;
-            takes precedence when both flags are set.
-        :returns: list with the parsed JSON content of each file.
-        :raises ValueError: if neither flag is set.
-        """
-        if is_pictorial:
-            file_path = FILE_PATH + 'pictorial/'
-        elif is_topic:
-            file_path = FILE_PATH + 'topic/'
-        else:
-            raise ValueError('either is_topic or is_pictorial must be set')
-
-        # Only the first os.walk entry (the directory itself) is used: file
-        # names from nested directories would not resolve against file_path.
-        _, _, filenames = next(os.walk(file_path), (file_path, [], []))
-        return [self.get_file_json_data(file_path + name) for name in filenames]
-
-    def get_file_json_data(self, file):
-        """Read ``file`` and return its parsed JSON content.
-
-        A leading byte-order mark (U+FEFF) is stripped before parsing.
-        """
-        with open(file, 'r') as f:
-            content = f.read()
-        if content.startswith(u'\ufeff'):
-            content = content[1:]
-        return json.loads(content)
-
-    def get_image_size(self, image_url):
-        """Return ``(width, height)`` reported by the ``?imageInfo`` endpoint.
-
-        Returns ``(None, None)`` when the lookup fails, so callers can always
-        unpack the result; the exception is logged via ``logging_exception``.
-        """
-        try:
-            # NOTE(review): IMAGE_SUFFIX is not defined or imported in this
-            # file, so this line raises NameError until it is provided.
-            url = image_url + IMAGE_SUFFIX + '?imageInfo'
-            # Timeout keeps a stalled image host from blocking the import.
-            response = requests.request("GET", url, timeout=10)
-            info = response.json()
-            return info.get('width'), info.get('height')
-        except Exception:
-            logging_exception()
-            return None, None
-
-    def image_info(self, urls):
-        """Upload each source image and collect its stored path and size.
-
-        :param urls: iterable of source image URLs.
-        :returns: list of ``{'url', 'height', 'width'}`` dicts, where ``url``
-            has the ``http://alpha.gmeiapp.com/`` prefix removed.
-        :raises RuntimeError: if the upload or the size lookup still fails
-            after ``max_retries`` attempts.
-        """
-        ret = []
-        for url in urls:
-            image_url = None
-            for _ in range(self.max_retries):
-                image_url = upload_image(url)
-                if image_url:
-                    break
-            if not image_url:
-                raise RuntimeError('upload_image failed for {}'.format(url))
-
-            width = height = None
-            for _ in range(self.max_retries):
-                width, height = self.get_image_size(image_url)
-                if width or height:
-                    break
-            if not (width or height):
-                raise RuntimeError('image size lookup failed for {}'.format(image_url))
-
-            ret.append(
-                {
-                    'url': image_url.replace('http://alpha.gmeiapp.com/', ''),
-                    'height': height,
-                    'width': width,
-                }
-            )
-        return ret
-
-    def revise_comments(self, comment, from_id):
-        """Split a crawled comment into the top-level comment and its replies.
-
-        Mutates ``comment`` in place: sets ``from_id``, copies ``comment`` to
-        ``content`` and removes ``reply``. Each reply receives the parent's
-        ``from_id``, the parent's ``id`` as ``reply_id`` and the parent's
-        ``type``.
-
-        :returns: ``(comment, replies)``; ``replies`` is an empty list when
-            there are none.
-        """
-        comment['from_id'] = from_id
-        comment['content'] = comment.get('comment')
-        replies = []
-        for info in comment.pop('reply', None) or []:
-            info['from_id'] = comment.get('from_id')
-            info['reply_id'] = comment.get('id')
-            info['type'] = comment.get('type')
-            replies.append(info)
-        return comment, replies
-
-    def create_comment(self, comment_list, from_id, platform, topic_id=None, pictorial_id=None):
-        """Create one top-level comment and then its child replies via RPC.
-
-        :param comment_list: a single crawled comment dict (it is passed to
-            ``revise_comments`` as one comment, despite the name).
-        :returns: ``(error, payload)`` if the first RPC fails,
-            ``(error, None)`` if the second fails, otherwise ``(None, None)``.
-        """
-        top_comment, comments = self.revise_comments(comment_list, from_id)
-        top_comment['user_id'] = self.get_user_id(id_=top_comment.get('user').get('id'), platform=platform)
-        top_comment.pop('user')
-        error, ret = self.call_rpc(
-            'venus/community/crawl/replys',
-            data=[top_comment], platform=platform,
-            topic_id=topic_id, pictorial_id=pictorial_id,
-        )
-        if error:
-            return error, ret
-        top_id = ret.get('reply_ids')[0]
-
-        # Nothing to attach below the top-level comment: skip the second RPC.
-        if not comments:
-            return None, None
-
-        for obj in comments:
-            obj['user_id'] = self.get_user_id(id_=obj.get('user').get('id'), platform=platform)
-            obj.pop('user')
-        error, _ = self.call_rpc(
-            'venus/community/crawl/replys',
-            data=comments, platform=platform,
-            topic_id=topic_id, pictorial_id=pictorial_id, top_id=top_id,
-        )
-        if error:
-            return error, None
-        return None, None
-
-    def create_topic(self, topics, platform):
-        """Create each crawled topic via RPC and queue its comments.
-
-        Ids of topics whose RPC fails are appended to
-        ``create_faild_topic_list``. Comments are only queued (in
-        ``total_topic_comments``) for the XIAOHONGSHU platform.
-
-        :returns: always ``(None, None)``.
-        """
-        for topic in topics:
-            topic_comments = topic.pop('comments', None)
-            # A topic without an 'image' key is treated as having no images.
-            topic['images'] = self.image_info(topic.pop('image', None) or [])
-            topic['user_id'] = self.get_user_id(id_=topic.get('id'), platform=platform)
-            error, topic_obj = self.call_rpc(
-                'venus/community/crawl/topic',
-                data=topic, platform=platform, pictorial_id=None,
-            )
-            if error:
-                print(error, topic.get('id'))
-                self.create_faild_topic_list.append(topic.get('id'))
-                continue
-            if not topic_comments or platform != GRAP_PLATFORM.XIAOHONGSHU:
-                continue
-            for topic_comment in topic_comments:
-                topic_comment['topic_id'] = topic_obj.get('id')
-            self.total_topic_comments.append(topic_comments)
-        return None, None
-
-    def create_pictorial(self, pictorial, platform):
-        """Create a pictorial plus one topic per image, and queue its comments.
-
-        The pictorial name is the first 20 characters of its content.
-        Comments are only queued (in ``total_pictorial_comments``) for the
-        XIAOHONGSHU platform.
-
-        :returns: ``(error, None)`` on the first failing RPC, otherwise
-            ``(None, None)``.
-        """
-        if not pictorial:
-            return None, None
-        pictorial_comments = pictorial.pop('comments', None)
-        images = self.image_info(pictorial.pop('image', None) or [])
-        content = pictorial.get('content')
-
-        topics = []
-        for index, image in enumerate(images, 1):
-            topics.append({
-                # format() also works when the crawled id is not a string.
-                'id': '{}{}'.format(pictorial.get('id'), index),
-                'content': content,
-                'images': [image],
-                'create_time': pictorial.get('create_time'),
-                'user_id': self.get_user_id(id_=image.get('url'), platform=platform),
-            })
-
-        pictorial['user_id'] = self.get_user_id(id_=pictorial.get('id'), platform=platform)
-        pictorial['description'] = content
-        # Slicing already copes with content shorter than 20 characters; the
-        # previous manual bound dropped the last character in that case.
-        pictorial['name'] = (content or '')[:20]
-
-        error, pictorial_obj = self.call_rpc('venus/community/crawl/pictorial', data=pictorial, platform=platform)
-        if error:
-            return error, None
-        pictorial_id = pictorial_obj.get('id')
-
-        for obj in topics:
-            error, _ = self.call_rpc(
-                'venus/community/crawl/topic',
-                data=obj, platform=platform, pictorial_id=pictorial_id,
-            )
-            if error:
-                return error, None
-
-        if pictorial_comments and platform == GRAP_PLATFORM.XIAOHONGSHU:
-            for pictorial_comment in pictorial_comments:
-                pictorial_comment['pictorial_id'] = pictorial_id
-            self.total_pictorial_comments.append(pictorial_comments)
-
-        return None, None
-
-    def process(self):
-        """Entry point: load the crawled topic files and print them.
-
-        The actual import calls are currently disabled, so this only dumps
-        the loaded data, clears the recorded cache keys and returns
-        ``self.ok()``.
-        """
-        platform = 4  # NOTE(review): presumably a GRAP_PLATFORM value - confirm
-
-        # Topics
-        print('----- start deal topic at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
-        topic_data = self.get_json_data_from_dir(is_topic=1)
-        for info in topic_data:
-            print(info)
-            # TODO: re-enable the import once verified:
-            # self.create_topic(topics=info, platform=platform)
-            # self.del_cache()
-        print('----- end deal topic at {} -----'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')))
-
-        # Pictorials
-        # TODO: the pictorial import is disabled; it would load
-        # get_json_data_from_dir(is_pictorial=1) and pass each entry to
-        # create_pictorial, calling del_cache and returning self.error(...)
-        # on the first failure.
-
-        self.del_cache()
-        return self.ok()
-
-
-if __name__ == "__main__":
-    # Script entry point: build the importer and run it once.
-    ReplyBatchCreate().process()