# -*- coding: utf-8 -*-
# @Time   : 2020/7/22 14:42
# @File   : crawler_weibo.py
# @email  : litao@igengmei.com
# @author : litao


import os
import copy
import requests
import re
import datetime, time
import json
# import aiohttp
import random
# from bs4 import BeautifulSoup
# from multiprocessing import Pool
# from multiprocessing import Process
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
# from crawler.crawler_sys.utils import connect_with_redis
# from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils.util_logging import logged
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml

from write_data_into_es.func_get_releaser_id import *


class Crawler_weibo():
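    """Weibo crawler that walks a releaser's (user's) post list through the m.weibo.cn mobile API."""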
    def __init__(self, timeout=None, platform='weibo'):
        if timeout is None:
            self.timeout = 10
        else:
            self.timeout = timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['describe', 'repost_count', 'isOriginal',
                       'video_id']
        for popk in pop_key_Lst:
            self.video_data.pop(popk)

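    # Pull the 496x280 cover image URL out of a video card payload.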
    @staticmethod
    def get_video_image(data):
        video_photo_url = data["pic_496x280"]
        return video_photo_url

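    # Fetch the m.weibo.cn detail page of a single status (mid) and parse the
    # embedded render_data JSON for the full text and interaction counts.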
    @staticmethod
    def get_single_page(mid):
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=68345544646; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=fd1a69; M_WEIBOCN_PARAMS=oid%3D4523948446845543%26luicode%3D20000061%26lfid%3D4528703037509890%26uicode%3D20000061%26fid%3D4523948446845543",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        page_json = json.loads(page_json_context)
        text = dehtml(page_json[0]["status"]["text"])
        repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
        comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
        favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

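    # Resolve the numeric releaser (user) id from a Weibo profile URL.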
    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    # def get_weibo_info(self,releaser_id):
    #     url = "https://m.weibo.cn/api/config"
    #     headers = {
    #         "accept": "application/json, text/plain, */*",
    #         "accept-encoding": "gzip, deflate, br",
    #         "accept-language": "zh-CN,zh;q=0.9",
    #         # "cookie": "_T_WM=30976479190; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=ce3c56; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011",
    #         "mweibo-pwa": "1",
    #         "referer": "https://m.weibo.cn/u/{0}?uid={1}&t=0".format(releaser_id,releaser_id),
    #         "sec-fetch-dest": "empty",
    #         "sec-fetch-mode": "cors",
    #         "sec-fetch-site": "same-origin",
    #         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
    #         "x-requested-with": "XMLHttpRequest",
    #         # "x-xsrf-token": "ce3c56",
    #     }
    #     requests_res = retry_get_url(url,headers=headers)
    #     res_json = requests_res.json()
    #     xsrf_token = res_json["data"]["st"]
    #     url_extr = res_json["data"]["loginUrl"].split(releaser_id+"%26")
    #     set_cookies = requests_res.headers.get()
    #     return xsrf_token,url_extr
    # @logged
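    # Page through a releaser's posts via the m.weibo.cn container API,
    # following the since_id cursor, and yield one result dict per post.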
    def releaser_page(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None,proxies_num=None):
        print('Processing releaserUrl %s' % releaserUrl)
        result_Lst = []
        releaser_id = self.get_releaser_id(releaserUrl)
        # xsrf_token,url_extr = self.get_weibo_info(releaser_id)
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=30976479190; XSRF-TOKEN=9e4bb8; WEIBOCN_FROM=1110006030; MLOGIN=0; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011",
            "mweibo-pwa": "1",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
            # "x-xsrf-token": xsrf_token,
        }
        pagenum = 0
        has_more = True
        since_id = 0
        if releaser_id is not None:
            while pagenum <= releaser_page_num_max and has_more:
                pagenum += 1
                time.sleep(0.5)
                # example query string:
                # "?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429"
                url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(releaser_id, releaser_id, releaser_id, since_id)
                headers["referer"] = "https://m.weibo.cn/u/{0}?uid={0}&t=0".format(releaser_id)
                print('Page number: %d' % pagenum)
                try:
                    if proxies_num:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num)
                    else:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout)
                except Exception:
                    get_page = None
                    has_more = False
                if get_page is not None and get_page.status_code == 200:
                    try:
                        page_json = get_page.json()
                        total = page_json["data"]["cardlistInfo"]["total"]
                        if pagenum > total:
                            break
                        since_id = page_json["data"]["cardlistInfo"]["since_id"]
                        page_dic = page_json["data"].get("cards")
                    except Exception as e:
                        print("load data error %s" % e)
                        continue

                    if page_dic:
                        for one in page_dic:
                            try:
                                mblog = one.get("mblog")
                                mid = mblog.get("mid")
                                forward_text = ""
                                forward_user = ""
                                article_type = ""
                                if one.get("source") == "绿洲":
                                    text_type = "绿洲"
                                elif mblog.get("retweeted_status"):
                                    text_type = "转发"
                                    forward_text = mblog.get("retweeted_status").get("raw_text")
                                    forward_user = mblog.get("retweeted_status").get("user").get("screen_name")
                                else:
                                    text_type = one.get("source")
                                if mblog.get("isLongText"):
                                    text,repost_count,comment_count,favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]

                                if mblog.get("page_info"):
                                    article_type = mblog.get("page_info").get("type")

                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "url": one["scheme"],
                                    "releaser": mblog["user"]["screen_name"],
                                    "repost_count": trans_play_count(mblog["reposts_count"]),
                                    "comment_count": trans_play_count(mblog["comments_count"]),
                                    "favorite_count": trans_play_count(mblog["attitudes_count"]),
                                    "title": text.replace("\u200b", ""),
                                    "wb_type": text_type,
                                    "forward_user": forward_user,
                                    "forward_text": forward_text,
                                    "mid": mid,
                                    "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
                                    "releaser_id_str": "weibo_%s" % releaser_id,
                                    "platform": "weibo",
                                    "article_type": article_type
                                }
                                # from write_data_into_es.func_cal_doc_id import cal_doc_id
                                # id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
                                #                 doc_id_type="all-time-url")
                                # print(id)
                                yield res_dic
                            except Exception as e:
                                print(mblog)
                                print("row format error %s" % e)
                                continue


    def get_releaser_follower_num(self, releaserUrl):
        pass

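    # Crawl a releaser page and keep posts whose release_time (millisecond
    # timestamp) falls within [start_time, end_time); flush results to
    # output_result in batches of 100, then dump any remainder to a CSV file.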
    def releaser_page_by_time(self, start_time, end_time, url,**kwargs):
        data_lis = []
        count_false = 0
        output_to_file = kwargs.get("output_to_file")
        filepath = kwargs.get("filepath")
        push_to_redis = kwargs.get("push_to_redis")
        output_to_es_register = kwargs.get("output_to_es_register")
        output_to_es_raw = kwargs.get("output_to_es_raw")
        es_index = kwargs.get("es_index")
        for res in self.releaser_page(url,proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            # print(res)
            if video_time:
                if start_time <= video_time:
                    if video_time < end_time:
                        try:
                            # res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')
                            res["release_time"] = datetime.datetime.fromtimestamp(res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S')

                        except Exception:
                            pass
                        data_lis.append(res)

                        if len(data_lis) >= 100:
                            output_result(result_Lst=data_lis,
                                          platform=self.platform,
                                          output_to_file=output_to_file,
                                          filepath=filepath,
                                          push_to_redis=push_to_redis,
                                          output_to_es_register=output_to_es_register,
                                          output_to_es_raw=output_to_es_raw,
                                          es_index=es_index,
                                       )
                            data_lis.clear()
                else:
                    count_false += 1
                    if count_false > 10:
                        break
                    else:
                        continue

        # if data_lis != []:
        #     output_result(result_Lst=data_lis,
        #                   platform=self.platform,
        #                   output_to_file=output_to_file,
        #                   filepath=filepath,
        #                   push_to_redis=push_to_redis,
        #                   output_to_es_register=output_to_es_register,
        #                   output_to_es_raw=output_to_es_raw,
        #                   es_index=es_index,
        #                   )
        # dump any remaining rows to a timestamped CSV file
        import pandas as pd
        data = pd.DataFrame(data_lis)
        s = datetime.datetime.now()
        ss = str(s)[0:19].replace(' ', '-').replace(':', '-')
        data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030',
                    # columns=columns
                    )
        data_lis.clear()


if __name__ == '__main__':
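    # Manual test: crawl each releaser URL for posts inside the given
    # millisecond time window and register the results into ES.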
    test = Crawler_weibo()
    url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place'
    # releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
    url_list = [
        # "https://weibo.com/u/1764615662",
        # "https://weibo.com/u/3662247177",
        # "https://weibo.com/u/2378564111",
        # "https://weibo.com/u/2983578965",
        # "https://weibo.com/u/3938976579",
        # "https://weibo.com/u/6511177474",
        # "https://weibo.com/u/6343916471",
        # "https://weibo.com/u/6511177474",
        # "https://weibo.com/u/2921603920",
        # "https://weibo.com/u/6470919752",
        # "https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1",
        # "https://weibo.com/u/3115996363?is_hot=1",
        # "https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place",
        # "https://weibo.com/u/3926129482",
        # "https://weibo.com/u/5509337969?is_hot=1",
        # "https://weibo.com/u/5477320351",
        # "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place",
        "https://weibo.com/u/6511173721",
        # "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",

    ]
    # res = test.releaser_page(url, output_to_es_raw=True,
    #                     es_index='crawler-data-raw',
    #                      releaser_page_num_max=400,proxies_num=0)
    # for r in res:
    #     print(r)
    for u in url_list:
        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                   es_index='crawler-data-raw',
                                   doc_type='doc', releaser_page_num_max=4000)
    # test.get_single_page(4524055937468233)