crawler_haokan.py 12.1 KB
# -*- coding:utf-8 -*-
# @Time : 2020/2/26 11:40
# @Author : litao

import requests
import json, re, datetime, urllib
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import hot_words_output_result, output_result
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp
from write_data_into_es.func_cal_doc_id import *
from crawler.crawler_sys.site_crawler.crawler_haokan import Crawler_haokan
crawler_video_page = Crawler_haokan().video_page


class CrawlerHaoKan(object):
    def __init__(self):
        self.platform = "haokan"
        self.headers = {
                "Host": "sv.baidu.com",
                "Connection": "keep-alive",

                "Charset": "UTF-8",
                "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 haokan/5.9.2.10 (Baidu; P1 5.1.1)/OPPO_22_1.1.5_11R+OPPO/1022131c/3B42DEA1B123E0BFCC96D85E1E191EB1%7C0/1/5.9.2.10/509021/1",
                "X-Bfe-Quic": "enable=1",
                # "XRAY-REQ-FUNC-ST-DNS": "okHttp;1582687757091;0",
                # "XRAY-TRACEID": "58f10e39-772a-42b0-bed2-451038d27de4",
                # "Cookie": "BAIDUID=E577F98F951CE0989D45142695B6CE78:FG=1; FEED_VIDS=8633+8523+6577+3630; FEED_TAB=recommend; BAIDUZID=FFD42183BD34A7D8D951D8D356B53F7BBC; BAIDUCUID=_82ZiliKS8lNav8m0aHRuliP-i0EOvatgiv6fg8kSiKoLqqqB",
                "Content-Type": "application/x-www-form-urlencoded",
                "Accept-Encoding": "gzip, deflate",

        }
        self.get_hot_page_videos()

    def get_hot_words(self):
        bulk_list = []
        timestamp = int(datetime.datetime.now().timestamp())
        url_dic = {
                "log": "vhk",
                "tn": "1022131c",
                "ctn": "1022131c",
                # "mac": "48:A4:72:58:86:D5",
                # "imei": "866174725888628",
                # "cuid": "3B42DEA1B123E0BFCC96D85E1E191EB1|0",
                "bdboxcuid": "",
                # "c3_aid": "A00-GH4F2VNIUV7SHQU3HMUUFTLTKSN3IUAD-ZURJH6Y5",
                "os": "android",
                "osbranch": "a0",
                "ua": "900_1600_320",
                "ut": "OPPO R11_5.1.1_22_OPPO",
                "uh": "OPPO,qcom,sdm660",
                "apiv": "5.9.2.10",
                "appv": "509021",
                "version": "5.9.2.10",
                "life": timestamp,
                "clife": timestamp,
                # "hid": "3B691F5D047A9200FADD7D5BA67D1B78",
                "imsi": "0",
                "network": "1",
                # "location": r"{%22prov%22:%22%22,%22city%22:%22%22,%22county%22:%22%22,%22street%22:%22%22,%22latitude%22:30.004828,%22longitude%22:112.575499}",
                "sids": "5155_2",
                "young_mode": "0",
        }
        post_dic = {
                "search/presug": "method=get"
        }
        url = "https://sv.baidu.com/haokan/api?%s" % urllib.parse.urlencode(url_dic)
        res = requests.post(url, data=post_dic, headers=self.headers)
        res_json = res.json()
        for data_list in res_json["search/presug"]["data"]:
            for data in data_list["list"]:
                try:
                    dic = {
                            "platform": self.platform,
                            "title": data["title"],
                            "cmd": data["cmd"],
                            "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
                            "data_type": data['tplName']
                    }
                    bulk_list.append(dic)
                except:
                    continue
        hot_words_output_result(bulk_list)
        return True

    def search_hottopic(self,title=None,url="",channel="",max_page=2,**kwargs):
        """
        GET https://haokan.baidu.com/creator/haokantopic/topichomepage/5875485993792393513&sfrom=inside-souqianyeHuaTi?id=5875485993792393513&bfe=1 HTTP/1.1
        :param url: 
        :return: 
        """
        res_data_list= []
        url = urllib.parse.unquote(url)
        _id = re.findall("/(\d+)&", url)[0]
        print(url, _id)
        get_url = re.findall("url_key=(.*)",url)[0] +"?id=%s&bfe=1" %_id
        headers = {
                "Host": "haokan.baidu.com",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 haokan/5.9.2.10 (Baidu; P1 5.1.1)/OPPO_22_1.1.5_11R+OPPO/1022131c/3B42DEA1B123E0BFCC96D85E1E191EB1%7C0/1/5.9.2.10/509021/1",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                # "Cookie": "BAIDUCUID=_82ZiliKS8lNav8m0aHRuliP-i0EOvatgiv6fg8kSiKoLqqqB; Hm_lvt_77ca61e523cd51ec7ac7a23bc4d24edf=1582689757; BAIDUZID=xOiExF1dC2xpt9E7juy1SeKlPDFxsf9nSi9w-06_uAg6ipAHsDqeeoUY6agIwPr6xlNET9po9osj0tENM2pCIZQ; Hm_lpvt_77ca61e523cd51ec7ac7a23bc4d24edf=1582799564; BAIDUID=E577F98F951CE0989D45142695B6CE78:FG=1",
                "X-Requested-With": "com.baidu.haokan",
        }

        res = retry_get_url(get_url,proxies=3,headers=headers,timeout=10)

        res_json = json.loads(re.findall(r"__PRELOADED_STATE__ = (.*);[ ]+document", res.text)[0])
        # print(res_json)
        for data in res_json["listData"]["list"]:
            res_data = crawler_video_page("https://haokan.baidu.com/v?vid=%s"%data.get("vid"), vid=data.get("vid"))
            res_data["channel"] = channel
            res_data["is_hot"] = 0
            res_data_list.append(res_data)
        page = 2
        while page <= max_page:
            headers["Referer"] = get_url
            get_url = "https://haokan.baidu.com/creator/topicvlist?id=%s&tabType=hot&page=%s&size=10" %(_id,page)
            page += 1
            res = retry_get_url(get_url,proxies=3,headers=headers,timeout=10)
            res_json = res.json()
            for data in res_json["data"]["list"]:
                res_data = crawler_video_page("https://haokan.baidu.com/v?vid=%s" % data.get("vid"),
                                              vid=data.get("vid"))
                res_data["channel"] = channel
                res_data["is_hot"] = 0
                res_data_list.append(res_data)

        output_result(result_Lst=res_data_list,
                      platform=self.platform,
                      output_to_es_raw=True,
                      )
        res_data_list.clear()


    def search_video(self, title=None, max_page=2,**kwargs):
        # 搜索页
        page = 1
        bulk_all_body = ""
        res_data_list = []
        timestamp = int(datetime.datetime.now().timestamp())
        while page <= max_page:
            url_dic = {
                    "cmd": "search",
                    "log": "vhk",
                    "tn": "1022131c",
                    "ctn": "1022131c",
                    # "mac": "48:A4:72:58:86:D5",
                    # "imei": "866174725888628",
                    # "cuid": "3B42DEA1B123E0BFCC96D85E1E191EB1|0",
                    "bdboxcuid": "",
                    "c3_aid": "A00-GH4F2VNIUV7SHQU3HMUUFTLTKSN3IUAD-ZURJH6Y5",
                    "os": "android",
                    "osbranch": "a0",
                    "ua": "900_1600_320",
                    "ut": "OPPO%20R11_5.1.1_22_OPPO",
                    "uh": "OPPO,qcom,sdm660",
                    "apiv": "5.9.2.10",
                    "appv": "509021",
                    "version": "5.9.2.10",
                    "life": timestamp,
                    "clife": timestamp,
                    "hid": "3B691F5D047A9200FADD7D5BA67D1B78",
                    "imsi": "0",
                    "network": "1",
                    # "location": "{%22prov%22:%22%22,%22city%22:%22%22,%22county%22:%22%22,%22street%22:%22%22,%22latitude%22:30.004828,%22longitude%22:112.575499}",
                    "sids": "5155_2",
                    "young_mode": "0",

            }
            post_dic = {
                    "method": "get",
                    "tag": "rc",
                    "cursor_time": "0",
                    "cb_cursor": "0",
                    "hot_cursor": "0",
                    "offline_cursor": "0",
                    "rn": "10",
                    "pn": str(page),
                    "title": title,
                    "force": "0",
                    "needBjh": "1",
                    "long_video": "1",
                    "wordseg": "1",
                    "outpn": "0",
                    "innerpn": "1",

            }
            post_body = {
                    "search": urllib.parse.urlencode(post_dic)
            }
            requests_res = requests.post("https://sv.baidu.com/haokan/api?%s" % urllib.parse.urlencode(url_dic),
                                         data=post_body, headers=self.headers)
            requests_json = requests_res.json()
            page += 1
            print(requests_json)
            for count, data in enumerate(requests_json["search"]["data"]["list"]):
                res_data = crawler_video_page(data.get("video_short_url"), data.get("media_id"))
                # res_data["hot_num"] = data.get("hot")
                res_data["is_hot"] = 0
                res_data_list.append(res_data)

            output_result(result_Lst=res_data_list,
                          platform=self.platform,
                          output_to_es_raw=True,
                          )
            res_data_list.clear()

    def get_hot_page_videos(self, max_page=10):

        res_data_list = []
        headers = {
                "Host": "haokan.baidu.com",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 haokan/5.9.2.10 (Baidu; P1 5.1.1)/OPPO_22_1.1.5_11R+OPPO/1022131c/3B42DEA1B123E0BFCC96D85E1E191EB1%7C0/1/5.9.2.10/509021/1",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
                # "Cookie": "BAIDUCUID=_82ZiliKS8lNav8m0aHRuliP-i0EOvatgiv6fg8kSiKoLqqqB; Hm_lvt_77ca61e523cd51ec7ac7a23bc4d24edf=1582689757; BAIDUZID=xOiExF1dC2xpt9E7juy1SeKlPDFxsf9nSi9w-06_uAg4fX1n0Ludwa0A0JsRzcPL1pUlarSep8D1-W4SE8U4K9A; BAIDUID=E577F98F951CE0989D45142695B6CE78:FG=1; Hm_lpvt_77ca61e523cd51ec7ac7a23bc4d24edf=1582769261",
                "X-Requested-With": "com.baidu.haokan",
        }
        url = "https://haokan.baidu.com/haokan/wisehotbroadcast?sfrom=inside-search_found"
        res = retry_get_url(url=url, proxies=3, headers=headers)

        res_json = json.loads(re.findall(r"__PRELOADED_STATE__ = (.*);[ ]+document", res.text)[0])
        print(res_json)
        for data in res_json["video"]:
            res_data = crawler_video_page(data.get("pageUrl"), vid=data.get("vid"))
            res_data["hot_num"] = data.get("hot")
            res_data["is_hot"] = 1
            res_data_list.append(res_data)

        output_result(result_Lst=res_data_list,
                      platform=self.platform,
                      output_to_es_raw=True,
                      )
        res_data_list.clear()

    def get_hot_videos(self,**kwargs):
        if kwargs.get("data_type") == "hottopic_image":
            self.search_hottopic(**kwargs)
        else:
            self.search_video(**kwargs)


if __name__ == "__main__":
    crawler = CrawlerHaoKan()
    # crawler.get_hot_words()
    # crawler.get_hot_videos()
    crawler.search_hottopic("baiduhaokan://webview/?url_key=https%3A%2F%2Fhaokan.baidu.com%2Fcreator%2Fhaokantopic%2Ftopichomepage%2F5875485993792393513%26sfrom%3Dinside-souqianyeHuaTi",channel="#专治不开心#")