# -*- coding: utf-8 -*-
"""
Created on Thu Mar 15 15:23:08 2018

@author: fangyucheng

Edited by hanye on 2018-05-15

Edited by fangyucheng on 2018-05-17

Edited by litao on 2019-04-10
"""

import copy
import datetime
import json
import random
import re
import time
import urllib

from write_data_into_es.func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
import requests

from crawler.crawler_sys.framework.get_redirect_resp import get_redirected_resp
from crawler.crawler_sys.utils.get_toutiao_as_cp_signature import as_cp
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.site_crawler.toutiao_get_signature import getHoney
from crawler.crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.util_logging import logged


class Crawler_toutiao():

    def __init__(self, timeout=None, platform='toutiao'):
        if timeout is None:
            self.timeout = 10
        else:
            self.timeout = timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
        for popk in pop_key_Lst:
            self.video_data.pop(popk)
        self.releaser_url_pattern = 'http://www.365yg.com/c/user/[RELEASER_ID]/'

        self.list_page_url_dict = {'all_channel': (
            'https://www.365yg.com/api/pc/feed/?max_behot_time=0'
            '&category=video_new&utm_source=toutiao')}
        self.legal_list_page_urls = []
        self.legal_channels = []
        for ch in self.list_page_url_dict:
            list_page_url = self.list_page_url_dict[ch]
            self.legal_list_page_urls.append(list_page_url)
            self.legal_channels.append(ch)
        self.api_list = [
            "ic",
            "is",
            "api3-normal-c-hl",
            "ib",
            "api3-normal-c-lf",
            "id",
            "ie",
            "api3-normal-c-lq",
            "ii",
            "io",
            "it",
            "iu",
            "lf",
            "lg",
            "lh",
        ]
        #        log_path = '/home/hanye/crawlersNew/crawler/crawler_log'
        #        current_day = str(datetime.datetime.now())[:10]
        #        info_log_file = log_path + '/all_' + current_day + '.log'
        #        info_name = self.platform + '_info'
        #        error_log_file = log_path + '/error_' + current_day + '.log'
        #        error_name = self.platform + '_error'
        #        self.loggerinfo = output_log.init_logger(name=info_name, log_file=info_log_file)
        #        self.loggererror = output_log.init_logger(name=error_name, log_file=error_log_file)
        # self.headers = {
        #
        #     # "Host": "api3-normal-c-hl.snssdk.com",
        #     "Connection": "Keep-Alive",
        #     "accept-encoding": "gzip, deflate",
        #     "x-ss-sessionid": "",
        #     "sdk-version": "2",
        #     # "cookie": "ttreq=1$b72ff1a4bec1a13b4e42dc12a741db51c0b2d52b; odin_tt=6664b911161b73ecb325b187e53bb16a2d2702779d2d74ec168a8ac182b33eb6e273ee261d6f965442fc71e3efa2f581a39e3c4c58a8ea33ff87c7103d4833db",
        #     # "x-tt-trace-id": "00-eea570870a10554009a6731d771f000d-eea570870a105540-01",
        #     # "tt-request-time": "4294967295",
        #     # "x-ss-cookie": "odin_tt=6664b911161b73ecb325b187e53bb16a2d2702779d2d74ec168a8ac182b33eb6e273ee261d6f965442fc71e3efa2f581a39e3c4c58a8ea33ff87c7103d4833db; ttreq=1$b72ff1a4bec1a13b4e42dc12a741db51c0b2d52b",
        #     # "x-ss-dp": "13",
        #     # "passport-sdk-version": "5.1.0",
        #     # "user-agent": "News 7.5.4 rv:7.5.4.10 (iPhone; iOS 10.3.3; zh_CN) Cronet",
        #     # "X-Gorgon": "0401f049000019b9cb347d6a6db31a12f642c82f8052c7bd8ba0",
        #     # "X-Khronos": str(int(datetime.datetime.now().timestamp())),
        #     "sec-fetch-user": "?1",
        #     "cache-control": "?max-age=0",
        #     "sec-fetch-site": "none",
        #     "upgrade-insecure-requests": "1",
        #
        # }
        # self.headers = {
        #         "Connection": "keep-alive",
        #         # "User-Agent": "News 7.6.0 rv:7.6.0.14 (iPad; iOS 13.3.1; zh_CN) Cronet",
        #         "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 5.1.1; OPPO R11 Build/NMF26X) NewsArticle/7.1.5 cronet/TTNetVersion:2996a052 2019-08-12",
        #         "sdk-version": "2",
        #         # "x-ss-sessionid": "",
        #         # "passport-sdk-version": "5.4.0",
        #         # "X-SS-DP": "13",
        #         "Accept-Encoding": "gzip, deflate",
        #         # "X-Gorgon": "030000003005ed4d41d9d6e0dd5c9b0dbb917c%sc282d" % random.randint(401800000,401952122),
        #
        # }
        self.headers = {

            "accept": "text/javascript, text/html, application/xml, text/xml, */*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh,zh-CN;q=0.9",
            "content-type": "application/x-www-form-urlencoded",
            # "cookie": "gftoken=MjA4NTcyMDkyMXwxNTgyOTYxNjM3NjZ8fDAGBgYGBgY; SLARDAR_WEB_ID=9706fc8c-b8a6-4265-8a2e-e3f0739daaf2; UM_distinctid=1708fddb4c0466-04c756d28410e1-752c6c3c-51abc-1708fddb4c1790; CNZZDATA1274386066=608234173-1582960977-https%253A%252F%252Fwww.toutiao.com%252F%7C1582960977",
            # "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=50502346296&media_id=50502346296&request_source=1",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",

        }

    def extract_field(self, raw_str, raw_field_name):
        try:
            field_value = (re.findall('%s:.+?,' % raw_field_name, raw_str)[0]
                               .replace('%s:' % raw_field_name, '')[:-1])
            # remove string start space and single quotation marks
            field_value_cleaned = re.sub('\'$', '', re.sub('^\s+?\'', '', field_value))
        except:
            field_value_cleaned = None
        return field_value_cleaned

    def get_host_str(self, url):
        get_host_str = re.findall('://.+?/', url)
        if get_host_str != []:
            host_str = get_host_str[0].replace(':', '').replace('/', '')
        else:
            host_str = None
        return host_str


    def video_page(self, url):
        """
        release_time is missing, should update this field when inserting into es.
        url such as 'http://toutiao.com/group/6532819433331622404/' can not get data now
        suggestion by fangyucheng is to rebuild url as
        www.365yg.com with video_id to sovle the problem
        """

        if "item" in url or "group" in url:
            vid = re.findall("/(\d+)/", url)[0]
        elif "xigua" in url:
            vid = re.findall("/i(\d+)/", url)[0]
        else:
            print(url)
            return None
        headers = {
            "Accept-Encoding": "gzip",
            "X-SS-REQ-TICKET": str(int(datetime.datetime.now().timestamp()) * 1e3),
            "sdk-version": "1",
            # "Cookie": "qh[360]=1; install_id=85200129335; ttreq=1$e8e97b875965bf4af4b5dbaaba4d4a5ec3441e47; history=JM89SDpxGAfw5%2F%2Bo%2F7tEz15%2FZ0tbUEN7Q8FhEYQQIdJ2oNFBgpagCA7BIbUFNUT0NjSkRIvl2AveOdr2XEVUuDS0FFnQEETEo%2BOH5%2Fvj9%2F0WyqF4xphMZNLJeD6aSBmk15Tt4nTWSGUaEHR0e%2BG9aqGfPFOgOXrZ%2BtQBJVI6QXPA89R9dzs2QCqC6eil7H3eQhcFiJOXE4NLgDL9q7FscXLM78Qv62rk0GuiRN511vlNRZioEEArGesNaKhQXxBmHd1q7ic19JNcb90Cu1ELfdQz11KkY4Ob%2BWZYex%2BRPCfFK6uaO12GkJ%2FEN%2BtofMgAVEg8s0qbw2ehgkKiwToovMVNdJP4ai%2Fqvw4vjlLXFi%2BqefWmhTKpUvum%2FoR3VBIvYDrgeYT5YtpNksxJe6WeA3SReODW1diayV1cq%2FzDhf2%2FoqFMognaHwAAAP%2F%2F; odin_tt=8cd4f07f6dc385b01edd52312dd29fbe7fdbfa059194493779de3fe408b8836bb9265292bb9335bc976037dd93e5d131de7acf894a805930417b4d3be7f308e0",
            # "X-Gorgon": "0300ddd08400675de6e75ad03849011c863306ddae2b0eb3cec4",
            # "X-Khronos": str(int(datetime.datetime.now().timestamp())),
            # "Host": "xgapi.snssdk.com",
            "Connection": "Keep-Alive",
            "Authorization": "HMAC-SHA1:2.0:1573091168911407306:bab42eac5b9e4a8eb25a91fc371ad533:WTfDrhnIsymHfmHCgG9YvRSu2YY=",
            "User-Agent": "okhttp/3.10.0.1",
            "X-Pods": "",
        }

        print(vid)
        url_dic = {
            "group_id": vid,
            "item_id": vid,
            "aggr_type": 0,
            "context": 1,
            "flags": 64,
            # "iid": "77627602260",
            # "device_id": random.randint(50000000000,59999999999),
            "ac": "wifi",
            "channel": "update",
            "aid": "13",
            "app_name": "news_article",
            "version_code": "732",
            "version_name": "7.3.2",
            "device_platform": "android",
            "ab_version": "830855,947965,942635,662176,665176,674051,643894,919834,649427,677130,710077,801968,707372,661900,668775,990369,739390,662099,668774,765190,976875,857803,952277,757281,679101,660830,759657,661781,648315",
            "ab_group": "100168",
            "ab_feature": "94563,102749",
            "ssmix": "a",
            "device_type": "oppo R11s Plus",
            "device_brand": "OPPO",
            "language": "zh",
            "os_api": "23",
            "os_version": "6.0.1",
            # "uuid": "250129616283002",
            # "openudid": "7313ae71df9e5367",
            "manifest_version_code": "731",
            "resolution": "810*1440",
            "dpi": "270",
            "update_version_code": "75410",
            "_rticket": int(datetime.datetime.now().timestamp() * 1e3),
            # "rom_version": "coloros__v417ir release-keys",
            # "fp": "w2TZFzTqczmWFlwOLSU1J2xecSKO",
            # "tma_jssdk_version": "1.24.0.1",
            # "pos": "5r_x8vP69Ono-fi_p6ysq7Opra2kr6ixv_H86fTp6Pn4v6eupLOkra6vpajg",
            "plugin": "0",
            # "ts":int(datetime.datetime.now().timestamp()),
            #     "as":"ab7f9fce505d1d7dbe7f9f",
            #     "mas":"011993339399f959a359d379b98587814259a359d3997919d319b3"
        }
        url = 'http://xgapi.snssdk.com/video/app/article/information/v25/?%s' % (
            urllib.parse.urlencode(url_dic))
        # get_page = get_redirected_resp(url)
        res = retry_get_url(url, headers=headers, timeout=5, proxies=1)

        try:
            get_page = res.json()
            # print(get_page)
        except:
            return None

        if get_page is None:
            return None
        else:
            data = get_page.get("data")
            video_dict = copy.deepcopy(self.video_data)
            fetch_time = int(datetime.datetime.now().timestamp() * 1e3)

            video_dict['url'] = data.get("display_url")
            try:
                try:
                    video_dict['title'] = data.get("h5_extra").get("title")
                except:
                    video_dict['title'] = data.get("share_info").get("title")
            except:
                return None
            video_dict['play_count'] = data.get("video_watch_count")
            video_dict['favorite_count'] = data.get("digg_count")
            video_dict['comment_count'] = data.get("comment_count")
            if not video_dict['comment_count']:
                video_dict['comment_count'] = 0
            video_dict['repost_count'] = data.get("repin_count")
            if not video_dict['repost_count']:
                video_dict['repost_count'] = 0
            video_dict['video_id'] = vid
            try:
                video_dict['releaser'] = data.get("h5_extra").get("name")
            except:
                video_dict['releaser'] = data.get("source")
            try:
                video_dict['releaser_id_str'] = data.get("h5_extra").get("media_user_id")
            except:
                video_dict['releaser_id_str'] = data.get("user_info").get("user_id")
            video_dict['releaserUrl'] = "https://www.toutiao.com/c/user/%s/" % video_dict['releaser_id_str']
            video_dict['releaser_id_str'] = "toutiao_%s" % video_dict['releaser_id_str']
            video_dict['duration'] = data.get("video_duration")
            try:
                if type(video_dict['play_count']) != int:
                    video_dict['play_count'] = data.get("video_detail_info").get("video_watch_count")
            except:
                return None
            try:
                if type(video_dict['duration']) != int:
                    video_dict['duration'] = 0
            except:
                return None
            video_dict['fetch_time'] = fetch_time
            try:
                video_dict['release_time'] = int(int(data.get("h5_extra").get("publish_stamp")) * 1e3)
            except:
                video_dict.pop("release_time")
            for k in video_dict:
                if type(video_dict[k]) == str or type(video_dict[k]) == int:
                    pass
                else:
                    return None
            return video_dict

    def get_web_article_info(self, article_id, proxies_num=0):
        # headers = {
        #     "Accept": "*/*",
        #     "Accept-Encoding": "gzip, deflate",
        #     "Accept-Language": "zh,zh-CN;q=0.9",
        #     "Connection": "keep-alive",
        #     # "Cookie": "tt_webid=6851461299689686542; SLARDAR_WEB_ID=568d391e-7f96-491b-9557-b045a55e9dd8",
        #     "Host": "m.toutiao.com",
        #     # "Referer": "https://m.toutiao.com/i6851146167279944199/",
        #     "Sec-Fetch-Dest": "empty",
        #     "Sec-Fetch-Mode": "cors",
        #     "Sec-Fetch-Site": "same-origin",
        #     "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        # }
        # headers["Referer"] = "https://m.toutiao.com/i%s" % article_id
        headers = {

            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "tt_webid=6851788569271944719",
            "Host": "m.toutiao.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id)
        requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
        res_json = requests_res.json()
        try:
            content = res_json["data"].get("content").replace("\r", "").replace("\n", "")
        except:
            content = ""
        try:
            title = res_json["data"].get("title").replace("\r", "").replace("\n", "")
        except:
            title = ""
        try:
            play_count = int(res_json["data"].get('impression_count'))
        except:
            play_count = 0
        res_dic = {
            "title":title,
            'high_quality_flag': int(res_json["data"].get('high_quality_flag')),
            "play_count": play_count,
            "comment_count": res_json["data"].get("comment_count"),
            "repost_count": res_json["data"].get("repost_count"),
            "favorite_count": res_json["data"].get("digg_count"),
            'releaser_followers_count': res_json["data"].get("follower_count"),
            'release_time': int(res_json["data"].get('publish_time') * 1e3),
            "content": content,
            "img_list": re.findall('img src=".*?"', res_json["data"].get("content"))
        }
        return res_dic


    def search_page_old(self, keyword, search_pages_max=12,
                        output_to_es_raw=False,
                        output_to_es_register=False,
                        es_index=None,proxies_num=0):
        headers_search = {
            "accept": "application/json, text/javascript",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "content-type": "application/x-www-form-urlencoded",
            # "cookie": "csrftoken=37420c8aa08013294281c3f0b053377d; WEATHER_CITY=%E5%8C%97%E4%BA%AC; SLARDAR_WEB_ID=7a399564-d37b-40eb-ad7e-bd04a4b7a43c; ttcid=01d94567f6f644248be6cfba11f23d8640; s_v_web_id=verify_kcvob60z_zV3lvb5j_dO3z_42Np_A2aO_h2hUwNM55Jt0; tt_webid=6855452651430823432; __tasessionId=xghnx07iq1596159468729; tt_webid=6855452651430823432; tt_scid=8KvMkZv-mN4OUclzXS7.9-pl0T409L4rqvI.Y2c0gwRvwMdRjm7SczvFbCGtzdgAcc7e",
            "referer": "https://www.toutiao.com/search/?keyword=%s" % keyword.encode('utf-8'),
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        urls = []
        for page_num in range(0, search_pages_max):
            page_num = page_num * 20
            query_dic = {
                "aid": "24",
                "app_name": "web_search",
                "offset": str(page_num),
                "format": "json",
                "keyword": keyword.encode('utf-8'),
                "autoload": "true",
                "count": "20",
                "en_qc": "1",
                "cur_tab": "1",
                "from": "search_tab",
                "pd": "synthesis",
                "timestamp": int(datetime.datetime.now().timestamp()*1e3),
                # "_signature": "b.FccAAgEBCppuX9t-890W.wHWAADDV3aS3k4oUj4uf89BAU.AlKwpVJ5sJqJx5vRyWYy6hHnm9HqZSc0oNQfDNbq5oqJlQQvxS1Qb2pSamjLMSecFu67csOzqT88Nn13wX",

            }
            url = 'https://www.toutiao.com/api/search/content/?{0}'.format(urllib.parse.urlencode(query_dic))
            urls.append(url)
        toutiao_Lst = []
        for search_page_url in urls:
            get_page = retry_get_url(search_page_url, headers=headers_search,proxies=proxies_num)
            if get_page.status_code != 200:
                # retry once
                get_page = requests.get(search_page_url)
                if get_page.status_code != 200:
                    continue
            page_dict = get_page.json()
            if page_dict["has_more"] == 0:
                break
            # print(page_dict)
            if page_dict['data']:
                for one_line in page_dict['data']:
                    try:

                        title = one_line['title']
                        try:
                            abstract = one_line['abstract']
                        except:
                            abstract = ""
                        if one_line.get('article_url'):
                            url = one_line['article_url']
                        elif one_line.get('url'):
                            url = one_line['url']
                        play_count = one_line['read_count']
                        comment_count = one_line['comment_count']
                        favorite_count = one_line['digg_count']
                        article_id = one_line['id']
                        releaser = one_line['media_name']
                        uid = one_line['user_id']
                        releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
                        release_time = one_line['publish_time']
                        release_time = int(int(release_time) * 1e3)
                        fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
                        releaser_id = self.find_releaser_id(releaserUrl)
                        D0 = copy.deepcopy(self.video_data)
                        D0['title'] = title
                        D0['abstract'] = abstract
                        D0['url'] = url
                        D0['play_count'] = play_count
                        D0['comment_count'] = comment_count
                        D0['favorite_count'] = favorite_count
                        D0['article_id'] = article_id
                        D0['releaser'] = releaser
                        D0['releaserUrl'] = releaserUrl
                        D0['release_time'] = release_time
                        D0['releaser_id_str'] = "toutiao_%s" % releaser_id
                        D0['fetch_time'] = fetch_time
                        D0['search_word'] = keyword
                        D0["type"] = "article"
                        try:
                            article_info = self.get_web_article_info(article_id, proxies_num=proxies_num)
                            D0.update(article_info)
                        except Exception as e:
                            print("method get_web_article_info error %s" % e)
                        # print(D0)
                        toutiao_Lst.append(D0)
                    except Exception as e:
                        # It's totally ok to drop the last return data value.
                        # The search api just return something seems related to search
                        print(e)
                        continue
                    if len(toutiao_Lst) >= 100:
                        output_result(result_Lst=toutiao_Lst,
                                      platform=self.platform,
                                      output_to_es_raw=output_to_es_raw,
                                      output_to_es_register=output_to_es_register,
                                      es_index=es_index,
                                      )
                        toutiao_Lst.clear()

        if toutiao_Lst != []:
            output_result(result_Lst=toutiao_Lst,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                         )

        return toutiao_Lst

    def search_page(self, keyword, search_pages_max=30,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    es_index=None, proxies_num=0):
        self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
                             output_to_es_register=output_to_es_register,
                             es_index=es_index,
                             proxies_num=proxies_num)

    def find_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def releaser_page_via_pc(self, releaserUrl,
                             output_to_file=False, filepath=None,
                             releaser_page_num_max=30,
                             output_to_es_raw=False,
                             output_to_es_register=False,
                             push_to_redis=False,
                             es_index=None,
                             doc_type=None,
                             proxy_dic=None):
        """
        If output_to_file=True is passed in, an absolute path string should also
        be passed to filepath parameter. filepath string tells where to put the
        output file. The output file name is assigned automatically, is not supported
        to assign by user.
        """

        headers = {'Host': 'www.365yg.com',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Connection': 'keep-alive',
                   'Upgrade-Insecure-Requests': '1',
                   'Cache-Control': 'max-age=0'}
        result_Lst = []
        whether_continue = True
        behot_time = 0
        page_count = 0
        releaser_id = self.find_releaser_id(releaserUrl)
        if releaser_id is None:
            pass
        # self.loggererror.error("%s can't get releaser_id" % releaserUrl)
        else:
            print('releaser_id', releaser_id)
            # self.loggerinfo.info("process on releaser %s" % releaser_id)
            while whether_continue is True and page_count <= releaser_page_num_max:
                releaser_page_url = ('https://www.365yg.com/c/user/article/?user_id='
                                     + releaser_id + '&max_behot_time=' + str(behot_time) +
                                     '&max_repin_time=0&count=20&page_type=0')
                # http://m.365yg.com/video/app/user/home/?to_user_id=73299297129&format=json&max_behot_time=0
                get_page = retry_get_url(releaser_page_url, headers=headers, proxies=proxy_dic)
                if get_page is None:
                    # self.loggererror.error("%s can't get page at page num %s" % (releaserUrl, page_count))
                    print("can't get page at %s" % page_count)
                    continue
                else:
                    page_count += 1
                    try:
                        page_dic = get_page.json()
                    except:
                        page_dic = None
                        # self.loggererror.error('Failed to transfer text to dict on json data url: %s' % releaser_page_url)
                        print('Failed to transfer text to dict on '
                              'json data url: %s' % releaser_page_url)
                        continue
                    video_dic = page_dic['data']
                    whether_continue = page_dic['has_more']
                    behot_time = page_dic['next']['max_behot_time']
                    for line in video_dic:
                        video_dict = copy.deepcopy(self.video_data)
                        # behot_time is different in every video item in the same
                        # releaser page, and will generally descent. The one used
                        # to get next page is the behot_time value from the last
                        # video in the present releaser page.
                        try:
                            video_dict['release_time'] = int(line['behot_time'] * 1e3)
                            video_dict['title'] = line['title']
                            video_dict['url'] = line['display_url']
                            video_dict['releaser'] = line['source']
                            video_id = line['group_id']
                            video_dict['video_id'] = video_id
                            video_dict['url'] = 'http://www.365yg.com/a' + str(video_id) + '/'
                            video_dict['comment_count'] = line['comments_count']
                            duration_str = line['video_duration_str']
                            video_dict['duration'] = trans_duration(duration_str)
                            video_dict['play_count'] = line['video_watch_count']
                        except KeyError as except_msg:
                            # self.loggererror.error('Got KeyError exception: %s at page %s'
                            # % (except_msg, releaserUrl))
                            print('Got KeyError exception: %s at page %s' % (
                                except_msg, releaserUrl))
                            try:
                                print(duration_str)
                            except:
                                print("can't print duration_str")
                            continue
                        fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
                        video_dict['platform'] = self.platform
                        video_dict['releaser_id_str'] = str(releaser_id)
                        video_dict['fetch_time'] = fetch_time
                        video_dict['releaserUrl'] = releaserUrl
                        result_Lst.append(video_dict)
                        if len(result_Lst) % 100 == 0:
                            output_result(result_Lst, self.platform,
                                          output_to_file=output_to_file,
                                          filepath=filepath,
                                          output_to_es_raw=output_to_es_raw,
                                          output_to_es_register=output_to_es_register,
                                          push_to_redis=push_to_redis,
                                          es_index=es_index,
                                          doc_type=doc_type)
                            result_Lst.clear()
                            print(behot_time)
            #                            self.loggerinfo.info("write %s 100 video_data_dicts into es %s, %s"
            #                                                 % (releaser_id, es_index, doc_type))

            if result_Lst != []:
                output_result(result_Lst, self.platform,
                              output_to_file=output_to_file,
                              filepath=filepath,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              push_to_redis=push_to_redis,
                              es_index=es_index,
                              doc_type=doc_type)

    #                self.loggerinfo.info("write %s %s video_data_dicts into es %s, %s"
    #                                     % (releaser_id, len(result_Lst), es_index, doc_type))

    def check_play_count_by_video_page(self, url):
        """
        To check whether the play_count from releaser_page(www.365yg.com) is the
        real play_count show on video_page
        """
        if "toutiao.com" in url:
            video_id_str = ' '.join(re.findall('/group/[0-9]+', url))
            video_id = ' '.join(re.findall('\d+', video_id_str))
            url = 'http://www.365yg.com/a' + video_id
        get_page = get_redirected_resp(url)
        if get_page is None:
            return None
        else:
            page = get_page.text
            find_play_count = re.findall('videoPlayCount: \d+,', page)
            if find_play_count != []:
                play_count = re.findall('\d+', find_play_count[0])[0]
                return int(play_count)
            else:
                print("can't get play_count")

    def get_releaser_image(self, releaserUrl=None, data=None):
        if releaserUrl:
            headers = {'Host': 'www.toutiao.com',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
                       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                       'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                       'Accept-Encoding': 'gzip, deflate, br',
                       'Connection': 'keep-alive',
                       'Upgrade-Insecure-Requests': '1',
                       'Cache-Control': 'max-age=0'}
            proxies = get_proxy(1)
            releaser_id = self.find_releaser_id(releaserUrl)
            releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/'
            get_page = retry_get_url(releaserUrl, headers=headers, proxies=proxies)
            page = get_page.text
            try:
                releaser_img = re.findall("avtar_img:'(.*)'", data)[0]
                return "http:" + releaser_img
            except:
                print("can't get releaser_img")
        else:
            releaser_img = re.findall("avtar_img:'(.*)'", data)[0]
            return "http:" + releaser_img

    def get_releaser_follower_num(self, releaserUrl):
        releaser_id = self.find_releaser_id(releaserUrl)
        releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/'
        headers = {
            # 'referer': releaserUrl,
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh,zh-CN;q=0.9",
            "cache-control": "max-age=0",
            # "cookie": 'csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; _ga=GA1.2.161734802.1557472235; sid_guard="135fa6481579978c777871e6fd64388b|1561603571|15552000|Tue\054 24-Dec-2019 02:46:11 GMT"; tt_webid=6722360764079998478; CNZZDATA1259612802=1042288349-1556155453-%7C1565168219; tt_track_id=c588c87e33b84cd5dd4056cf33200ca4; _ba=BA0.2-20191118-51299-hQirR6SUpyfp1u9bfDab; tt_webid=6722360764079998478; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16eef18c3b72df-0f53ff0c8ec3e8-72256d3c-5c030-16eef18c3b8634; s_v_web_id=335bc73471816523ac088af5f1424861; __tasessionId=f2cnfge5z1577959024466',
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        count = 0

        while count < 3:
            try:
                count += 1
                get_page = retry_get_url(releaserUrl, headers=headers, proxies=5)
                page = get_page.text
                follower_num = int(re.findall('\d+', ' '.join(re.findall("fensi:'\d+'", page)))[0])
                print('%s follower number is %s' % (releaserUrl, follower_num))
                releaser_img = self.get_releaser_image(data=page)
                return follower_num, releaser_img
            except:
                print("can't find followers")
                continue
        else:
            return None, None

    def get_releaser_page(self, releaserUrl):
        releaser_id = self.find_releaser_id(releaserUrl)
        releaserUrl = 'https://profile.zjurl.cn/user/profile/homepage/v7/?user_id=%s&request_source=1' % str(
            releaser_id)
        headers = {
            "accept": "text/javascript, text/html, application/xml, text/xml, */*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh,zh-CN;q=0.9",
            "content-type": "application/x-www-form-urlencoded",
            # "cookie": "_ga=GA1.2.820802612.1574645404; SLARDAR_WEB_ID=6678585193337783815; UM_distinctid=16f694e01a8d0-033097fa287e68-5f4e2917-161398-16f694e01a96ab; odin_tt=c3b67a2f796710b8aeda2eea0ff940b66427f774a370e8151a32483f06d120ad1c713efefc2d455cb9ec6178ddbbd1a6; gftoken=NDE2NDU0MzExM3wxNTgyODkwMDI4NTh8fDAGBgYGBgY; CNZZDATA1274386066=1245948952-1578016633-%7C1589879022",
            "referer": releaserUrl,
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "x-requested-with": "XMLHttpRequest",
        }
        count = 0

        while count < 3:
            try:
                count += 1
                get_page = retry_get_url(releaserUrl, headers=headers, proxies=5, timeout=5)
                page = get_page.json()
                follower_num = page["data"]['followers_count']
                print('%s follower number is %s' % (releaserUrl, follower_num))
                time.sleep(random.random())
                location = page["data"].get('area')
                huozan = page["data"].get("digg_count")
                verify = page["data"].get("'verified_content'")
                signature = page["data"].get('description')
                publish_count = page["data"].get('publish_count')
                signature_type = page["data"].get('media_type')
                dic = {
                    "location": location,
                    "huozan": huozan,
                    "verify": verify,
                    "signature": signature,
                    "follower_num": follower_num,
                    "publish_count": publish_count,
                    "signature_type": signature_type,
                }
                print(dic)
                return dic
            except:
                print("can't find followers")
                continue
        else:
            return None

    def get_releaserUrl_from_video_page(self, url, proxy_dic=None):
        """
        To get releaserUrl from video page
        """
        if "toutiao.com" in url:
            video_id_str = ' '.join(re.findall('/group/[0-9]+', url))
            video_id = ' '.join(re.findall('\d+', video_id_str))
            url = 'http://www.365yg.com/a' + video_id
        get_page = retry_get_url(url, proxies=proxy_dic)
        if get_page is None:
            return None
        else:
            page = get_page.text
            find_releaser_id = re.findall("mediaId: '\d+',", page)
            if find_releaser_id != []:
                releaser_id = re.findall('\d+', ' '.join(find_releaser_id))[0]
                releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/'
                return releaserUrl
            else:
                return None

    def list_page(self, task_lst,
                  output_to_file=False,
                  filepath=None,
                  page_num_max=20,
                  output_to_es_raw=False,
                  output_to_es_register=False,
                  es_index='crawler-data-raw',
                  doc_type='doc',
                  proxy_dic=None):

        """
        To get video data from list page, it can not be revised to async-crawler
        due to the next page depends on the previous page's data
        """

        cookie = ('tt_webid=6553778542248003086;'
                  'CNZZDATA1259612802=1625670355-1516338642-%7C1527150919;'
                  '_ga=GA1.2.1539151044.1516342895;'
                  '__utma=91863192.1539151044.1516342895.1521092491.1521092491.1;'
                  '__tea_sdk__user_unique_id=6553778542248003086;'
                  '__tea_sdk__ssid=545b4c91-3bd3-4748-831b-6edbf9415b70;'
                  'CNZZDATA1262382642=810628165-1527124428-%7C1529484233;'
                  '__tasessionId=ptjvpftsc1539054420281;'
                  '_gid=GA1.2.1520435477.1539054422')

        headers = {'Host': 'www.365yg.com',
                   'User-Agent': self.random_useragent(),
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Cookie': cookie,
                   'Connection': 'keep-alive',
                   'Upgrade-Insecure-Requests': '1',
                   'Cache-Control': 'max-age=0'}

        result_lst = []
        max_behot_time = 0
        page_num = 1
        while page_num <= page_num_max:
            listurl = ('https://www.365yg.com/api/pc/feed/?max_behot_time='
                       + str(max_behot_time) + '&category=video_new&utm_source=toutiao')
            get_page = retry_get_url(listurl, headers=headers, proxies=proxy_dic)
            page_num += 1
            try:
                page_dic = get_page.json()
            except:
                page_dic = {}
            if page_dic == {}:
                max_behot_time = 0
                print("can't get list page")
                continue
            else:
                max_behot_time = page_dic['next']['max_behot_time']
                video_info_lst = page_dic['data']
                for line in video_info_lst:
                    video_dic = copy.deepcopy(self.video_data)
                    title = line['title']
                    video_dic['title'] = line['title']
                    video_dic['data_from'] = 'list_page'
                    video_dic['url'] = 'https://www.365yg.com/a' + line['group_id']
                    try:
                        dura_str = line['video_duration_str']
                        video_dic['duration'] = trans_duration(dura_str)
                    except:
                        video_dic['duration'] = 0
                        print("%s can't get duration" % title)
                    video_dic['releaser'] = line['source']
                    video_dic['releaserUrl'] = 'https://www.365yg.com' + line['media_url']
                    video_dic['release_time'] = int(int(line['behot_time']) * 1e3)
                    try:
                        video_dic['describe'] = line['abstract']
                    except:
                        video_dic['describe'] = ''
                        print("%s can't get describe" % title)
                    try:
                        video_dic['play_count'] = line['video_play_count']
                    except:
                        video_dic['play_count'] = 0
                        print("%s can't get play_count" % title)
                    video_dic['fetch_time'] = int(datetime.datetime.timestamp() * 1e3)
                    result_lst.append(video_dic)
                    if len(result_lst) >= 100:
                        output_result(result_Lst=result_lst,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      output_to_es_raw=output_to_es_raw,
                                      output_to_es_register=output_to_es_register,
                                      es_index=es_index,
                                      doc_type=doc_type)
                        result_lst.clear()
                        print(max_behot_time)
        if result_lst != []:
            output_result(result_Lst=result_lst,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                          doc_type=doc_type)
        return result_lst

    def get_data_mediaid(self, releaserUrl, releaser_id):
        headers = {
            "Host": "m.toutiao.com",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": self.random_useragent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        releaserUrl = "http://m.toutiao.com/profile/%s/#mid=%s" % (releaser_id, releaser_id)
        time.sleep(1)
        res = requests.get(releaserUrl, headers=headers, timeout=5)

        # cookie = requests.utils.dict_from_cookiejar(res.cookies)
        # print(cookie)
        try:
            data_mediaid = re.findall(r'data-mediaid="(\d+)"', res.text)
        except:
            data_mediaid = ""
        if data_mediaid:
            # print(data_mediaid)
            return data_mediaid[0]
        else:
            return False

    def App_releaser_page_video(self, releaserUrl,
                                output_to_file=False,
                                filepath=None,
                                releaser_page_num_max=30,
                                output_to_es_raw=False,
                                output_to_es_register=False,
                                push_to_redis=False,
                                es_index=None,
                                doc_type=None,
                                proxies_num=None):

        result_list = []
        has_more = True
        count = 1
        count_false = 0
        releaser_id = self.find_releaser_id(releaserUrl)
        offset = ""
        self.headers[
            "referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id
        # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
        # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
        # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
        # idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999)
        # iid = str(random.randint(104525900000, 104526000000))
        while has_more and count <= releaser_page_num_max:
            # print(str(releaser_id)+str(max_behot_time))
            # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time)))
            print("get %s video on page %s" % (releaser_id, count))

            # url_dic = {
            #     "visited_uid": releaser_id,
            #     "client_extra_params": '{"playparam":"codec_type:0"}',
            #     "count": "20",
            #     "category": "profile_video",
            #     "offset": offset,
            #     "stream_api_version": "88",
            #     # "client_extra_params": '{"playparam": "codec_type:0"}',
            #     # "media_id": "1649067165440014",
            #     # "resolution": "640*1136",
            #     "ab_feature": "794526,1226546",
            #     # "vid": vid,
            #     "app_name": "news_article",
            #     "channel": "App%20Store",
            #     # "cdid": ccid,
            #     "resolution": "1536*2048",
            #     "device_type": "iPad6,11",
            #     "iid": iid,
            #     # "cdid": ccid,
            #     "ssmix": "a",
            #     "update_version_code": "75410",
            #     "aid": "13",
            #     # "openudid":openudid,
            #     # "idfv": vid,
            #     "os_version": "13.3",
            #     "device_platform": "ipad",
            #     "ab_client": "a1,f2,f7,e1",
            #     "ab_group": "794526,1226546",
            #     "ac": "WIFI",
            #     "version_code": "7.5.4",
            #     # "device_id": random.randint(60000000000, 80000000000),
            #     # "openudid": openudid,
            #     # "idfv": vid,
            #     # "idfa": idfa,
            #     # "iid": "95105525198",
            #     # "device_id": "70149736870",
            #     #     "iid": "94295239614",
            #     #     "device_id": "69418894872",
            # }
            # url_dic = {
            #         # "category": "profile_video",
            #         "visited_uid": releaser_id,
            #         "stream_api_version": "47",
            #         "count": "20",
            #         "offset": offset,
            #         "client_extra_params": r'{}',
            #         "iid": iid,
            #         "ac": "wifi",
            #         # "mac_address": "08:00:27:1F:7E:A0",
            #         "channel": "wap_test_lite_1",
            #         "aid": "35",
            #         "app_name": "news_article_lite",
            #         "version_code": "715",
            #         "version_name": "7.1.5",
            #         "device_platform": "android",
            #         "ab_version": "668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,9289425",
            #         "ab_client": "a1,c2,e1,f2,g2,f7",
            #         "ab_feature": "z1",
            #         "abflag": "3",
            #         "ssmix": "a",
            #         "device_type": "OPPO R11",
            #         "device_brand": "OPPO",
            #         "language": "zh",
            #         "os_api": "22",
            #         "os_version": "5.1.1",
            #         "manifest_version_code": "715",
            #         "resolution": "1920*1080",
            #         "dpi": "401",
            #         "update_version_code": "71504",
            #         "sa_enable": "0",
            #         "tma_jssdk_version": "1.25.4.2",
            #         # "fp": "a_fake_fp",
            #         "rom_version": "coloros__r11-user",
            #         "plugin_state": "30631999",
            #         "as": "ab9db50e485e50977f9db5",
            #         "mas": "ab9db50e485e50977f9db5",
            #
            # }
            # host = random.choice(self.api_list)
            # self.headers["Host"] = host
            # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format(host,
            #                                                                                         random.randint(5,
            #                                                                                                        10),
            #                                                                                         urllib.parse.urlencode(
            #                                                                                             url_dic))
            # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&iid={4}&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(
            #     random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset), iid)
            # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&&iid=104525915827&device_id=70787469432&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format(
            #         random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset))
            url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_video&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format(
                str(releaser_id), str(offset), str(releaser_id))
            # print(url)
            try:
                proxies = get_proxy(proxies_num)
                if proxies:
                    get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10)
                else:
                    get_page = requests.get(url, headers=self.headers, timeout=10)
            except:
                continue
            # time.sleep(0.3)
            # print(get_page.text)
            # time.sleep(0.5)
            # format_json = re.match(r"jsonp\d+", get_page.text)
            page_dic = {}
            # print(get_page.text)
            try:
                page_dic = get_page.json()
                if page_dic.get("message") != "success":
                    count_false += 1
                    if count_false < 10:
                        continue
                    else:
                        print("unknow error")
                        break
                # print(get_page)
                # print(page_dic)
                data_list = page_dic.get('data')
                has_more = page_dic.get('has_more')
                offset = str(page_dic.get("offset"))
            except:
                if not page_dic:
                    count_false += 1
                    if count_false >= 10:
                        break
                    else:
                        continue
                if data_list:
                    data_list = page_dic.get('data')
                    has_more = page_dic.get('has_more')
                else:
                    data_list = []
                    has_more = False

            # offset = page_dic.get('offset')

            if has_more is None:
                has_more = False
            if data_list == []:
                print("no data in releaser %s page %s" % (releaser_id, count))
                # print(page_dic)
                # print(url)
                proxies = get_proxy(1)
                count_false += 1
                if count_false >= 5:
                    has_more = False
                continue

            else:
                count_false = 0
                count += 1
                for one_video in data_list:
                    # info_str = one_video.get('content')
                    info_dic = json.loads(one_video["content"])
                    video_dic = copy.deepcopy(self.video_data)
                    video_dic['title'] = info_dic.get('title')
                    video_dic['url'] = info_dic.get('share_url')
                    video_dic['releaser'] = info_dic.get('source')
                    video_dic['releaserUrl'] = releaserUrl
                    release_time = info_dic.get('publish_time')
                    video_dic['release_time'] = int(release_time * 1e3)
                    video_dic['duration'] = info_dic.get('video_duration')
                    video_dic['play_count'] = info_dic.get('video_detail_info').get("video_watch_count")
                    video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count')
                    video_dic['comment_count'] = info_dic.get('comment_count')
                    video_dic['favorite_count'] = info_dic.get('digg_count')
                    video_dic['video_id'] = info_dic.get('item_id')
                    video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                    video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
                    video_dic['video_img'] = self.get_video_image(info_dic)
                    result_list.append(video_dic)
                    if len(result_list) >= 100:
                        # data_count += len(result_list)
                        # print(result_list)
                        output_result(result_Lst=result_list,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type,
                                      output_to_es_register=output_to_es_register)
                        result_list.clear()
        if result_list != []:
            # data_count += len(result_list)
            # print(result_list)
            # print(data_count)
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type,
                          output_to_es_register=output_to_es_register)

    @staticmethod
    def get_video_image(data):
        video_image_url = ""
        if data.get("large_image_list"):
            video_image_url = data["large_image_list"][0]["url_list"][0]["url"]
            # height = data["large_image_list"][0]["height"]
            # width = data["large_image_list"][0]["width"]
        elif data.get("ugc_video_cover"):
            video_image_url = data["ugc_video_cover"]["url_list"][0]["url"]
            # height = data["ugc_video_cover"]["height"]
            # width = data["ugc_video_cover"]["width"]
        elif data.get("video_detail_info"):
            video_image_url = data["video_detail_info"]["detail_video_large_image"]["url_list"][0]["url"]
            # height = data["video_detail_info"]["detail_video_large_image"]["height"]
            # width = data["video_detail_info"]["detail_video_large_image"]["width"]

        return video_image_url

    def App_releaser_page_all(self, releaserUrl,
                              output_to_file=False,
                              filepath=None,
                              releaser_page_num_max=30,
                              output_to_es_raw=False,
                              output_to_es_register=False,
                              push_to_redis=False,
                              es_index=None,
                              doc_type=None,
                              proxy_dic=None, crawler_type="profile_all", proxies_num=None):

        result_list = []
        has_more = True
        count = 1
        releaser_id = self.find_releaser_id(releaserUrl)
        count_false = 0
        offset = "0"
        self.headers[
            "referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id
        # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999)
        # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999)
        # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999)
        # idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999)
        # iid = str(random.randint(104525900000, 104526000000))
        while has_more and count <= releaser_page_num_max:
            print("get %s article on page %s" % (releaser_id, count))
            # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time)))
            # url_dic = {
            #         # "category": crawler_type,
            #         "visited_uid": releaser_id,
            #         "client_extra_params": '{"playparam": "codec_type:0"}',
            #         "count": "20",
            #         "offset": offset,
            #         "stream_api_version": "88",
            #         "category": crawler_type,
            #         # "resolution": "640*1136",
            #         # "vid": vid,
            #         "ab_feature": "794526,1226546",
            #         # "vid": vid,
            #         "app_name": "news_article",
            #         "channel": "App Store",
            #         "cdid": ccid,
            #         "resolution": "1536*2048",
            #         "device_type": "iPad6,11",
            #         "ssmix": "a",
            #         "update_version_code": "75410",
            #         "aid": "13",
            #         # "openudid":openudid,
            #         # "idfv": vid,
            #         "os_version": "13.3",
            #         "device_platform": "ipad",
            #         "ab_client": "a1,f2,f7,e1",
            #         "ab_group": "794526,1226546",
            #         "ac": "WIFI",
            #         "version_code": "7.5.4",
            #         "iid": iid,
            #         # "device_id": random.randint(60000000000, 80000000000),
            # }
            # url_dic = {
            #         # "category": "profile_video",
            #         "visited_uid": releaser_id,
            #         "stream_api_version": "47",
            #         "count": "20",
            #         "offset": offset,
            #         "client_extra_params": r'{}',
            #         # "iid": iid,
            #         "ac": "wifi",
            #         # "mac_address": "08:00:27:1F:7E:A0",
            #         "channel": "wap_test_lite_1",
            #         "aid": "35",
            #         "app_name": "news_article_lite",
            #         "version_code": "715",
            #         "version_name": "7.1.5",
            #         "device_platform": "android",
            #         "ab_version": "668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,9289425",
            #         "ab_client": "a1,c2,e1,f2,g2,f7",
            #         "ab_feature": "z1",
            #         "abflag": "3",
            #         "ssmix": "a",
            #         "device_type": "OPPO%20R11",
            #         "device_brand": "OPPO",
            #         "language": "zh",
            #         "os_api": "22",
            #         "os_version": "5.1.1",
            #         "manifest_version_code": "715",
            #         "resolution": "1080*1920",
            #         "dpi": "401",
            #         "update_version_code": "71504",
            #         # "sa_enable": "0",
            #         # "fp": "a_fake_fp",
            #         # "tma_jssdk_version": "1.25.4.2",
            #         # "rom_version": "coloros__r11-user 5.1.1 nmf26x 500200210 release-keys",
            #         # "plugin_state": "30631999",
            #         # "as": "ab9db50e485e50977f9db5",
            #         # "mas": "ab9db50e485e50977f9db5",
            #
            # }
            # url = "http://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&{2}".format(
            #             #         random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic))
            # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&&iid=104525915827&device_id=70787469432&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format(
            #     random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset))
            # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_all&version_code=7.6.0&app_name=news_article&channel=App Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&iid={4}&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(
            #     random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset), iid)
            url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_all&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}&e=md5&operator=CHINA MOBILE_46007&network=WIFI".format(
                str(releaser_id), str(offset), str(releaser_id))

            # print(url)

            proxies = get_proxy(proxies_num=proxies_num)
            try:
                if proxies:
                    get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10)
                else:
                    get_page = requests.get(url, headers=self.headers, timeout=10)
            except Exception as e:
                print(e)
                continue
            # print(get_page.text)

            # print(get_page.text)
            # time.sleep(0.5)
            # format_json = re.match(r"jsonp\d+", get_page.text)
            page_dic = {}
            try:
                page_dic = get_page.json()
                data_list = page_dic.get('data')
                if page_dic.get("message") != "success":
                    count_false += 1
                    if count_false < 3:
                        continue
                    else:
                        print("unknow error")
                        break
                has_more = page_dic.get('has_more')
                offset = str(page_dic.get("offset"))
            except:
                if not page_dic:
                    count_false += 1
                    if count_false >= 3:
                        break
                    else:
                        continue
                if data_list:
                    data_list = page_dic.get('data')
                    has_more = page_dic.get('has_more')
                else:
                    data_list = []
                    has_more = False
            # offset = page_dic.get('offset')

            if has_more is None:
                has_more = False
            if data_list == []:
                print("no data in releaser %s page %s" % (releaser_id, count))
                # print(page_dic)
                has_more = False
                continue
            else:
                count += 1
                count_false = 0
                for one_video in data_list:
                    # info_str = one_video.get('content')
                    info_dic = json.loads(one_video["content"])
                    if info_dic.get("has_video"):
                        video_dic = copy.deepcopy(self.video_data)
                        video_dic['title'] = info_dic.get('title')
                        video_dic['url'] = info_dic.get('share_url')
                        if video_dic['url'] == "":
                            video_dic['url'] = info_dic.get('url')
                        video_dic['releaser'] = info_dic.get('source')
                        video_dic['releaserUrl'] = releaserUrl
                        release_time = info_dic.get('publish_time')
                        video_dic['release_time'] = int(release_time * 1e3)
                        video_dic['duration'] = info_dic.get('video_duration')
                        video_dic['play_count'] = info_dic.get("video_detail_info").get("video_watch_count")
                        if not video_dic['play_count']:
                            video_dic['play_count'] = info_dic.get("read_count")
                        video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count')
                        video_dic['comment_count'] = info_dic.get('comment_count')
                        video_dic['favorite_count'] = info_dic.get('digg_count')
                        video_dic['video_id_str'] = info_dic.get('item_id')
                        video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                        video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
                        video_dic['video_img'] = self.get_video_image(info_dic)
                        # print(video_dic)
                        for v in video_dic:
                            try:
                                if video_dic[v] is not None:
                                    pass
                                else:
                                    video_dic = self.video_page(video_dic['url'])
                                    if video_dic:
                                        pass
                                    else:
                                        continue
                            except:
                                continue
                        result_list.append(video_dic)
                    elif info_dic.get("abstract") == "":
                        video_url = None
                        video_dic = copy.deepcopy(self.video_data)
                        # print(info_dic)
                        if info_dic.get('article_url'):
                            video_url = info_dic.get('article_url')
                        elif info_dic.get('display_url'):
                            video_url = info_dic.get('display_url')
                        elif info_dic.get('url'):
                            video_url = info_dic.get('url')
                        elif info_dic.get('share_url'):
                            video_url = info_dic.get('share_url')
                        elif info_dic.get("raw_data"):
                            if info_dic.get("raw_data").get("origin_group"):
                                video_url = info_dic.get("raw_data").get("origin_group").get('article_url')
                            elif info_dic.get("raw_data").get("comment_base"):
                                video_url = info_dic.get("raw_data").get("comment_base").get('share').get('share_url')
                            elif info_dic.get("raw_data").get("action"):
                                video_url = "https://m.toutiaoimg.cn/group/%s/" % info_dic.get("raw_data").get(
                                    'group_id')
                                video_dic['video_id'] = info_dic.get("raw_data").get('group_id')
                                video_dic['play_count'] = info_dic.get("raw_data").get("action").get("play_count")
                                video_dic['repost_count'] = info_dic.get("raw_data").get("action").get("share_count")
                                video_dic['comment_count'] = info_dic.get("raw_data").get("action").get('comment_count')
                                video_dic['favorite_count'] = info_dic.get("raw_data").get("action").get('digg_count')
                                video_dic['duration'] = info_dic.get('raw_data').get('video').get("duration")
                                video_dic['title'] = info_dic.get('raw_data').get("title")
                                video_dic['releaser'] = info_dic.get('raw_data').get("user").get("info").get("name")
                                video_dic['releaserUrl'] = releaserUrl
                                video_dic['url'] = video_url
                                video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id
                                video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                                video_dic['video_img'] = "http://p1-tt.bytecdn.cn/large/" + info_dic.get(
                                    'raw_data').get('video').get("origin_cover").get("uri")
                                video_dic['release_time'] = int(info_dic.get("raw_data").get("create_time") * 1e3)
                                video_url = None
                        if video_url:
                            video_page_dic = self.video_page(video_url)
                            if video_page_dic:
                                video_dic.update(video_page_dic)
                        result_list.append(video_dic)
                        if len(result_list) >= 100:
                            # data_count += len(result_list)
                            # print(result_list)
                            output_result(result_Lst=result_list,
                                          platform=self.platform,
                                          output_to_file=output_to_file,
                                          filepath=filepath,
                                          output_to_es_raw=output_to_es_raw,
                                          es_index=es_index,
                                          doc_type=doc_type,
                                          output_to_es_register=output_to_es_register)
                            result_list.clear()
        if result_list != []:
            # data_count += len(result_list)
            # print(result_list)
            # print(data_count)
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type,
                          output_to_es_register=output_to_es_register)

    def retry_url(self, url, headers, cookies):
        retry_count = 0
        proxies = get_proxy(1)
        while retry_count < 10:
            # time.sleep(0.2)
            get_page = requests.get(url, headers=headers, allow_redirects=False, cookies=cookies, proxies=proxies)
            page_dic = get_page.json()
            data_list = page_dic.get('data')
            retry_count += 1
            if data_list:
                return page_dic
        else:
            return False

    def releaser_page_pc(self, releaserUrl,
                         output_to_file=False,
                         filepath=None,
                         releaser_page_num_max=30,
                         output_to_es_raw=False,
                         output_to_es_register=False,
                         push_to_redis=False,
                         es_index=None,
                         doc_type=None,
                         proxy_dic=None, **kwargs):
        page_dic = {}
        result_list = []
        has_more = True
        count = 1
        releaser_id = self.find_releaser_id(releaserUrl)
        # print('releaser_id', releaser_id)
        max_behot_time = 0
        data_count = 0
        # print(as_cp_sign)
        headers = {
            "accept": "application/json, text/javascript",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "content-type": "application/x-www-form-urlencoded",
            "x-requested-with": "XMLHttpRequest",
            "Referer": "https://www.toutiao.com/c/user/%s/" % releaser_id,
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            # "cookie":'cookie: tt_webid=6673330506500982276; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=169c3156bb86b3-00d2e2a0ad50b2-7a1437-161398-169c3156bb9746; tt_webid=6673330506500982276; csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; CNZZDATA1259612802=281397494-1553752275-https%253A%252F%252Fwww.baidu.com%252F%7C1555306390',
            "cache-control": "max-age=0",
        }
        user_page_url = "https://www.toutiao.com/c/user/%s/" % releaser_id
        user_page = requests.get(user_page_url, headers=headers)
        cookies = user_page.cookies
        while has_more and count <= releaser_page_num_max:
            # print(str(releaser_id)+str(max_behot_time))
            # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time)))
            # get_as_cp_sign = requests.get(
            #         "http://127.0.0.1:3000/?id=%s&max_behot_time=%s" % (releaser_id, max_behot_time))
            get_as_cp_sign = as_cp(releaser_id, max_behot_time)
            url_dic = {"page_type": "0",
                       "user_id": releaser_id,
                       "max_behot_time": max_behot_time,
                       "count": "20",
                       "as": get_as_cp_sign[0],
                       "cp": get_as_cp_sign[1],
                       # "_signature": as_cp_sign.get('_signature')
                       }

            url = "https://www.toutiao.com/c/user/article/?%s" % urllib.parse.urlencode(url_dic)
            get_page = requests.get(url, headers=headers, cookies=cookies)
            # time.sleep(0.5)
            # format_json = re.match(r"jsonp\d+", get_page.text)
            page_dic = {}
            try:
                page_dic = get_page.json()
                data_list = page_dic.get('data')
                has_more = page_dic.get('has_more')
                max_behot_time = str(page_dic.get("next")["max_behot_time"])
            except:
                data_list = self.retry_url(url, headers, cookies)
                if data_lis:
                    data_list = page_dic.get('data')
                    has_more = page_dic.get('has_more')
                else:
                    data_list = []
                    has_more = False
            # offset = page_dic.get('offset')

            if has_more is None:
                has_more = False
            if data_list == []:
                print("no data in releaser %s page %s" % (releaser_id, count))
                # print(page_dic)
                print(url)
                count += 1
                continue
            else:
                count += 1
                for one_video in data_list:
                    max_behot_time = str(page_dic.get("next")["max_behot_time"])
                    # info_str = one_video.get('content')
                    info_dic = one_video
                    video_dic = copy.deepcopy(self.video_data)
                    video_dic['title'] = info_dic.get('title')
                    video_dic['url'] = info_dic.get('source_url')
                    video_dic['releaser'] = info_dic.get('source')
                    video_dic['releaserUrl'] = releaserUrl
                    release_time = info_dic.get('behot_time')
                    video_dic['release_time'] = int(release_time * 1e3)
                    video_dic['duration'] = self.t2s(info_dic.get('video_duration_str'))
                    video_dic['play_count'] = info_dic.get('detail_play_effective_count')
                    video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count')
                    # video_dic['comment_count'] = info_dic.get('comment_count')
                    # video_dic['favorite_count'] = info_dic.get('digg_count')
                    # video_dic['video_id'] = info_dic.get('item_id')
                    # video_dic['fetch_time'] = int(time.time() * 1e3)
                    result_list.append(video_dic)
                    if len(result_list) >= 100:
                        data_count += len(result_list)
                        print(result_list)
                        # output_result(result_Lst=result_list,
                        #               platform=self.platform,
                        #               output_to_file=output_to_file,
                        #               filepath=filepath,
                        #               output_to_es_raw=output_to_es_raw,
                        #               es_index=es_index,
                        #               doc_type=doc_type,
                        #               output_to_es_register=output_to_es_register)
                        result_list.clear()
        if result_list != []:
            data_count += len(result_list)
            # print(result_list)
            print(data_count)
            # output_result(result_Lst=result_list,
            #               platform=self.platform,
            #               output_to_file=output_to_file,
            #               filepath=filepath,
            #               output_to_es_raw=output_to_es_raw,
            #               es_index=es_index,
            #               doc_type=doc_type,
            #               output_to_es_register=output_to_es_register)

    # @logged
    def releaser_page_old(self, releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          proxy_dic=None):
        result_list = []
        has_more = True
        offset = 0
        count = 1
        releaser_id = self.find_releaser_id(releaserUrl)
        headers = {'Host': 'is.snssdk.com',
                   'Connection': 'keep-alive',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
                   'Accept-Encoding': 'gzip, deflate, br',
                   'Upgrade-Insecure-Requests': '1',
                   'Cache-Control': 'max-age=0',
                   'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                   # 'Cookie': 'odin_tt=36c67e1135e11981703d797f6246957dcc05964f811721d0c77bcf091fc7484db5698ef9ca767558ea9f1b643ad9e3ac; UM_distinctid=167c9e978c419c-0107b7bccfdb688-143a7540-1fa400-167c9e978c54a2; CNZZDATA1274386066=172017830-1545278343-%7C1545278343'
                   }
        while has_more is True and count <= releaser_page_num_max and offset is not None:
            url_dic = {"category": "profile_video",
                       "visited_uid": releaser_id,
                       "offset": offset}
            url = urllib.parse.urlencode(url_dic)
            url = "https://i.snssdk.com/api/feed/profile/v1/?%s" % urllib.parse.urlencode(url_dic)
            print(url)
            get_page = requests.get(url, headers=headers)
            print("process on releaser %s page %s" % (releaser_id, offset))
            page_dic = get_page.json()
            data_list = page_dic['data']
            has_more = page_dic.get('has_more')
            offset = page_dic.get('offset')
            if has_more is None:
                has_more = False
            if data_list == []:
                print("no data in releaser %s page %s" % (releaser_id, count))
                count += 1
                continue
            else:
                count += 1
                for one_video in data_list:
                    info_str = one_video.get('content')
                    info_dic = json.loads(info_str)
                    video_dic = copy.deepcopy(self.video_data)
                    video_dic['title'] = info_dic.get('title')
                    video_dic['url'] = info_dic.get('display_url')
                    video_dic['releaser'] = info_dic.get('source')
                    video_dic['releaserUrl'] = releaserUrl
                    release_time = info_dic.get('publish_time')
                    video_dic['release_time'] = int(release_time * 1e3)
                    video_dic['duration'] = info_dic.get('video_duration')
                    video_dic['play_count'] = info_dic.get('video_detail_info').get('video_watch_count')
                    video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count')
                    video_dic['comment_count'] = info_dic.get('comment_count')
                    video_dic['favorite_count'] = info_dic.get('digg_count')
                    video_dic['video_id'] = info_dic.get('item_id')
                    video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                    result_list.append(video_dic)
                    if len(result_list) >= 100:
                        output_result(result_Lst=result_list,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type,
                                      output_to_es_register=output_to_es_register)
                        result_list.clear()
        if result_list != []:
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type,
                          output_to_es_register=output_to_es_register)

    def web_releaser_page(self, releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          output_to_es_register=False,
                          push_to_redis=False,
                          es_index=None,
                          doc_type=None,
                          proxy_dic=None):
        page_dic = {}
        result_list = []
        has_more = 1
        count = 1
        releaser_id = self.find_releaser_id(releaserUrl)
        print('releaser_id', releaser_id)

        max_behot_time = ""
        count_callback = 3
        data_count = 0
        media_id = self.get_data_mediaid(releaserUrl, releaser_id)
        if not media_id:
            media_id = releaser_id

        headers = {
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "upgrade-insecure-requests": "1",
            "Referer": "http://m.toutiao.com/profile/%s/" % releaser_id,
            "User-Agent": self.random_useragent(),
        }

        while has_more == 1 and count <= releaser_page_num_max:
            eas, ecp = getHoney()
            url_dic = {"page_type": "0",
                       "uid": releaser_id,
                       "max_behot_time": max_behot_time,
                       "media_id": media_id,
                       "output": "json",
                       "is_json": "1",
                       "count": "20",
                       "from": "user_profile_app",
                       "version": "2",
                       "as": eas,
                       "cp": ecp,
                       "callback": "jsonp%s" % count_callback,
                       }

            url = "https://www.toutiao.com/pgc/ma/?%s" % urllib.parse.urlencode(url_dic)
            get_page = requests.get(url, headers=headers, allow_redirects=False)
            # time.sleep(0.5)
            # format_json = re.match(r"jsonp\d+", get_page.text)
            page_dic = {}
            if get_page.text[0:4] == "json":
                get_page_text = (get_page.text[len(url_dic["callback"]) + 1:-1])
                # print(res)0
            try:
                page_dic = json.loads(r"%s" % get_page_text)
                data_list = page_dic.get('data')
                has_more = page_dic.get('has_more')
                max_behot_time = str(page_dic.get("next").get("max_behot_time"))
            except:
                print("json失败 ", get_page.text)
                data_list = []
                has_more = False
            # offset = page_dic.get('offset')
            if has_more is None:
                has_more = False
            if data_list == []:
                print("no data in releaser %s page %s" % (releaser_id, count))
                print(page_dic)
                print(url)
                count += 1
                continue
            else:
                count += 1
                count_callback += 1
                for one_video in data_list:

                    # info_str = one_video.get('content')
                    info_dic = one_video
                    video_dic = copy.deepcopy(self.video_data)
                    video_dic['title'] = info_dic.get('title')
                    video_dic['url'] = info_dic.get('source_url')
                    print(info_dic.get('source_url'))
                    video_dic['releaser'] = info_dic.get('source')
                    video_dic['releaserUrl'] = releaserUrl
                    release_time = info_dic.get('publish_time')
                    video_dic['release_time'] = int(release_time * 1e3)
                    video_dic['duration'] = self.t2s(info_dic.get('video_duration_str'))
                    video_dic['play_count'] = info_dic.get('detail_play_effective_count')
                    # video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count')
                    video_dic['comment_count'] = info_dic.get('comment_count')
                    video_dic['favorite_count'] = info_dic.get('digg_count')
                    video_dic['video_id'] = info_dic.get('item_id')
                    video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                    result_list.append(video_dic)
                    if len(result_list) >= 100:
                        # data_count += len(result_list)
                        # print(result_list)
                        # data_count += len(result_list)
                        output_result(result_Lst=result_list,
                                      platform=self.platform,
                                      output_to_file=output_to_file,
                                      filepath=filepath,
                                      output_to_es_raw=output_to_es_raw,
                                      es_index=es_index,
                                      doc_type=doc_type,
                                      output_to_es_register=output_to_es_register)
                        result_list.clear()
        if result_list != []:
            # data_count += len(result_list)
            # print(result_list)
            # print(data_count)
            print(result_list)
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          output_to_file=output_to_file,
                          filepath=filepath,
                          output_to_es_raw=output_to_es_raw,
                          es_index=es_index,
                          doc_type=doc_type,
                          output_to_es_register=output_to_es_register)

    # @logged
    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=80,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      es_index=None,
                      doc_type=None,
                      proxies_num=None):
        self.App_releaser_page_video(releaserUrl, output_to_file, filepath, releaser_page_num_max, output_to_es_raw,
                                     output_to_es_register, push_to_redis, es_index, doc_type, proxies_num)
        self.App_releaser_page_all(releaserUrl, output_to_file, filepath, releaser_page_num_max, output_to_es_raw,
                                   output_to_es_register, push_to_redis, es_index, doc_type, proxies_num)

    @staticmethod
    def t2s(t):
        if t:
            if len(t) == 5:
                t = str(t)
                m, s = t.strip().split(":")
                return float(m) * 60 + float(s)
            elif len(t) == 8:
                t = str(t)
                h, m, s = t.strip().split(":")
                return float(h) * 3600 + float(m) * 60 + float(s)
        else:
            return 0

    @staticmethod
    def random_useragent():
        agent_lis = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
            "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
        ]
        return agent_lis[random.randrange(0, len(agent_lis))]


if __name__ == '__main__':
    data_lis = ["https://www.toutiao.com/c/user/85676020889/#mid=1593814181797896"]
    data_lis = ["https://www.toutiao.com/c/user/6113709817/#mid=6113817558",
                "https://www.toutiao.com/c/user/3688888283/#mid=3689528443",
                "https://www.toutiao.com/c/user/4188615746/#mid=4273783271"]
    # data_lis = ["https://www.toutiao.com/c/user/6027579671/#mid=6217730861","http://www.toutiao.com/c/user/61621115551/#mid=1569627833237506"]
    # data_lis = ["https://www.toutiao.com/c/user/57767591798/#mid=1561719930544130"]
    miaopai_list = []
    test = Crawler_toutiao()
    # test.releaser_page(data_lis[0])
    # res = test.video_page("https://www.ixigua.com/i6701478014242259463/")
    # print(res)
    # for u in data_lis:
        #     # # test.releaser_page(u)
        # test.App_releaser_page_video(u, output_to_es_raw=True, es_index='crawler-data-raw',
        #                              doc_type='doc',
        #                              releaser_page_num_max=3, proxies_num=0)

        # test.get_releaser_page(u)
        # test.App_releaser_page_all(u, output_to_es_raw=True, es_index='crawler-data-raw',
        #                            doc_type='doc',
        #                            releaser_page_num_max=3, proxies_num=1))
    # test.releaser_page(u)
    test.search_page("5热玛吉")