# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 20:13:21 2018

@author: fangyucheng
"""
import copy
import re
# import rsa
import time
import json
import urllib
import base64
import binascii
import datetime

import requests

from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id


class Crawler_weibo():

    def __init__(self, timeout=None, platform='weibo'):
        self.platform = platform
        # keep timeout so releaser_page can pass it through to retry_get_url
        self.timeout = timeout
        self.session = requests.Session()
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                        'Connection': 'keep-alive',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2128.59 Safari/537.36'}
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
        for popk in pop_key_Lst:
            self.video_data.pop(popk)
    # def manipulate_login(self, user_name, password):
    #     # append the obtained cookie to the cookie_pool file
    #     cookie_pool = open('cookie_pool',
    #                        'a', encoding='utf-8')
    #
    #     # encode the user name (urlencode, then base64)
    #     user_name_quote = urllib.parse.quote_plus(user_name)
    #     user_name_base64 = base64.b64encode(user_name_quote.encode('utf-8'))
    #     user_name_b64 = user_name_base64.decode('utf-8')
    #
    #     # obtain the four pre-login parameters: servertime, pubkey, rsakv, nonce
    #     current_time = int(time.time() * 1000)
    #     login_url_first_part = 'http://login.sina.com.cn/sso/prelogin.php?'
    #     login_url_dic = {'entry': 'weibo',
    #                      'callback': 'sinaSSOController.preloginCallBack',
    #                      'su': user_name_b64,
    #                      'rsakt': 'mod',
    #                      'checkpin': '1',
    #                      'client': 'ssologin.js(v1.4.18)',
    #                      '_': current_time}
    #     login_url_second_part = urllib.parse.urlencode(login_url_dic)
    #     login_url = login_url_first_part + login_url_second_part
    #     get_page = requests.get(login_url)
    #     get_page.encoding = 'utf-8'
    #     page = get_page.text
    #     page_rep = page.replace('sinaSSOController.preloginCallBack', '')
    #     page_dic = eval(page_rep)
    #     pubkey = page_dic['pubkey']
    #     servertime = page_dic['servertime']
    #     rsakv = page_dic['rsakv']
    #     nonce = page_dic['nonce']

        # build the RSA-encrypted password
        # rsa_pubkey = int(pubkey, 16)
        # key = rsa.PublicKey(rsa_pubkey, 65537)
        # message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
        # message = message.encode("utf-8")
        # password_rsa = rsa.encrypt(message, key)
        # password_bi = binascii.b2a_hex(password_rsa)

        # log in via POST and obtain the cookie
        # post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
        # post_data_dic = {'encoding': 'UTF-8',
        #                  'entry': 'weibo',
        #                  'from': '',
        #                  'gateway': '1',
        #                  'nonce': nonce,
        #                  'pagerefer': "",
        #                  'prelt': 67,
        #                  'pwencode': 'rsa2',
        #                  "returntype": "META",
        #                  'rsakv': rsakv,
        #                  'savestate': '7',
        #                  'servertime': servertime,
        #                  'service': 'miniblog',
        #                  'sp': password_bi,
        #                  'sr': '1920*1080',
        #                  'su': user_name_b64,
        #                  'useticket': '1',
        #                  'vsnf': '1',
        #                  'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&display=0&'}
        #
        # logining_page = self.session.post(post_url, data=post_data_dic, headers=self.headers)
        # login_loop = logining_page.content.decode("GBK")
        #
        # if '正在登录' in login_loop or 'Signing in' in login_loop:
        #     cookie = logining_page.cookies.get_dict()
        #     print(cookie,type(cookie))
        #     current_time = int(time.time() * 1000)
        #     cookie_dic = {'cookie': cookie,
        #                   'current_time': current_time}
        #     cookie_json = json.dump(cookie_dic,cookie_pool)
        #     print('got cookie in login process')
        #     return cookie
        # else:
        #     print('post failed, suggest to login again')


    def test_cookie(self, test_url=None,
                    cookie=None,
                    user_name=None,
                    password=None):
        if test_url is None:
            test_url = 'https://weibo.com/1188203673/GuV3o9VYt'
        get_page = requests.get(test_url, cookies=cookie)
        page = get_page.text
        length = len(page)
        if length > 20000:
            print("due to the page's length is %s, cookie is useful" % length)
            return cookie
        else:
            print("invalid cookie at the page length %s" % length)
            return None

    def get_weibo_info_from_search_page(self, retrieve_soup, cookie):
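        """Parse one feed item from a search-result page (a BeautifulSoup node) into a
        dict of ids, author info, content, release time and interaction counters;
        posts truncated with '展开全文' are expanded via get_longtext."""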
        try:
            weibo_id = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['mid']
            user_id_str = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['tbinfo']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            try:
                weibo_id_str = retrieve_soup.find('a', {'action-type': 'fl_menu'})['action-data']
                weibo_id = re.findall(r'\d+', weibo_id_str)[0]
                user_id_str = retrieve_soup.find('a', {'class': 'name_txt'})['usercard']
                user_id = re.findall(r'\d+', ' '.join(re.findall(r'id=\d+', user_id_str)))[0]
            except:
                weibo_id = None
                user_id = None
                print('id_error')
        try:
            user_url = retrieve_soup.find('div', {'class': 'face'}).a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'face'}).a['title']
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('p', {'class': 'comment_txt'}).text
            weibo_content = weibo_content.strip('\n').strip()
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # super-topic posts are not detected, emoji are not extracted, images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'W_textb'})['date']
        except:
            try:
                release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
            except:
                release_time = None
                print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'class': 'W_textb'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        try:
            repost_count = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).em.text
        except:
            repost_count = 0
        try:
            comment_count = retrieve_soup.find('a', {'action-type': 'feed_list_comment'}).em.text
        except:
            comment_count = 0
        try:
            favorite_count = retrieve_soup.find('a', {'action-type': 'feed_list_like'}).em.text
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_id': user_id,
                      'user_url': user_url,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'come_from': come_from,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info

    def get_repost_info(self, retrieve_soup):
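        """Parse one repost item (a BeautifulSoup node) into a dict; parent_weibo is
        the @nickname of the original author extracted from content after '//'."""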
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_id_str = retrieve_soup.div.a['usercard']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            user_id = None
            print('user_id error')
        try:
            user_url = retrieve_soup.div.a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'WB_text'}).a.text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('span', {'node-type': 'text'}).text
            weibo_content = weibo_content.strip('\n').strip()
        except:
            weibo_content = None
            print('weibo_content error')
        if weibo_content is not None and '//' in weibo_content:
            parent_lst = weibo_content.split('//')
            try:
                pattern = '@.*:'
                parent_weibo = re.findall(pattern, parent_lst[1])[0].replace(':', '').replace('@', '')
            except:
                parent_weibo = None
                print('parent_weibo error')
        else:
            parent_weibo = None
        # super-topic posts are not detected, emoji are not extracted, images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            repost_count_str = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).text
            repost_count_lst = re.findall(r'\d+', repost_count_str)
            if repost_count_lst != []:
                repost_count = repost_count_lst[0]
            else:
                repost_count = 0
        except:
            repost_count = 0
        try:
            favorite_count_str = retrieve_soup.find('span', {'node-type': 'like_status'}).text
            favorite_count_str = favorite_count_str.replace('ñ', '')
            try:
                favorite_count_lst = re.findall(r'\d+', favorite_count_str)
                if favorite_count_lst != []:
                    favorite_count = favorite_count_lst[0]
                else:
                    favorite_count = 0
            except:
                favorite_count = 0
                print('favorite_count is zero')
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        repost_info = {'weibo_id': weibo_id,
                       'user_id': user_id,
                       'user_url': user_url,
                       'user_nickname': user_nickname,
                       'weibo_content': weibo_content,
                       'parent_weibo': parent_weibo,
                       'release_time': release_time,
                       'weibo_url': weibo_url,
                       'repost_count': repost_count,
                       'favorite_count': favorite_count,
                       'fetch_time': fetch_time}
        return repost_info

    def get_user_weibo_info(self, retrieve_soup, cookie):
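        """Parse one weibo from a user's profile feed (a BeautifulSoup node) into a
        dict; posts truncated with '展开全文' are expanded via get_longtext."""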
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_nickname = retrieve_soup.find('a', {'class': 'W_f14'}).text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('div', {'class': 'WB_text'}).text
            weibo_content = weibo_content.strip('\n').strip()
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # super-topic posts are not detected, emoji are not extracted, images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'S_txt2'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            # the href on the profile page is relative, so prepend the weibo.com domain
            weibo_url = 'https://weibo.com' + retrieve_soup.find('a', {'class': 'S_txt2'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        try:
            repost_count_lst = retrieve_soup.find('span', {'node-type': 'forward_btn_text'}).find_all('em')
            for line in repost_count_lst:
                try:
                    repost_count = int(line.text)
                except:
                    repost_count = 0
        except:
            repost_count = 0
        try:
            comment_count_lst = retrieve_soup.find('span', {'node-type': 'comment_btn_text'}).find_all('em')
            for line in comment_count_lst:
                try:
                    comment_count = int(line.text)
                except:
                    comment_count = 0
        except:
            comment_count = 0
        try:
            favorite_count_lst = retrieve_soup.find('span', {'node-type': 'like_status'}).find_all('em')
            for line in favorite_count_lst:
                try:
                    favorite_count = int(line.text)
                except:
                    favorite_count = 0
        except:
            favorite_count = 0
            print('favorite_count is zero')
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'come_from': come_from,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info

    def get_longtext(self, weibo_id, cookie):
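        """Fetch the full text of a truncated weibo through the getlongtext AJAX
        endpoint; returns an empty string when the request or parsing fails."""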
        current_time = int(time.time() * 1000)
        longtext_url = ('https://weibo.com/p/aj/mblog/getlongtext?ajwvr=6&mid='
                        + weibo_id + '&is_settop&is_sethot&is_setfanstop&'
                                     'is_setyoudao&__rnd=' + str(current_time))
        get_page = requests.get(longtext_url, headers=self.headers, cookies=cookie)
        try:
            page_dic = get_page.json()
            wait_for_soup = page_dic['data']['html']
            soup = BeautifulSoup(wait_for_soup, 'html.parser')
            longtext = soup.text
            return longtext
        except:
            print("can't get longtext")
            return ''

    def get_single_article_page(self, article_id, keyword, proxies=0):
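        """Fetch one article detail from card.weibo.com and map it into a dict
        (url, title, content, play_count, releaser fields, img_list, mid)."""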
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            # "Cookie": "SINAGLOBAL=565010119549.1364.1559571258394; login_sid_t=85753e367d54782a25518436f329cfa0; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5712976583220.359.1595386386561; ULV=1595386386575:2:1:1:5712976583220.359.1595386386561:1592884354178; UOR=,,login.sina.com.cn; SSOLoginState=1595829153; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ46TE-isMWEvFjmXZnGFZ5JpX5KMhUgL.Fo2Re0zpShqfSoe2dJLoI7_e9gfadcvadcvad7tt; ALF=1627695088; SCF=AlrGNPCzM_VX3PzgxftYKkUv6Gj7FjmOVVbH8EpsTADeRxEeW-7_ipW8LVV7sGN-t7JJA-VwFKC2Ot0ZkHwHstE.; SUB=_2A25yJwQhDeRhGedG6FAQ9CjJzT-IHXVRVXLprDV8PUNbmtAKLRPmkW9NUVHbR2NjdmB2ZEtnFBK75m3CwwTzeqTJ; SUHB=08J6qQipU2qH8A; CARD-MAIN=cfec82595a1164dea323b2fb276c823f",
            "Host": "card.weibo.com",
            "Referer": "https://card.weibo.com/article/m/show/id/{0}?_wb_client_=1&open_source=weibo_search&luicode=10000011&lfid=100103type%3D21%26q%3D{1}%26t%3D0".format(article_id,urllib.parse.quote(keyword)),
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",

        }
        url = "https://card.weibo.com/article/m/aj/detail?id={0}&_t={1}".format(article_id, int(datetime.datetime.now().timestamp() * 1e3))
        try:
            requests_res = retry_get_url(url,headers=headers,proxies=proxies)
            res_json = requests_res.json()
            # print(res_json)
            data = res_json["data"]
            video_dic = {}
            video_dic["url"] = data["target_url"]
            video_dic["title"] = data["title"]
            video_dic["fetch_time"] = int(datetime.datetime.now().timestamp()*1e3)
            video_dic["release_time"] = trans_strtime_to_timestamp(data["create_at"])
            video_dic["play_count"] = trans_play_count(data["read_count"])
            video_dic["content"] = data["content"]
            video_dic["releaser"] = data["userinfo"].get('screen_name')
            video_dic["releaser_id"] = str(data["userinfo"].get('id'))
            video_dic["releaserUrl"] = data["userinfo"].get('url')
            video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
            video_dic["img_list"] = re.findall('img src="(.*?)"',data["content"])
            video_dic["mid"] = article_id
            return video_dic
        except Exception as e:
            print("single data row formate error %s" % e)


    def search_article_page(self, keyword, search_pages_max=12,
                        output_to_es_raw=False,
                        output_to_es_register=False,
                        es_index=None,
                        doc_type=None, proxies_num=0):
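        """Search m.weibo.cn for articles matching `keyword`, enrich each hit via
        get_single_article_page, and flush results through output_result in
        batches of 100; returns the remaining (unflushed) list."""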
        count_false = 0
        headers_search = {
            "Accept": "application/json, text/plain, */*",
            "MWeibo-Pwa": "1",
            "Referer": "https://m.weibo.cn/search?containerid=100103type=1&q={0}".format(urllib.parse.quote(keyword)),
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "X-Requested-With": "XMLHttpRequest",
            "X-XSRF-TOKEN": "65d497"
                }
        urls = []
        for page_num in range(0, search_pages_max):
            url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D21%26q%3D{0}%26t%3D0&page_type=searchall&page={1}'.format(urllib.parse.quote(keyword),page_num + 1)
            urls.append(url)
        weibo_Lst = []
        for search_page_url in urls:
            get_page = retry_get_url(search_page_url, headers=headers_search)
            if get_page.status_code != 200:
                # retry once
                get_page = requests.get(search_page_url)
                if get_page.status_code != 200:
                    continue
            page_dict = get_page.json()
            while page_dict['data'].get("msg") == '这里还没有内容':
                get_page = retry_get_url(search_page_url, headers=headers_search)
                page_dict = get_page.json()
                count_false += 1
                if count_false >= 3:
                    # give up on this page after three empty responses
                    break

            cards = page_dict['data'].get("cards")
            if cards and cards[0].get("card_group"):
                for one_line in cards[0].get("card_group"):
                    try:

                        title = one_line['title_sub']
                        # abstract = one_line['abstract']
                        # url = one_line['article_url']
                        # play_count = one_line['read_count']
                        # comment_count = one_line['comment_count']
                        # favorite_count = one_line['digg_count']
                        article_id = re.findall(r"(\d+)", one_line['scheme'])[0]
                        # releaser = one_line['media_name']
                        # uid = one_line['user_id']
                        # releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
                        # release_time = one_line['publish_time']
                        # release_time = int(int(release_time) * 1e3)
                        fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
                        # releaser_id = self.get_releaser_id(releaserUrl)
                        D0 = copy.deepcopy(self.video_data)
                        D0['title'] = title
                        # D0['abstract'] = abstract
                        # D0['url'] = url
                        # D0['play_count'] = play_count
                        # D0['comment_count'] = comment_count
                        # D0['favorite_count'] = favorite_count
                        D0['mid'] = article_id
                        # D0['releaser'] = releaser
                        # D0['releaserUrl'] = releaserUrl
                        # D0['release_time'] = release_time
                        # D0['releaser_id_str'] = "toutiao_%s" % releaser_id
                        D0['fetch_time'] = fetch_time
                        D0['search_word'] = keyword
                        D0["type"] = "article"
                        try:
                            article_info = self.get_single_article_page(article_id,keyword, proxies=proxies_num)
                            D0.update(article_info)
                        except Exception as e:
                            print("method get_web_article_info error %s" % e)
                            continue
                        # print(D0)
                        weibo_Lst.append(D0)
                    except KeyError:
                        # It's fine to drop this item: the search API sometimes
                        # returns entries that are only loosely related to the query.
                        continue
            else:
                break
            if len(weibo_Lst) >= 100:
                output_result(result_Lst=weibo_Lst,
                              platform=self.platform,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              es_index=es_index,
                              doc_type=doc_type)
                weibo_Lst.clear()

        if weibo_Lst != []:
            output_result(result_Lst=weibo_Lst,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                          doc_type=doc_type)

        return weibo_Lst

    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def search_page(self, keyword, search_pages_max=30,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    es_index=None,
                    doc_type=None, proxies_num=0):
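        """Thin wrapper that delegates keyword search to search_article_page."""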
        self.search_article_page(keyword, search_pages_max=search_pages_max,
                                 output_to_es_raw=output_to_es_raw,
                                 output_to_es_register=output_to_es_register,
                                 es_index=es_index,
                                 doc_type=doc_type, proxies_num=proxies_num)

    def repost_page(self, weibo_id, user_name, password):
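        """Crawl the repost list of one weibo, page by page, returning a list of
        repost dicts. NOTE: this relies on manipulate_login, which is currently
        commented out above, so a login cookie must be obtained some other way."""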
        total_page = 0
        result_lst = []
        cookie = self.manipulate_login(user_name=user_name,
                                       password=password)
        # cookie = self.test_cookie(get_cookie)
        if cookie is not None:
            current_time = int(time.time() * 1000)
            repost_url = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id + '&max_id=0&page=1&__rnd=' + str(
                current_time)
            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
            get_page.encoding = 'utf-8'
            try:
                page_dic = get_page.json()
                total_page = page_dic['data']['page']['totalpage']
                repost_info = page_dic['data']['html']
                repost_soup = BeautifulSoup(repost_info, 'html.parser')
                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                for line in repost_agg:
                    try:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost')
                    except:
                        print('one repost data error')
                print(one_repost)
            except:
                print("can't get repost data")
        time.sleep(6)
        if cookie is not None and total_page != 0:
            for page_num in range(1, total_page + 1):
                current_time = int(time.time() * 1000)
                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id +
                              '&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
                time.sleep(3)
                get_page.encoding = 'utf-8'
                try:
                    page_dic = get_page.json()
                    total_page = page_dic['data']['page']['totalpage']
                    repost_info = page_dic['data']['html']
                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                    for line in repost_agg:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost at %s' % page_num)
                    print(one_repost)
                except:
                    print("can't get repost data")
        if result_lst != []:
            return result_lst
        else:
            print("can't get repost data")
            return None

    def user_page(self, user_id, user_name, password):
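        """Crawl the first pages of a user's profile feed, reading a login cookie
        from the local cookie_pool file; returns a list of weibo dicts, or None
        if nothing could be collected."""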
        result_lst = []
        cookie = None
        # read the most recent cookie from the local cookie_pool file
        with open('cookie_pool', 'r', encoding='utf-8') as cookie_pool:
            for coo in cookie_pool:
                print(coo)
                cookie = json.loads(coo)
        #cookie = self.manipulate_login(user_name=user_name,password=password)
        #cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
        if cookie is not None:
            for page_num in range(1, 3):
                first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
                                                                '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
                get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
                get_page.encoding = 'utf-8'
                page = get_page.text
                soup = BeautifulSoup(page, 'html.parser')
                sfa = soup.find_all('script')
                find_content = ''
                for line in sfa:
                    if 'Pl_Official_MyProfileFeed__' in str(line):
                        find_content = str(line)
                find_content = find_content.replace('<script>FM.view(', '').replace(')</script>', '')
                # print(find_content)
                find_content_dic = json.loads(find_content)
                content_for_soup = find_content_dic['html']
                soup_content = BeautifulSoup(content_for_soup, 'html.parser')
                weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
                # time.sleep(15)
                for line_count,line in enumerate(weibo_lst):
                    weibo_info = self.get_user_weibo_info(line, cookie)
                    weibo_info['user_id'] = user_id
                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
                    result_lst.append(weibo_info)
                    print('get data at element page:%s pagebar:%s' % (page_num,line_count))
                get_parameter = soup.find_all('script', {'type': 'text/javascript'})
                parameter_str = ''
                for line in get_parameter:
                    if 'pid' in str(line) and 'oid' in str(line):
                        parameter_str = str(line)
                parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
                domain = re.findall(r'\d+', ''.join(re.findall(r"pid]=\d+", parameter_str)))[0]
                special_id = re.findall(r'\d+', ''.join(re.findall(r"page_id]=\d+", parameter_str)))[0]
                current_time = int(time.time() * 1000)
                for pagebar in [0, 1]:
                    user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain='
                                + domain + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar) +
                                '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id +
                                '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num) + '&pre_page=1'
                                                                                                   '&domain_op=' + domain + '&__rnd=' + str(
                                current_time))
                    get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
                    get_page.encoding = 'utf-8'
                    try:
                        page_dic = get_page.json()
                        user_weibo_str = page_dic['data']
                        user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
                        user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
                        # time.sleep(15)
                        for line in user_weibo_agg:
                            try:
                                weibo_info = self.get_user_weibo_info(line, cookie)
                                weibo_info['user_id'] = user_id
                                weibo_info['user_url'] = 'https://weibo.com/' + user_id
                                result_lst.append(weibo_info)
                                print('get data at ajax page page_num:%s pagebar:%s'
                                      % (page_num, pagebar))
                            except:
                                print('one weibo_info error')
                    except:
                        print('page error at page_num:%s pagebar:%s' % (page_num, pagebar))
        if result_lst != []:
            return result_lst
        else:
            print("can't get repost data")
            return None

    @staticmethod
    def get_single_page(mid):
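        """Fetch one weibo detail page on m.weibo.cn, parse its render_data JSON,
        and return the dehtml'ed text plus repost, comment and favorite counts."""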
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=68345544646; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=fd1a69; M_WEIBOCN_PARAMS=oid%3D4523948446845543%26luicode%3D20000061%26lfid%3D4528703037509890%26uicode%3D20000061%26fid%3D4523948446845543",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        page_json = json.loads(page_json_context)
        text = dehtml(page_json[0]["status"]["text"])
        repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
        comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
        favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

    @staticmethod
    def get_img(data):
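        """Collect image URLs from a weibo's "pics" field, preferring the large size."""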
        img_list = []
        if data.get("pics"):
            for one in data.get("pics"):
                try:
                    img_list.append(one["large"]["url"])
                except Exception as e:
                    img_list.append(one["url"])
                    print("add img error %s" % e)
        return img_list

    def releaser_page(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None, proxies_num=None):
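        """Iterate a releaser's feed through the m.weibo.cn container API and yield
        one normalized dict per weibo; long posts are expanded via get_single_page."""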
        print('Processing releaserUrl %s' % releaserUrl)
        result_Lst = []
        releaser_id = self.get_releaser_id(releaserUrl)
        # xsrf_token,url_extr = self.get_weibo_info(releaser_id)
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=30976479190; XSRF-TOKEN=9e4bb8; WEIBOCN_FROM=1110006030; MLOGIN=0; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011",
            "mweibo-pwa": "1",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4",
            # "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
            # "x-xsrf-token": xsrf_token,
        }
        pagenum = 0
        has_more = True
        since_id = 0
        if releaser_id:
            while pagenum <= releaser_page_num_max and has_more:
                pagenum += 1
                time.sleep(0.5)
                "?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429"
                url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(
                    releaser_id, releaser_id, releaser_id, since_id)
                headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id)
                print('Page number: %d' % pagenum)
                try:
                    if proxies_num:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num)
                    else:
                        get_page = retry_get_url(url, headers=headers, timeout=self.timeout)
                except:
                    get_page = None
                    has_more = False
                if get_page and get_page.status_code == 200:
                    try:
                        page_json = get_page.json()
                        total = page_json["data"]["cardlistInfo"]["total"]
                        if pagenum > total:
                            break
                        since_id = page_json["data"]["cardlistInfo"]["since_id"]
                        page_dic = page_json["data"].get("cards")
                    except Exception as e:
                        print("load data error %s" % e)
                        continue

                    if page_dic:
                        for one in page_dic:
                            try:
                                mblog = one.get("mblog")
                                mid = mblog.get("mid")
                                forward_text = ""
                                forward_user = ""

                                if one.get("source") == "绿洲":
                                    text_type = "绿洲"
                                elif mblog.get("retweeted_status"):
                                    text_type = "转发"
                                    forward_text = mblog.get("retweeted_status").get("raw_text")
                                    forward_user = mblog.get("retweeted_status").get("user").get("screen_name")
                                else:
                                    text_type = one.get("source")
                                if mblog.get("isLongText"):
                                    text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]
                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
                                    "url": one["scheme"],
                                    "releaser": mblog["user"]["screen_name"],
                                    "repost_count": trans_play_count(mblog["reposts_count"]),
                                    "comment_count": trans_play_count(mblog["comments_count"]),
                                    "favorite_count": trans_play_count(mblog["attitudes_count"]),
                                    "title": text.replace("\u200b", ""),
                                    "wb_type": text_type,
                                    "forward_user": forward_user,
                                    "forward_text": forward_text,
                                    "mid": mid,
                                    "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
                                    "releaser_id_str": "weibo_%s" % releaser_id,
                                    "img_list": self.get_img(mblog),
                                    "platform": "weibo",
                                    # "doc_id":doc_id
                                }
                                res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,
                                                               doc_id_type="all-time-url")
                                yield res_dic
                            except Exception as e:
                                print(json.dumps(mblog))
                                print("row formate error %s" % e)
                                continue

    def get_releaser_follower_num(self, releaserUrl):
        pass

    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
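        """Yield releaser_page results whose release_time lies between start_time and
        end_time, stopping once more than `allow` posts older than start_time have
        been seen (posts within that allowance are still yielded)."""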
        count_false = 0
        for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            # print(res)
            if video_time:
                if start_time < video_time:
                    if video_time < end_time:
                        yield res
                else:
                    count_false += 1
                    if count_false > allow:
                        break
                    else:
                        yield res


if __name__ == '__main__':
    weibo = Crawler_weibo()
    # user_name = '7255925880'
    # password = 'Lemo1995'
    #    keyword = '罗奕佳'
    #    user_id = 'jianchuan'
    #    weibo_id = '4273575663592672'
    # user_id = '1788283193'
    #    test_search2 = weibo.search_page(keyword, user_name, password)
    #    test_repost = weibo.repost_page(weibo_id, user_name, password)
    # user_page = weibo.user_page(user_id, user_name, password)
    weibo.search_page("迪丽热巴", output_to_es_register=True, es_index="crawler-data-raw", search_pages_max=1)
    # print(user_page)
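    # Illustrative sketch (kept commented out): releaser_page is a generator, so its
    # results can be consumed directly; the profile URL below uses the sample uid
    # 1669879400 that already appears in the commented headers above.
    # for one_weibo in weibo.releaser_page("https://www.weibo.com/u/1669879400",
    #                                      releaser_page_num_max=1, proxies_num=0):
    #     print(one_weibo["title"], one_weibo["release_time"])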