# -*- coding:UTF-8 -*-
# @Time  : 2020/7/24 16:07
# @File  : crawler_douban.py
# @email : litao@igengmei.com
# @author : litao


import os
import copy
import requests
import re
import datetime ,time
import json
import urllib
import random
# from bs4 import BeautifulSoup
# from multiprocessing import Pool
# from multiprocessing import Process
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
# from crawler.crawler_sys.utils import connect_with_redis
# from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils.util_logging import logged
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# from crawler.crawler_sys.utils.html_to_str import dehtml
# from bs4 import BeautifulSoup
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id


class CrawlerDouban():
    """Crawler for the topics of a Douban group, using the mobile ("frodo") API.

    The listing endpoint is hard-coded to group ``248952``; the
    ``releaserUrl`` handed to the crawling methods only selects the sort
    order (see :meth:`gooseneck`).
    """

    # A double-quoted http(s) URL that ends in ".jpg" or ".webp".  ``[^"]``
    # keeps a match inside a single quoted attribute value.
    _IMG_URL_RE = re.compile(r'"(http[^"]*?\.(?:jpg|webp))"')
    # Number of topics requested per listing page (the "count" query field).
    _PAGE_SIZE = 20
    # How many times get_single_page tries to fetch one topic.
    _DETAIL_ATTEMPTS = 6
    # Consecutive listing-page failures tolerated before gooseneck gives up.
    _MAX_PAGE_FAILURES = 5

    def __init__(self, timeout=None, platform='douban'):
        """Set up request defaults.

        Args:
            timeout: request timeout passed to ``retry_get_url``; defaults
                to 10 when ``None``.
            platform: platform label stored in ``video_data`` and used by
                :meth:`get_releaser_id`.
        """
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        for popk in ('describe', 'repost_count', 'isOriginal', 'video_id'):
            self.video_data.pop(popk, None)
        # Pre-computed request signatures; one is picked at random per request.
        self.sig_list = [
            "aOI2VYvkFvPfUngaeoz%2BNYQ7MQM%3D",
            "Glc52sbPO46I%2FR%2FOCjl%2BGwKo94I%3D",
            "l9oVu%2FYau2UwMyhc5m8ldALp5eU%3D",
            "tL36trbi73v7Y057K10%2FQ9fdCiA%3D"
        ]
        self.headers = {
            "User-Agent": "api-client/1 com.douban.frodo/10.39.0(189) Android/23 product/cancro vendor/Netease model/Miui rom/android  network/wifi  platform/AndroidPad",
            "Host": "frodo.douban.com",
            "Connection": "Keep-Alive",
            "Accept-Encoding": "gzip",
        }

    @staticmethod
    def _extract_image_urls(content):
        """Return every double-quoted http(s) URL in *content* ending in .jpg/.webp."""
        return CrawlerDouban._IMG_URL_RE.findall(content)

    def get_single_page(self,mid,proxies):
        """Fetch the detail record of one topic.

        Args:
            mid: topic id as found in the listing response.
            proxies: forwarded unchanged to ``retry_get_url``.

        Returns:
            A dict with ``content``, ``repost_count``, ``comment_count``,
            ``favorite_count``, ``collection_count`` and ``img_list``, or
            ``None`` when every attempt failed.
        """
        for _ in range(self._DETAIL_ATTEMPTS):
            try:
                url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(mid,random.randint(10000,99999),random.choice(self.sig_list))
                page_res = retry_get_url(url, headers=self.headers, timeout=self.timeout, proxies=proxies)
                page_json = page_res.json()
                # A "localized_message" field is treated as a failed response
                # and the request is retried with a fresh udid/signature.
                if page_json.get('localized_message'):
                    continue

                content = page_json["content"]
                return {
                    "content": content,
                    "repost_count": trans_play_count(page_json["reshares_count"]),
                    "comment_count": trans_play_count(page_json["comments_count"]),
                    "favorite_count": trans_play_count(page_json["like_count"]),
                    "collection_count": trans_play_count(page_json["collections_count"]),
                    "img_list": self._extract_image_urls(content),
                }
            except Exception as e:
                # Best effort: any request/parse problem just burns one attempt.
                print("single page error %s"% e)
        print("single page error")
        return None

    def get_releaser_id(self, releaserUrl):
        """Resolve *releaserUrl* to a releaser id via the shared project helper."""
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def _parse_topic(self, one):
        """Build the listing-level record (including ``doc_id``) for one topic dict."""
        releaser_id = one["author"]["id"]
        mid = one["id"]
        res_dic = {
            "release_time": trans_strtime_to_timestamp(one["create_time"]),
            # fetch time in milliseconds
            "fetch_time": int(datetime.datetime.now().timestamp()*1e3),
            "url": one["url"],
            "releaser": one["author"]["name"],
            "repost_count": None,
            "comment_count": trans_play_count(one["comments_count"]),
            "favorite_count": None,
            "title": one["title"],
            "releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
            "releaser_id_str": "douban_%s" % releaser_id,
            'video_img':one["cover_url"],
            "mid":mid,
            "platform":"douban",
            "article_type": "article"
        }
        res_dic["doc_id"] = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
                                       doc_id_type="all-time-url")
        return res_dic

    def gooseneck(self,releaserUrl,output_to_file=False, filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,proxies_num=None):
        """Page through the group's topic list and yield one record per topic.

        Args:
            releaserUrl: only inspected for the substrings ``"hot_tag"`` /
                ``"new_tag"``, which select ``sortby=hot`` / ``sortby=new``
                (default ``new``).
            output_to_file, filepath, output_to_es_raw,
            output_to_es_register, push_to_redis, es_index: accepted for
                interface compatibility; not used by this method.
            releaser_page_num_max: highest page index to request.
            proxies_num: forwarded to ``retry_get_url`` as ``proxies`` when
                truthy, and to :meth:`get_single_page`.

        Yields:
            dict: the listing fields of a topic merged with the detail
            fields returned by :meth:`get_single_page`.  Topics whose
            listing entry cannot be parsed or whose detail fetch fails are
            skipped.

        The crawl stops when a page has no topics, when the page counter
        exceeds the response's ``total``, when ``releaser_page_num_max`` is
        passed, or after ``_MAX_PAGE_FAILURES`` consecutive failed attempts
        at the same page (previously such a page was retried forever).
        """
        # Imported here so the method does not depend on another module
        # having loaded the ``urllib.parse`` submodule as a side effect.
        from urllib.parse import urlencode

        # NOTE(review): "webview_ua" and "_sig" are already percent-encoded
        # and urlencode() will encode their "%" again - confirm the API
        # expects that before changing it.
        url_dic = {
            "start": None,
            "count": "20",
            "sortby": "new",
            "apple": "389276ed556d40cada2e208482b51cd7",
            "icecream": "ffd8f7d71419a98e48819cbac587ebbd",
            "mooncake": "0f607264fc6318a92b9e13c65db7cd3c",
            "webview_ua": "Mozilla%2F5.0%20%28Linux%3B%20Android%2010.0.1%3B%20Miui%20Build%2FV417IR%3B%20wv%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Version%2F4.0%20Chrome%2F52.0.2743.100%20Mobile%20Safari%2F537.36",
            "screen_width": "810",
            "screen_height": "1440",
            "sugar": "0",
            "longitude": "0",
            "latitude": "0",
            "os_rom": "android",
            "apikey": "0dad551ec0f84ed02907ff5c42e8ec70",
            "channel": "Baidu_Market",
            "udid": "dc{0}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(random.randint(10000,99999)),
            "_sig": random.choice(self.sig_list),
            "_ts": None,

        }
        # The sort order depends only on the URL, so decide it once.
        if "hot_tag" in releaserUrl:
            url_dic["sortby"] = "hot"
        elif "new_tag" in releaserUrl:
            url_dic["sortby"] = "new"

        request_kwargs = {"headers": self.headers, "timeout": self.timeout}
        if proxies_num:
            request_kwargs["proxies"] = proxies_num

        page = 0
        failures = 0  # consecutive failed attempts at the current page
        while page <= releaser_page_num_max:
            if failures > self._MAX_PAGE_FAILURES:
                print("list page %s failed %s times in a row, stop crawling" % (page, failures))
                break
            url_dic["_ts"] = int(datetime.datetime.now().timestamp())
            url_dic["start"] = str(page * self._PAGE_SIZE)
            url = "https://frodo.douban.com/api/v2/group/248952/topics?%s" % urlencode(url_dic)
            try:
                get_page = retry_get_url(url, **request_kwargs)
            except Exception as e:
                failures += 1
                print("list page request error %s" % e)
                continue
            if not get_page or get_page.status_code != 200:
                failures += 1
                print("list page bad response %s" % getattr(get_page, "status_code", None))
                continue
            try:
                page_json = get_page.json()
                total = page_json["total"]
                page_dic = page_json["topics"]
                # NOTE(review): this compares a page counter with "total",
                # which looks like an item count - kept as in the original;
                # confirm against the API.
                past_end = page + 1 > total
            except Exception as e:
                failures += 1
                print("load data error %s" % e)
                continue

            page += 1
            failures = 0
            # An empty page means there is nothing further to read.
            if past_end or not page_dic:
                break

            for one in page_dic:
                try:
                    res_dic = self._parse_topic(one)
                except Exception as e:
                    print("single data parse error %s " %e)
                    continue
                detail = self.get_single_page(res_dic["mid"], proxies_num)
                if detail is None:
                    # Detail fetch exhausted its retries; skip this topic.
                    print("single data skipped, no detail page for %s" % res_dic["mid"])
                    continue
                res_dic.update(detail)
                yield res_dic

    # @logged
    def releaser_page(self, releaserUrl,
                      **kwargs):
        """Return the :meth:`gooseneck` generator for *releaserUrl*."""
        return self.gooseneck(releaserUrl,**kwargs)


    def get_releaser_follower_num(self, releaserUrl):
        """Not implemented for this platform; always returns ``None``."""
        pass

    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
        """Yield crawled records filtered by release time.

        Args:
            start_time: exclusive lower bound, compared directly against a
                record's ``release_time``.
            end_time: exclusive upper bound, same comparison.
            url: passed to :meth:`releaser_page`.
            allow: number of records at or before ``start_time`` tolerated
                (and still yielded) since the last in-window record before
                the crawl stops.
            **kwargs: ``proxies_num`` and ``releaser_page_num_max`` are
                forwarded to :meth:`releaser_page`; other keys are ignored.

        Records with a falsy ``release_time`` and records at or after
        ``end_time`` are skipped without affecting the counter.
        """
        page_kwargs = {"proxies_num": kwargs.get("proxies_num")}
        # Honour the caller's page limit instead of silently dropping it.
        if "releaser_page_num_max" in kwargs:
            page_kwargs["releaser_page_num_max"] = kwargs["releaser_page_num_max"]

        count_false = 0
        for res in self.releaser_page(url, **page_kwargs):
            video_time = res["release_time"]
            if not video_time:
                continue
            if start_time < video_time:
                if video_time < end_time:
                    count_false = 0
                    yield res
                continue
            # Record is not newer than start_time: tolerate up to `allow`
            # of them before concluding the window has been passed.
            count_false += 1
            if count_false > allow:
                break
            yield res
if __name__ == '__main__':
    # Manual smoke test: crawl the group sorted by newest and print every
    # record returned by releaser_page_by_time for the given time bounds.
    crawler = CrawlerDouban()
    url_list = [
        "https://www.douban.com/people/new_tag",
    ]
    for u in url_list:
        records = crawler.releaser_page_by_time(1595755100232, 1595906959333, u, output_to_es_register=True,
                                                es_index='crawler-data-raw',
                                                doc_type='doc', releaser_page_num_max=4000,allow=20)
        for t in records:
            print(t)