# -*- coding:utf-8 -*-
# @Time : 2019/12/11 18:58
# @Author : litao
import copy
import datetime
import json
import re
import time
from concurrent.futures import ProcessPoolExecutor  # used by the multi-process variant in __main__

import redis
import requests
from selenium import webdriver
from selenium.webdriver import ActionChains

# local redis instances, kept for reference
# rds_list = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
# rds_single = redis.StrictRedis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
# rds_get = redis.StrictRedis(host='127.0.0.1', port=6379, db=15, decode_responses=True)
# rds_copy = redis.StrictRedis(host='127.0.0.1', port=6379, db=1, decode_responses=True)
rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)    # album records
rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True)  # per-episode records
rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True)    # url queue


def revise_data():
    # copy every hash from rds_list into rds_get; the commented lines strip
    # line breaks from title/describe when that cleanup is needed
    scan_re = rds_list.scan_iter()
    for one_scan in scan_re:
        # print(one_scan)
        data = rds_list.hgetall(one_scan)
        # data["title"] = data["title"].replace("\r", "").replace("\n", "")
        # data["describe"] = data["describe"].replace("\r", "").replace("\n", "")
        rds_get.hmset(one_scan, data)
        # rds_list.hmset(one_scan, data)


class CrawlerMain(object):
    def __init__(self):
        self.chrome_options = webdriver.ChromeOptions()
        # self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        # self.chrome_options.add_argument("--start-maximized")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
        # 2 = block image loading, which speeds the pages up considerably
        prefs = {"profile.managed_default_content_settings.images": 2}
        self.chrome_options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=self.chrome_options)
        # template for one album/cover record
        self.one_video_dic = {
            "platform": "tencnt_video",
            "ID": "",
            "title": "",
            "url": "",
            "describe": "",
            "video_count": "",
            "sum_duration": "",
            "year": "",
            "provider": "",
            "style_tags": "",
            "project_tags": "",
            "language": "",
            "area": "",
            "if_pay": "",
            "play_count_sum": "",
            "play_heat": "",
            "rate": "",
            "favorite_count_sum": "",
            "comment_count_sum": "",
            "barrage_count": "",
        }
        # template for one single-video (episode) record
        self.single_video_dic = {
            "platform": "tencnt_video",
            "vid": "",
            "title": "",
            "video_title": "",
            "url": "",
            "video_url": "",
            "describe": "",
            "video_count": "",
            "duration": "",
            "year": "",
            "provider": "",
            "language": "",
            "area": "",
            "if_pay": "",
            "play_count": "",
            "play_heat": "",
            "rate": "",
            "favorite_count": "",
            "comment_count": "",
            "barrage_count": "",
        }

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver.close()
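    # list_page walks one channel listing page: it scrolls in growing steps so
    # the lazily loaded cards render, reads title / url / video_count / pay
    # flag / describe off every card, and hands each record to parse_data.
    # Note the scroll offset doubles on every step (roll += roll).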
    def list_page(self, releaserUrl=None, video_list_xpath=None, play_count_xpath=None,
                  if_free=None, title=None, describe=None, project_tag_xpath=None,
                  provider=None, year=None, next_page_xpath=None, roll=None,
                  project_tag=None, style_tags=None):
        self.driver.get(releaserUrl)
        # self.driver.implicitly_wait(5)
        time.sleep(0.2)
        max_video_num = self.driver.find_element_by_xpath("//span[@class='hl']").text
        print(int(max_video_num))
        # the listing renders roughly 28 cards per scroll step
        max_page = int(int(max_video_num) / 28)
        try:
            for p in range(max_page):
                roll += roll
                js = "var q=document.documentElement.scrollTop=%s" % roll
                self.driver.execute_script(js)
                time.sleep(0.3)
                video_list_obj = self.driver.find_elements_by_xpath(
                    "//body[@class='page_channel page_channel_doco']/div[@class='mod_row_box']"
                    "/div[@class='mod_bd _mod_listpage']"
                    "/div[@class='mod_figure mod_figure_v_default mod_figure_list_box']/div")
                for count, one_video in enumerate(video_list_obj):
                    print(count)
                    if_pay = ""
                    video_count = ""
                    describe = ""
                    title = one_video.find_element_by_xpath("./div[1]//a[1]").text
                    project_name = "tencent_%s" % title
                    url = one_video.find_element_by_xpath("./a[1]").get_attribute('href')
                    video_count_list = one_video.find_elements_by_xpath("./a[1]/div[1]")
                    describe_list = one_video.find_elements_by_xpath("./div[1]/div[1]")
                    if describe_list:
                        describe = describe_list[0].text
                    if video_count_list:
                        video_count = video_count_list[0].text
                    if_pay_list = one_video.find_elements_by_xpath("./a[1]/img[2]")
                    if if_pay_list:
                        if_pay = if_pay_list[0].get_attribute("alt")
                    data_dic = {
                        "url": url,
                        "title": title,
                        "video_count": video_count,
                        "if_pay": if_pay,
                        "describe": describe,
                    }
                    # action = ActionChains(self.driver)
                    # action.click(one_video).perform()
                    if style_tags:
                        data_dic["style_tags"] = style_tags
                    if project_tag:
                        data_dic["project_tags"] = project_tag
                    if year:
                        data_dic["year"] = year
                    if provider:
                        data_dic["provider"] = provider
                    self.parse_data(data_dic, project_name)
        except Exception as e:
            print(e)
            self.driver.close()

    def style_tags(self, releaserUrl=None, video_list_xpath=None, play_count_xpath=None,
                   if_free=None, title=None, describe=None, style_tags_xpath=None,
                   style=None, project_tag=None, provider=None, year=None,
                   next_page_xpath=None, roll=None):
        # click through every style tag on the channel page and crawl the
        # filtered listing behind it
        self.driver.get(releaserUrl)
        time.sleep(0.2)
        style_tags_obj = self.driver.find_elements_by_xpath(style_tags_xpath)
        for style_tag in style_tags_obj:
            style_tags = style_tag.text
            print(style_tags)
            if style_tags == "全部":  # skip the "all" tag
                continue
            action = ActionChains(self.driver)
            action.click(style_tag).perform()
            provider = style_tags
            # if True:
            self.list_page(releaserUrl=self.driver.current_url,
                           video_list_xpath=video_list_xpath,
                           play_count_xpath=play_count_xpath,
                           if_free=if_free,
                           title=title,
                           describe=describe,
                           project_tag_xpath=style_tags_xpath,
                           project_tag=project_tag,
                           style_tags=style,
                           provider=provider,
                           year=year,
                           next_page_xpath=next_page_xpath,
                           roll=roll)

    def parse_data(self, data_dic, project_name):
        # merge data_dic into the album hash in rds_list; multi-valued fields
        # (style_tags / project_tags / provider / if_pay) are appended as
        # comma-separated lists instead of being overwritten
        res = rds_list.hgetall(project_name)
        if res:
            if not res.get("project_tags"):
                res["project_tags"] = ""
            if data_dic.get("style_tags"):
                if data_dic.get("style_tags") not in res["style_tags"]:
                    data_dic["style_tags"] = res["style_tags"] + "," + data_dic.get("style_tags")
            if data_dic.get("project_tags"):
                if data_dic.get("project_tags") not in res["project_tags"]:
                    data_dic["project_tags"] = res["project_tags"] + "," + data_dic.get("project_tags")
            if data_dic.get("provider"):
                if data_dic.get("provider") not in res["provider"]:
                    data_dic["provider"] = res["provider"] + "," + data_dic.get("provider")
            if data_dic.get("if_pay"):
                if data_dic.get("if_pay") not in res["if_pay"]:
                    data_dic["if_pay"] = res["if_pay"] + "," + data_dic.get("if_pay")
            for k in data_dic:
                if not data_dic[k]:
                    data_dic[k] = ""
            res.update(data_dic)
            rds_list.hmset(project_name, res)
        else:
            data = copy.deepcopy(self.one_video_dic)
            data.update(data_dic)
            rds_list.hmset(project_name, data)
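    # parse_single_data mirrors parse_data but targets the per-episode store
    # rds_single and seeds new records from the single_video_dic template;
    # empty incoming fields are normalized to a single space.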
    def parse_single_data(self, data_dic, project_name):
        res = rds_single.hgetall(project_name)
        if res:
            if not res.get("project_tags"):
                res["project_tags"] = ""
            if data_dic.get("style_tags"):
                if data_dic.get("style_tags") not in res["style_tags"]:
                    data_dic["style_tags"] = res["style_tags"] + "," + data_dic.get("style_tags")
            if data_dic.get("project_tags"):
                if data_dic.get("project_tags") not in res["project_tags"]:
                    data_dic["project_tags"] = res["project_tags"] + "," + data_dic.get("project_tags")
            if data_dic.get("provider"):
                if data_dic.get("provider") not in res["provider"]:
                    data_dic["provider"] = res["provider"] + "," + data_dic.get("provider")
            for k in data_dic:
                if not data_dic[k]:
                    data_dic[k] = ""
            res.update(data_dic)
            rds_single.hmset(project_name, res)
        else:
            data = copy.deepcopy(self.single_video_dic)
            for k in data_dic:
                if not data_dic[k]:
                    data_dic[k] = " "
            data.update(data_dic)
            rds_single.hmset(project_name, data)

    def video_page(self, task=0):
        # self.driver = webdriver.Chrome(options=self.chrome_options)
        # self.driver.maximize_window()
        # drain the url queue in rds_get: pick a random key, fetch and parse
        # its detail page, then delete the key
        while rds_get.dbsize():
            keys = rds_get.randomkey()
            if not keys:
                break
            res = rds_get.hgetall(keys)
            time.sleep(0.2)
            url_id_list = re.findall("cover/(.*).html", res["url"])
            if url_id_list:
                url_id = url_id_list[0]
            else:
                print(res)
                url_id = ""
            # for handle in self.driver.window_handles:
            #     self.driver.switch_to.window(handle)
            headers = {
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh,zh-CN;q=0.9",
                # "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
                # "referer": "https://v.qq.com/x/cover/%s.html" % url_id,
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "same-origin",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                "sec-fetch-user": "?1",
                "cache-control": "max-age=0",
                "if-modified-since": "Thu, 19 Dec 2019 02:30:00 GMT"
            }
            try:
                page_data = requests.get(res["url"], headers=headers, timeout=5)
            except Exception:
                continue
            # print(page_data.status_code)
            if page_data.status_code != 200:
                rds_get.delete(keys)
                continue
            res_text = page_data.content.decode("utf-8")
            print("task %s" % task, res["url"])
            # the page embeds two JS globals: COVER_INFO describes the album,
            # VIDEO_INFO the episode currently playing
            try:
                cover_info = re.findall("var COVER_INFO = (.*)\n", res_text)[0]
                cover_info_dic = json.loads(cover_info)
            except Exception:
                cover_info_dic = {}
            try:
                VIDEO_INFO = re.findall("var VIDEO_INFO = (.*)\n", res_text)[0]
                video_info_dic = json.loads(VIDEO_INFO)
            except Exception as e:
                print("video_dic_error", e)
                rds_get.delete(keys)
                continue
            # if r"\x" in str(cover_info_dic):
            #     # print(video_info_dic.encoding("utf-8"))
            #     continue
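            # a COVER_INFO with a video_ids list marks an album page: store one
            # album record plus one stub per episode; otherwise fall back to
            # VIDEO_INFO and treat the url as a standalone video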
            if cover_info_dic.get("video_ids"):
                video_id_list = cover_info_dic.get("video_ids")
                # if_pay_list = cover_info_dic.get("vip_ids")
                langue = cover_info_dic.get("langue")
                if not langue:
                    langue = ""
                year = cover_info_dic.get("year")
                if not year:
                    year = (cover_info_dic.get("yearAndArea") or {}).get("year")
                if not year:
                    year = cover_info_dic.get("publish_date")
                area = cover_info_dic.get("area_name")
                if not area:
                    area = ""
                style_tags = cover_info_dic.get("main_genre")
                if not style_tags:
                    style_tags = ""
                description = cover_info_dic.get("description")
                if not description:
                    description = ""
                play_count_sum = cover_info_dic.get("view_all_count")
                if not play_count_sum:
                    play_count_sum = ""
                try:
                    rate = cover_info_dic.get("score").get("score")
                except Exception:
                    rate = ""
                if not rate:
                    rate = ""
                title = cover_info_dic.get("title")
                dic = {
                    "describe": description,
                    "rate": rate,
                    "play_count_sum": play_count_sum,
                    "language": langue,
                    "ID": url_id,
                    "year": year,
                    "area": area,
                    "style_tags": style_tags,
                    "title": title,
                    "video_count": len(video_id_list),
                }
                # print(dic)
                project_name = keys
                self.parse_data(dic, project_name)
                one_video_dic = {}
                for count, video in enumerate(video_id_list):
                    pay_dic = {}
                    pay_dic["vid"] = video
                    pay_dic["video_count"] = count + 1
                    pay_dic["album"] = title
                    pay_dic["url"] = res["url"]
                    one_video_dic[video] = pay_dic
                self.one_video_page(title, one_video_dic, type="list")
            else:
                video_id_list = [video_info_dic.get("vid")]
                duration = video_info_dic.get("duration")
                if not duration:
                    duration = ""
                year = video_info_dic.get("publish_date")
                if not year:
                    year = ""
                play_count_sum = video_info_dic.get("view_all_count")
                if not play_count_sum:
                    play_count_sum = ""
                project_tags = video_info_dic.get("pioneer_tag")
                if not project_tags:
                    project_tags = ""
                title = video_info_dic.get("title")
                try:
                    # the comment count only appears in the rendered html
                    comment_count = re.findall(r"title='(\d+)热评'", res_text)[0]
                except Exception:
                    comment_count = 0
                dic = {
                    "play_count_sum": play_count_sum,
                    "duration": duration,
                    "project_tags": project_tags,
                    "title": title,
                    "year": year,
                    "video_count": 1,
                    "comment_count": comment_count,
                }
                project_name = keys
                self.parse_data(dic, project_name)
                dic["album"] = dic["title"]
                dic["video_url"] = res["url"]
                dic["vid"] = video_id_list[0]
                dic["play_count"] = play_count_sum
                one_video_dic = {title: dic}
                # print(one_video_dic)
                self.one_video_page(title, one_video_dic, type="single")
            rds_get.delete(keys)
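    # one_video_page fetches per-episode detail pages. In "list" mode the
    # episode url is built by inserting the vid into the album url; in
    # "single" mode the prepared record is stored straight into rds_single.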
    def one_video_page(self, title, one_video_dic, type="list"):
        for one_video in one_video_dic:
            if type == "list":
                url = one_video_dic[one_video]["url"]
                headers = {
                    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                    "accept-encoding": "gzip, deflate, br",
                    "accept-language": "zh,zh-CN;q=0.9",
                    # "cookie": "pgv_pvi=203414528; RK=SCQYJhGMVf; ptcz=5f0818b08a7345580a07bce669e0f0468b64107f4ecfb2c9bebf109cb23cf4fb; pgv_pvid=2754880744; ts_uid=176985184; tvfe_boss_uuid=54e907210062ff55; video_guid=0df27917cdb73abd; video_platform=2; XWINDEXGREY=0; mobileUV=1_16ac3c085a7_484c1; tvfe_search_uid=acc18029-4786-42c4-8f6a-f308777454bc; Qs_lvt_311470=1562066061; Qs_pv_311470=992309958717814400; _ga=GA1.2.1184421010.1562066062; login_remember=qq; ptui_loginuin=593516104; o_cookie=593516104; bucket_id=9231005; pac_uid=1_593516104; pgv_si=s4148599808; ptisp=; pgv_info=ssid=s6122922102; ts_refer=m.v.qq.com/x/page/o/d/k/i3023rbj8yk.html; qv_als=Kg5QkSIirE60EDyzA11576570304alZX8g==; ptag=m_v_qq_com|videolist:title; ts_last=v.qq.com/detail/m/mzc00200iwawqac.html; ad_play_index=132",
                    "referer": url,
                    "sec-fetch-mode": "navigate",
                    "sec-fetch-site": "same-origin",
                    "upgrade-insecure-requests": "1",
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                }
                # print(url)
                # insert the vid as an extra path segment before ".html":
                # https://v.qq.com/x/cover/<cover>.html -> .../cover/<cover>/<vid>.html
                url_lis = url.split(".")
                url_lis[2] = url_lis[2] + "/" + one_video
                single_data_url = ".".join(url_lis)
                # print(single_data_url)
                page_data = requests.get(single_data_url, headers=headers, timeout=5)
                if page_data.status_code != 200:
                    return None
                VIDEO_INFO = re.findall("var VIDEO_INFO = (.*)\n", page_data.content.decode("utf-8"))[0]
                video_info_dic = json.loads(VIDEO_INFO)
                video_id = video_info_dic.get("vid")
                duration = video_info_dic.get("duration")
                year = video_info_dic.get("video_checkup_time")
                play_count_sum = video_info_dic.get("view_all_count")
                project_tags = video_info_dic.get("pioneer_tag")
                if not project_tags:
                    project_tags = ""
                video_title = video_info_dic.get("title")
                try:
                    comment_count = re.findall(r"title='(\d+)热评'", page_data.text)[0]
                except Exception:
                    comment_count = 0
                project_name = "tencnt_video_%s" % one_video
                if_pay = video_info_dic.get("pay_status")
                if not if_pay:
                    if_pay = ""
                dic = {
                    "album": title,
                    "video_title": video_title,
                    "if_pay": if_pay,
                    "comment_count": comment_count,
                    "url": url,
                    "video_url": single_data_url,
                    "vid": video_id,
                    "duration": duration,
                    "video_count": 1,
                    "play_count": play_count_sum,
                    "year": year,
                    "project_tags": project_tags,
                }
                # print(dic)
                self.parse_single_data(dic, project_name)
            else:
                project_name = "tencnt_video_%s" % one_video
                dic = one_video_dic[one_video]
                self.parse_single_data(dic, project_name)

    def detail_page(self):
        pass


if __name__ == "__main__":
    crawler_tencent = CrawlerMain()
    # documentary channel listing urls, one per provider filter (itrailer id)
    provider_dic = {
        "全部": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=-1&listpage=1&pay=-1&sort=19",  # all
        "BBC": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=1&listpage=1&pay=-1&sort=19",
        "国家地理": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=4&listpage=1&pay=-1&sort=19",  # National Geographic
        "HBO": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=3175&listpage=1&pay=-1&sort=19",
        "NHK": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=2&listpage=1&pay=-1&sort=19",
        "历史频道": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=7&listpage=1&pay=-1&sort=19",  # History Channel
        "ITV": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=3530&listpage=1&pay=-1&sort=19",
        "探索频道": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=3174&listpage=1&pay=-1&sort=19",  # Discovery Channel
        "ZDF": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=3176&listpage=1&pay=-1&sort=19",
        "ARTE": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=3172&listpage=1&pay=-1&sort=19",
        "腾讯自制": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=15&listpage=1&pay=-1&sort=19",  # Tencent originals
        "合作机构": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=6&listpage=1&pay=-1&sort=19",  # partner institutions
        "其他": "https://v.qq.com/channel/doco?_all=1&channel=doco&itrailer=5&listpage=1&pay=-1&sort=19",  # others
    }
    for provider in provider_dic:
        crawler_tencent.list_page(releaserUrl=provider_dic[provider],
                                  video_list_xpath="/html[1]/body[1]/div[6]/div[1]/div[2]/div",
                                  roll=1000,
                                  provider=provider)
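    # optional multi-process drain of the url queue; each worker builds its
    # own CrawlerMain, since a webdriver session can't be shared across
    # processes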
    # executor = ProcessPoolExecutor(max_workers=8)
    # futures = []
    # for one_scan in range(8):
    #     crawler_tencent = CrawlerMain()
    #     future = executor.submit(crawler_tencent.video_page, task=one_scan)
    #     futures.append(future)
    # executor.shutdown(True)

    # crawler_tencent.video_page()
    # revise_data()