# -*- coding: utf-8 -*-
# @Time : 2019/7/30 10:51
# @Author : litao
"""
Created on Wed Sep 26 22:53:34 2018

@author: zhouyujiang
"""
import sys, os
from elasticsearch.helpers import scan
from target_releaser_add import *
import argparse
import datetime
import json
from elasticsearch import Elasticsearch

import requests
from bs4 import BeautifulSoup

def rebuild_releaserUrl(releaserUrl):
    """Derive a ``id.tudou.com`` videos-page URL from a releaser page.

    Downloads ``releaserUrl``, looks for the first ``<div class="user-name">``
    element and reads the ``href`` of the ``<a>`` tag inside it.

    Args:
        releaserUrl: URL of the page to download.

    Returns:
        ``'https://id.tudou.com' + href + '==/videos'``, or ``None`` when the
        page has no matching ``div``, no ``<a>`` inside it, or no ``href``.

    Raises:
        Whatever ``requests.get`` raises; the download is not guarded.
    """
    # NOTE(review): no timeout is passed, so a stalled server can block this
    # call indefinitely.
    get_page = requests.get(releaserUrl)
    soup = BeautifulSoup(get_page.text, 'html.parser')
    try:
        releaser_id = soup.find('div', {'class': 'user-name'}).a['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: find() returned None (no such div).
        # TypeError: the div has no <a> child, so ``.a`` is None.
        # KeyError: the <a> tag has no href attribute.
        return None
    return 'https://id.tudou.com' + releaser_id + '==/videos'

# Command-line interface: a single optional day given as YYYY-MM-DD.
# The arguments are parsed immediately, at module import time.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-d',
    '--daily_type',
    default=None,
    type=str,
    help='like 2018-09-26',
)

args = parser.parse_args()

# Compute the query windows as epoch milliseconds (local time, naive datetimes):
#   fetch window   : the one-day span in which documents were fetched
#   release window : the one-day span immediately before the fetch window
#   video-count window start : 30 days before the release window start
# Both branches must define every *_ts name, because target_releaser_add()
# reads them as module globals.
if args.daily_type is None:
    # No date given: the fetch window is "yesterday" relative to today.
    now_day_d = datetime.datetime.now().date()
    now_day = datetime.datetime.fromordinal(now_day_d.toordinal())  # today at 00:00
    fetch_start = now_day - datetime.timedelta(1)
    fetch_end = now_day
    fetch_start_ts = int(datetime.datetime.timestamp(fetch_start))*1000
    fetch_end_ts = int(datetime.datetime.timestamp(fetch_end))*1000
    release_start = now_day - datetime.timedelta(2)
    release_end = now_day - datetime.timedelta(1)
    # Despite the "_end_dt" suffix, this is the START of the 30-day window.
    vieo_num_release_time_end_dt = release_start - datetime.timedelta(30)
    release_start_ts = int(datetime.datetime.timestamp(release_start))*1000
    release_end_ts = int(datetime.datetime.timestamp(release_end))*1000
    vieo_num_release_time_start_ts = int(datetime.datetime.timestamp(vieo_num_release_time_end_dt))*1000

else:
    # Explicit date: the fetch window is that day, the release window the day before.
    now_day = datetime.datetime.strptime(args.daily_type, '%Y-%m-%d')
    fetch_start_ts = int(datetime.datetime.timestamp(now_day) * 1000)
    last_day = datetime.datetime.strptime(args.daily_type, '%Y-%m-%d') + datetime.timedelta(1)
    fetch_end_ts = int(datetime.datetime.timestamp(last_day) * 1000)
    zuotian = datetime.datetime.strptime(args.daily_type, '%Y-%m-%d') - datetime.timedelta(1)
    release_start_ts = int(datetime.datetime.timestamp(zuotian) * 1000)
    release_end_ts = fetch_start_ts
    # Previously missing in this branch, which made target_releaser_add()
    # fail with NameError whenever -d was passed. Mirrors the other branch:
    # 30 days before the release window start.
    vieo_num_release_time_end_dt = zuotian - datetime.timedelta(30)
    vieo_num_release_time_start_ts = int(datetime.datetime.timestamp(vieo_num_release_time_end_dt) * 1000)

# Locate the previous calendar month: step one day back from the first day
# of the current month and read off that date's year and month.
today = datetime.datetime.now()
first_day = datetime.datetime(year=today.year, month=today.month, day=1)
day_before_first_day = first_day - datetime.timedelta(days=1)
l_year, l_month = day_before_first_day.year, day_before_first_day.month


# Elasticsearch connection settings. Each value may be overridden with an
# environment variable (ES_HOSTS, ES_PORT, ES_USER, ES_PASSWD); the fallbacks
# are the values that used to be hard-coded here, so existing deployments
# behave exactly as before.
# NOTE(review): the fallback password is still committed in source. Rotate it
# and drop the default once every deployment sets ES_PASSWD.
hosts = os.environ.get('ES_HOSTS', '192.168.17.11')
port = int(os.environ.get('ES_PORT', 80))
user = os.environ.get('ES_USER', 'zhouyujiang')
passwd = os.environ.get('ES_PASSWD', '8tM9JDN2LVxM')
http_auth = (user, passwd)
es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
bulk_all_body = ''
# Source index / doc type queried by target_releaser_add().
index = 'short-video-production'
doc_type = 'daily-url'
# Per-run log file, named after now_day (YYYY_MM_DD); truncated on each run.
log_file = open('/home/hanye/crawlersNew/write_data_into_es/log/add_releaser-from_toutiao_top100_{date}'.format(
        date=str(now_day)[0:10].replace('-', '_')), 'w')
print('satrt:{st}'.format(st=datetime.datetime.now()), file=log_file)


def _frequency_for_video_num(video_num):
    """Return the crawl frequency (1, 3 or 9) for a releaser's video count."""
    if video_num >= 900:
        return 9
    if video_num > 300:
        return 3
    return 1


def target_releaser_add():
    """Collect releasers of the top-played recent videos and write them out.

    For each platform in a fixed list, queries ``index``/``doc_type`` for up
    to 1000 documents (sorted by ``play_count`` descending) whose
    ``fetch_time`` and ``release_time`` fall in the module-level windows and
    whose ``releaserUrl`` exists and is non-empty. For every hit it counts the
    releaser's documents in ``short-video-all-time-url`` released between
    ``vieo_num_release_time_start_ts`` and ``release_end_ts`` and derives a
    ``frequency`` from that count. Records are handed to ``write_to_es`` in
    batches of 100, and a summary is printed to ``log_file``.

    Fixes relative to the previous version:
      * the flush logic assigned ``bulk_all_body`` locally, so the final
        check raised UnboundLocalError unless a batch of 100 had been hit;
      * ``bulk_list`` was never emptied, so every flush re-sent all earlier
        records;
      * the record counter was reset per platform, so the log line reported
        only the last platform;
      * a video count of exactly 900 fell through to frequency 1.

    NOTE(review): ``hits.total`` is compared as a number, which presumably
    matches the Elasticsearch version in use — confirm before upgrading.
    """
    platform_lst = ['toutiao', 'haokan', '腾讯视频', 'new_tudou']
    bulk_list = []
    count = 0  # records queued for writing, across all platforms
    for platform in platform_lst:
        search_body = {
            "query": {
                "bool": {
                    "filter": [
                        {"range": {"fetch_time": {"gte": fetch_start_ts, "lt": fetch_end_ts}}},
                        {"range": {"release_time": {"gte": release_start_ts, "lt": release_end_ts}}},
                        {"term": {"platform.keyword": platform}},
                    ],
                    "must_not": [
                        {"term": {"releaserUrl.keyword": {"value": ""}}},
                    ],
                    "must": [
                        {"exists": {"field": "releaserUrl"}},
                    ],
                }
            },
            "sort": [{"play_count": {"order": "desc"}}],
            "size": 1000,
        }

        search_re = es.search(index=index, doc_type=doc_type, body=search_body, size=1000)
        if search_re['hits']['total'] <= 0:
            continue

        for one in search_re['hits']['hits']:
            line = one['_source']
            releaserUrl = line['releaserUrl']
            if releaserUrl == '' or releaserUrl is None:
                continue

            line_dict = {
                'releaser': line['releaser'],
                'platform': line['platform'],
            }
            print(one["_id"])

            # Count this releaser's videos released inside the 30-day window.
            search_releaser = {
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"releaser.keyword": line_dict['releaser']}},
                            {"term": {"platform.keyword": line_dict['platform']}},
                            {"range": {"release_time": {"gte": vieo_num_release_time_start_ts,
                                                        "lt": release_end_ts}}},
                        ]
                    }
                }
            }
            q2 = es.search(index='short-video-all-time-url', doc_type='all-time-url', body=search_releaser)
            video_num = q2['hits']['total']
            line_dict['frequency'] = _frequency_for_video_num(video_num)
            line_dict['Nov_2018'] = video_num

            now_ms = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            line_dict['post_time'] = now_ms
            line_dict['timestamp'] = now_ms
            line_dict['releaserUrl'] = releaserUrl

            if line['platform'] == 'new_tudou':
                # Tudou URLs are rewritten; skip the record if that fails.
                releaserUrl = rebuild_releaserUrl(line['releaserUrl'])
                if releaserUrl is None:
                    continue
                line_dict['releaserUrl'] = releaserUrl

            bulk_list.append(line_dict)
            count += 1
            if len(bulk_list) >= 100:
                write_to_es(bulk_list)
                # Rebind rather than clear, in case write_to_es keeps a reference.
                bulk_list = []

    # Flush whatever is left over from the last partial batch.
    if bulk_list:
        write_to_es(bulk_list)
        bulk_list = []

    print('end:{st}'.format(st=datetime.datetime.now()), file=log_file)
    print('add_' + str(count) + 'data_in_to_tagert_releaser', file=log_file)


def csv(file_csv, **kwargs):
    """Forward ``file_csv`` to ``write_to_es``, optionally with project extras.

    Keyword Args:
        project: when truthy, it is merged (via ``dict.update``) over the
            defaults ``week_report="True"``, ``key_releaser="True"`` and
            ``frequency=3``; the merged mapping is printed and passed on as
            ``extra_dic``.
        post_by: passed through to ``write_to_es`` unchanged (``None`` when
            absent).

    Returns:
        Whatever ``write_to_es`` returns.
    """
    post_by = kwargs.get("post_by")
    project = kwargs.get("project")

    # Without a project there is nothing extra to attach.
    if not project:
        return write_to_es(file_csv, post_by=post_by)

    extra_dic = dict(week_report="True", key_releaser="True", frequency=3)
    extra_dic.update(project)
    print(extra_dic)
    return write_to_es(file_csv, extra_dic=extra_dic, post_by=post_by)

def bulk(body):
    """Send the same bulk ``body`` to both target-releaser indices.

    The request goes first to ``target_releasers`` and then to
    ``target_releasers_org`` (doc type ``doc``). A response whose ``errors``
    flag is ``True`` is printed in full; nothing is returned.
    """
    for target_index in ("target_releasers", "target_releasers_org"):
        response = es.bulk(body=body, index=target_index, doc_type="doc")
        if response['errors'] is True:
            print(response)

def add(source):
    """Build one releaser record from ``source`` and submit it through ``csv``.

    The record starts from a template (``releaserUrl``, ``releaser``,
    ``post_by`` and ``platform`` set to ``None``; ``timestamp`` and
    ``post_time`` set to the current time in epoch milliseconds) and is then
    overlaid with every key of ``source``.

    Args:
        source: mapping of fields that override or extend the template.

    Returns:
        Whatever ``csv`` returns.
    """
    record = {
        "releaserUrl": None,
        "releaser": None,
        "timestamp": int(datetime.datetime.now().timestamp() * 1e3),
        "post_by": None,
        "platform": None,
        "post_time": int(datetime.datetime.now().timestamp() * 1e3),
    }
    record.update(source)

    project = record.get("project")
    if project:
        # Only pass ``project`` along when the record actually carries one.
        return csv([record], project=project, post_by=record.get("post_by"))
    return csv([record], post_by=record.get("post_by"))

def delete(_id):
    """Issue a bulk ``delete`` action for document ``_id`` through ``bulk``.

    Args:
        _id: id of the document to delete.
    """
    # Serialise with json.dumps so quotes or backslashes in the id cannot
    # break the action line; str() keeps the id a JSON string, as the earlier
    # "%s" interpolation did.
    action = json.dumps({"delete": {"_id": str(_id)}}, ensure_ascii=False)
    # Bulk bodies are newline-delimited, so the action line must end in "\n".
    bulk(action + '\n')


def check_releaser_id():
    """Re-submit target releasers that are invalid or not yet validated.

    Scans ``target_releasers`` twice, restricted to a fixed platform list:
    first for documents with ``is_valid`` equal to ``"false"``, then for
    documents with no ``is_valid`` field at all. For each document,
    ``get_releaser_id`` is called with its platform and URL; when that returns
    a truthy value the document is removed with ``delete`` and re-created from
    its ``_source`` with ``add``.

    The previous version also read ``releaser``, ``post_by`` and
    ``key_releaser`` into unused locals; the mandatory ``["releaser"]`` lookup
    could abort the whole scan with KeyError on a document lacking that field.
    """
    read_index = "target_releasers"
    read_type = "doc"
    platform_list = [
        "haokan",
        "miaopai",
        "toutiao",
        "抖音", "腾讯视频", "腾讯新闻", "kwai", "new_tudou", "网易新闻",
    ]
    platform_filter = {"terms": {"platform.keyword": platform_list}}
    search_body_lis = [
        # Documents explicitly marked invalid.
        {
            "query": {
                "bool": {
                    "filter": [
                        platform_filter,
                        {"term": {"is_valid": "false"}},
                    ]
                }
            }
        },
        # Documents that carry no is_valid field.
        {
            "query": {
                "bool": {
                    "filter": [platform_filter],
                    "must_not": [
                        {"exists": {"field": "is_valid"}},
                    ],
                }
            }
        },
    ]
    for search_body in search_body_lis:
        search_re = scan(index=read_index, doc_type=read_type,
                         query=search_body,
                         request_timeout=500,
                         scroll='5m',
                         client=es)
        for res in search_re:
            source = res["_source"]
            platform = source["platform"]
            releaser_id = get_releaser_id(platform=platform,
                                          releaserUrl=source["releaserUrl"],
                                          is_qq=platform == "腾讯视频")
            if releaser_id:
                delete(res["_id"])
                add(source)

def key_releaser_frequency():
    """Raise ``frequency`` to 3 for key releasers currently below 3.

    Scans ``target_releasers`` for documents with ``key_releaser`` equal to
    ``"True"`` and ``frequency`` less than 3, sets ``frequency`` to 3 in each
    ``_source`` and re-indexes the document under the same id, sending the
    bulk request every 1000 documents and once more for any remainder.

    Returns:
        str: ``"<n>条数据写入成功"`` when no bulk request reported errors,
        otherwise the concatenated text of the failing responses' ``items``.

    Fixes relative to the previous version: error details were added to a
    string with ``+=`` although ``items`` is a list (TypeError on the first
    bulk error), and the payload was cleared before being printed.
    """
    body = {
        "query": {
            "bool": {
                "filter": [
                    {"term": {"key_releaser.keyword": "True"}},
                    {"range": {"frequency": {"lt": 3}}},
                ]
            }
        }
    }

    read_index = "target_releasers"
    read_type = "doc"
    search_re = scan(index=read_index, doc_type=read_type,
                     query=body,
                     request_timeout=100,
                     scroll='5m',
                     client=es)

    count_true = 0
    pending = []      # bulk lines not yet sent
    error_parts = []  # str() of each failing response's items

    def flush():
        """Send the pending lines as one bulk request and record any errors."""
        payload = ''.join(pending)
        del pending[:]
        eror_dic = es.bulk(index=read_index, doc_type=read_type,
                           body=payload, request_timeout=200)
        if eror_dic['errors'] is True:
            print(eror_dic['items'])
            print(payload)
            error_parts.append(str(eror_dic['items']))

    for res in search_re:
        _id = res["_id"]
        res["_source"]["frequency"] = 3
        count_true += 1
        print(_id)

        # json.dumps keeps the action line valid even if the id needs escaping.
        bulk_head = json.dumps({"index": {"_id": _id}}, ensure_ascii=False)
        bulk_body = json.dumps(res["_source"], ensure_ascii=False)
        pending.append(bulk_head + '\n' + bulk_body + '\n')
        if count_true % 1000 == 0:
            flush()
            print(count_true)

    if pending:
        flush()

    if error_parts:
        return ''.join(error_parts)
    return "%s条数据写入成功" % count_true


if __name__ == "__main__":
    # Run the three maintenance passes in their fixed order.
    for step in (target_releaser_add, check_releaser_id, key_releaser_frequency):
        step()