# -*- coding: utf-8 -*-
"""
Created on Mon Jul  9 15:27:42 2018

@author: zhouyujiang
"""

# Fix incorrect values such as the monthly net-increase play count.

import json
import datetime
import sys
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan


# Pairs of (monthly-net-increase field, cumulative counter it is derived from).
_MNI_FIELD_PAIRS = (
    ('monthly_net_inc_play_count', 'play_count'),
    ('monthly_net_inc_comment_count', 'comment_count'),
    ('monthly_net_inc_favorite_count', 'favorite_count'),
)


def _merge_history_into_monthly(monthly_source, daily_source):
    """Rewrite ``monthly_source`` in place from a historical ``daily_source``."""
    shared_keys = set(daily_source) & set(monthly_source)
    # The MNI values must be recomputed BEFORE the cumulative counters are
    # overwritten below, because the formula needs the monthly document's
    # original counter: (counter - MNI) is the baseline at the start of the
    # period, and the new MNI is the historical counter minus that baseline.
    for mni_key, counter_key in _MNI_FIELD_PAIRS:
        # Only fields present in the monthly doc but absent from the daily doc
        # are recomputed (same condition as the original set difference).
        if mni_key in monthly_source and mni_key not in daily_source:
            baseline = monthly_source[counter_key] - monthly_source[mni_key]
            monthly_source[mni_key] = daily_source[counter_key] - baseline
    # Every field both documents have is taken from the historical document.
    for key in shared_keys:
        monthly_source[key] = daily_source[key]
    monthly_source.update({
        "data_correction_note": ("find negative MNI value, "
                                 "correcting by historical data."),
        "data_provider": 'CCR'
    })


def _flush_bulk(es, index, doc_type, bulk_lines, log_file):
    """Send the buffered bulk lines (if any) and empty the buffer."""
    if not bulk_lines:
        return
    bulk_result = es.bulk(body=''.join(bulk_lines),
                          index=index, doc_type=doc_type,
                          request_timeout=200)
    del bulk_lines[:]
    if bulk_result['errors'] is True:
        print(bulk_result, file=log_file)


def data_correction(data_year, data_month,
                    doc_type=None, corr_lower_bdr_ts=None,
                    log_file=sys.stdout):
    """Correct monthly documents whose net-increase play count is negative.

    Documents in index ``short-video-production`` / ``doc_type`` that match
    both ``play_count <= 0`` and ``monthly_net_inc_play_count < 0`` are
    looked up by URL in doc type ``daily-url``. The newest daily record with
    ``play_count > 0`` and ``corr_lower_bdr_ts <= fetch_time < `` the monthly
    document's ``fetch_time`` is used to rebuild the monthly document. When
    no such record exists, the three monthly-net-increase fields are set to
    zero. Corrected documents are re-indexed under their original ``_id``
    through the bulk API.

    Args:
        data_year: Year of the month to correct (int). Only used when
            ``doc_type`` is None.
        data_month: Month (1-12) to correct (int). Only used when
            ``doc_type`` is None.
        doc_type: Elasticsearch doc type to correct. When None it defaults to
            ``'daily-url-<last day of the month>'`` and ``corr_lower_bdr_ts``
            is replaced by the first instant of that month.
        corr_lower_bdr_ts: Lower bound (inclusive, epoch milliseconds) on
            ``fetch_time`` for usable historical records. Required whenever
            ``doc_type`` is given.
        log_file: Writable text stream that receives progress messages.

    Returns:
        None. If ``doc_type`` is given without ``corr_lower_bdr_ts`` a
        message is written to ``log_file`` and nothing else is done.
    """
    program_start_time = datetime.datetime.now()

    if doc_type is None:
        # First day of the following month minus one day gives the last day
        # of the requested month, including the December -> January rollover.
        if data_month == 12:
            next_year, next_month = data_year + 1, 1
        else:
            next_year, next_month = data_year, data_month + 1
        last_day_of_the_month = (
            datetime.datetime(year=next_year, month=next_month, day=1)
            - datetime.timedelta(days=1)).isoformat()[:10]
        # Naive datetime: .timestamp() interprets it in the local time zone.
        first_day_of_the_month = datetime.datetime(data_year, data_month, 1)
        corr_lower_bdr_ts = int(first_day_of_the_month.timestamp() * 1000)
        doc_type = 'daily-url-' + last_day_of_the_month
    elif corr_lower_bdr_ts is None:
        print('argument corr_lower_bdr_ts should be given together with '
              'doc_type, function returns', file=log_file)
        return None

    index = 'short-video-production'
    not_find_count = 0
    count = 0
    bulk_lines = []

    es = Elasticsearch(hosts='192.168.17.11', port=9200)
    # Find documents with a negative monthly increment and play count <= 0
    # (both clauses sit in one bool/filter list, so both must match).
    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"play_count": {"lte": 0}}},
                    {"range": {"monthly_net_inc_play_count": {"lt": 0}}}
                ]
            }
        }
    }
    search_total = es.search(index=index, doc_type=doc_type,
                             body=search_body,
                             request_timeout=100)
    total = search_total['hits']['total']
    if not total > 0:
        print('{doc_type}中没有需要修改的数据'.format(doc_type=doc_type), file=log_file)
        return None

    print("共有{total}条数据需要修改".format(total=total), file=log_file)
    scan_for_unusual_data = scan(client=es, query=search_body,
                                 index=index, doc_type=doc_type,
                                 scroll='5m',
                                 request_timeout=100)
    for one_dict in scan_for_unusual_data:
        count += 1
        doc_id = one_dict['_id']
        monthly_source = one_dict['_source']
        url = monthly_source['url']
        fetch_time = monthly_source['fetch_time']
        print("正在处理第{count}条数据\nURL:{url}".format(count=count, url=url),
              datetime.datetime.now(), file=log_file)
        # Newest earlier record of the same URL that has a positive play count.
        search_url = {
            "query": {
                "bool": {
                    "filter": [
                        {"range": {"fetch_time": {"lt": fetch_time,
                                                  "gte": corr_lower_bdr_ts}}},
                        {"term": {"url.keyword": url}},
                        {"range": {"play_count": {"gt": 0}}}
                    ]
                }
            },
            "sort": [
                {
                    "fetch_time": {
                        "order": "desc"
                    }
                }
            ]
        }
        search_re_url = es.search(index=index, doc_type='daily-url',
                                  body=search_url, size=1, request_timeout=100)
        if search_re_url['hits']['total'] == 0:
            not_find_count += 1
            print("没有找到！\t{url}".format(url=url), file=log_file)
            monthly_source.update({
                "monthly_net_inc_play_count": 0,
                "monthly_net_inc_comment_count": 0,
                "monthly_net_inc_favorite_count": 0,
                "data_provider": 'CCR',
                "timestamp": int(datetime.datetime.now().timestamp() * 1e3),
                "data_correction_note": ('find negative MNI value, '
                                         'failed to find effective historical data, '
                                         'wipe MNI values to zeros.')
            })
        else:
            _merge_history_into_monthly(
                monthly_source, search_re_url['hits']['hits'][0]['_source'])

        # json.dumps escapes quotes/backslashes in the id, which plain string
        # interpolation would not, keeping the NDJSON action line valid.
        action_str = json.dumps({"index": {"_id": doc_id}})
        data_str = json.dumps(monthly_source, ensure_ascii=False)
        bulk_lines.append(action_str + '\n' + data_str + '\n')
        print("修改完成", file=log_file)
        # Flush every 1000 documents and after the last expected document.
        # The comparison is against the number of documents to correct, not
        # the hit count of the per-URL history lookup.
        if count % 1000 == 0 or count == total:
            print("已经完成了{count}条".format(count=count), file=log_file)
            _flush_bulk(es, index, doc_type, bulk_lines, log_file)

    # Safety net: the scan may yield a different number of documents than the
    # initial count reported, leaving a partial batch behind.
    _flush_bulk(es, index, doc_type, bulk_lines, log_file)
    used_time = datetime.datetime.now() - program_start_time
    print('共修改了{count}条，其中{not_find_count}条未找到，共耗时{used_time}'
          .format(count=count, not_find_count=not_find_count,
                  used_time=used_time), file=log_file)
    return None
