# -*- coding: utf-8 -*-
"""
Created on Tue Oct 10 10:56:17 2017

Search distinctly from daily-url in date range of 2017-09-01 to 2017-09-30,
and write into a new type named 'all-time-url-2017-09-30'

Calculate date_str by system date.
This task is supposed to run every month on the first of current month.
It will calculate / write / build the daily-url for last whole month.

Modified the _id of monthly daily-url to be the url ONLY, on Nov 07 2011 [sic; presumably 2017].
The previous mechanism could not ensure that the latest data was always the one saved.

Nov 07 2017
Comment the delete historical data part.
Because, using _id mechanics the elasticsearch will take care of unique-selection things

Nov 08 2017
Push data by day, rather than push a whole month together

Nov 15 2017
Bug fix: It used to iterate from the first day of the month EVERY DAY, which wasted time.
Changed to iterate one day at a time.

Add multi threads when writing data within one fetch_time day

@author: hanye
"""

import time
import sys
import json
import datetime
import elasticsearch
from elasticsearch.helpers import scan
from func_cal_doc_id import cal_doc_id
import _thread

#from func_filter_duplicated_data_on_renming import filter_duplicated_data_on_renming


def _split_release_time_ranges(buckets, total_hits, threads_num):
    """Group consecutive daily histogram buckets into contiguous time ranges.

    Buckets are walked in order and accumulated until the running document
    count exceeds 90% of ``total_hits // threads_num``; the last bucket always
    closes a range. More ranges than ``threads_num`` may therefore be returned.

    Args:
        buckets: ``date_histogram`` buckets, each with ``key`` (epoch millis of
            the bucket start) and ``doc_count``.
        total_hits: total number of documents covered by ``buckets``.
        threads_num: desired number of ranges; values below 1 are treated as 1
            so the per-range target never divides by zero.

    Returns:
        A list of dicts with keys ``start`` (inclusive, epoch millis), ``end``
        (exclusive, epoch millis), ``end_idx`` (index of the last bucket in the
        range) and ``data_num`` (documents in the range). Empty when
        ``buckets`` is empty.
    """
    if not buckets:
        return []

    target_per_range = total_hits // max(1, threads_num)
    last_idx = len(buckets) - 1
    ranges = []
    collected = 0
    range_start = buckets[0]['key']
    for idx, bucket in enumerate(buckets):
        collected += bucket['doc_count']
        if collected > target_per_range * 0.9 or idx == last_idx:
            if idx == last_idx:
                # No following bucket: close the range one day after the last
                # bucket's start.
                range_end = int(bucket['key'] + 24*3600*1e3)
            else:
                range_end = buckets[idx + 1]['key']
            ranges.append({'start': range_start,
                           'end': range_end,
                           'end_idx': idx,
                           'data_num': collected})
            # The next range starts exactly where this one ends.
            range_start = range_end
            collected = 0
    return ranges


def _bulk_write_with_retry(es, index, doc_type, body, report_failure,
                           max_retries=30, retry_wait_seconds=60):
    """Send one bulk request, retrying on any error.

    Args:
        es: Elasticsearch client.
        index: target index name.
        doc_type: target doc type.
        body: newline-delimited bulk request body.
        report_failure: callable ``(attempt, exc)`` invoked after each failed
            attempt, before sleeping.
        max_retries: maximum number of attempts.
        retry_wait_seconds: pause after each failed attempt.

    Returns:
        The ``datetime.timedelta`` spent in the successful attempt, or ``None``
        when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        started = datetime.datetime.now()
        try:
            es.bulk(index=index,
                    doc_type=doc_type,
                    body=body,
                    request_timeout=300)
        except Exception as exc:
            report_failure(attempt, exc)
            time.sleep(retry_wait_seconds)
        else:
            return datetime.datetime.now() - started
    return None


def _copy_release_time_range(es, thread_id, query_body, index_pre, doc_type_pre,
                             index_new, doc_type_new, log):
    """Copy every document matching ``query_body`` into the target index.

    Documents are scanned from ``index_pre/doc_type_pre`` and bulk-indexed into
    ``index_new/doc_type_new`` in batches of 1000, using the id returned by
    ``cal_doc_id`` as ``_id`` and refreshing each document's ``timestamp``.

    Args:
        es: Elasticsearch client.
        thread_id: worker number, used only to tag log lines.
        query_body: search body selecting the documents to copy.
        index_pre, doc_type_pre: source index / doc type.
        index_new, doc_type_new: target index / doc type.
        log: callable taking print-style positional arguments.

    Raises:
        TypeError: if ``cal_doc_id`` returns something other than ``str``.
    """
    count_resp = es.search(index=index_pre,
                           doc_type=doc_type_pre,
                           body=query_body,
                           size=0,
                           request_timeout=300)
    hits_total = count_resp['hits']['total']
    log('[', thread_id, ']', 'search_body_in_threads', query_body)
    log('[', thread_id, ']', 'hits_total_in_threads', hits_total)

    if not hits_total > 0:
        log('[', thread_id, ']', 'Got zero hits for query body\n', query_body,
            '\nin %s/%s' % (index_pre, doc_type_pre),
            datetime.datetime.now())
        return

    scan_resp = scan(client=es,
                     index=index_pre,
                     doc_type=doc_type_pre,
                     query=query_body,
                     scroll='5m',
                     size=1000,
                     request_timeout=300)

    line_counter = 0
    pending_lines = []  # alternating action / source lines not yet written

    def failure_reporter(template):
        # line_counter is read when the reporter is called, so the message
        # always shows the current position in the scan.
        def report(attempt, exc):
            log('[', thread_id, ']',
                template % (line_counter, attempt),
                repr(exc), datetime.datetime.now())
        return report

    for hit in scan_resp:
        line_counter += 1
        source = hit['_source']
        doc_id = cal_doc_id(platform=source['platform'], url=source['url'],
                            doc_id_type='all-time-url', data_dict=source)
        if not isinstance(doc_id, str):
            # The id used to be concatenated into the action line, which
            # raised TypeError for non-strings; keep failing loudly rather
            # than sending a non-string _id.
            raise TypeError('cal_doc_id returned a non-str id: %r' % (doc_id,))
        # update timestamp
        source['timestamp'] = int(datetime.datetime.now().timestamp()*1e3)

        # Serialise the action with json so ids containing quotes or
        # backslashes still produce a valid bulk line.
        pending_lines.append(json.dumps({"index": {"_id": doc_id}},
                                        ensure_ascii=False))
        pending_lines.append(json.dumps(source, ensure_ascii=False))

        if line_counter % 1000 == 0:
            elapsed = _bulk_write_with_retry(
                es, index_new, doc_type_new,
                '\n'.join(pending_lines) + '\n',
                failure_reporter('bulk write failed when bulk write line %d, retry %d'))
            if elapsed is None:
                # Keep the batch: it is sent again together with the next
                # 1000 documents (or by the final flush).
                log('[', thread_id, ']',
                    'bulk write gave up at line %d, batch kept for next flush'
                    % line_counter,
                    datetime.datetime.now())
            else:
                log('[', thread_id, ']', 'bulk write', line_counter,
                    '/', hits_total,
                    'time spent', elapsed)
                pending_lines = []

    if pending_lines:
        elapsed = _bulk_write_with_retry(
            es, index_new, doc_type_new,
            '\n'.join(pending_lines) + '\n',
            failure_reporter('bulk write failed when bulk write %d, retry %d'))
        if elapsed is None:
            log('[', thread_id, ']',
                'final bulk write gave up, %d documents NOT written'
                % (len(pending_lines) // 2),
                datetime.datetime.now())
        else:
            log('[', thread_id, ']', 'bulk write done',
                line_counter, '/', hits_total)


def _dedupe_renming_documents(es, index_new, doc_type_new, log):
    """Remove or tag duplicated 人民日报 documents in the target index.

    Documents sharing the same ``title + releaser`` are compared against the
    one currently kept for that key:

    * equal ``play_count``: the newly seen document is deleted;
    * difference relative to the newly seen ``play_count`` is at most 3%:
      the document with the smaller ``play_count`` is deleted;
    * otherwise (including a newly seen ``play_count`` of 0): both documents
      are tagged ``possible_duplicated=True`` and the one with the greater
      ``play_count`` is kept for later comparisons.

    Args:
        es: Elasticsearch client.
        index_new, doc_type_new: index / doc type to clean up.
        log: callable taking print-style positional arguments.
    """
    renming_query = {
        "query": {
            "bool": {
                "filter": {
                    "term": {
                        "platform.keyword": "人民日报"
                    }
                }
            }
        }
    }
    count_resp = es.search(index=index_new,
                           doc_type=doc_type_new,
                           body=renming_query,
                           request_timeout=300,
                           size=1)
    log('There are total', count_resp['hits']['total'], 'lines in platform 人民日报')

    renming_hits = scan(client=es,
                        query=renming_query,
                        index=index_new,
                        doc_type=doc_type_new,
                        request_timeout=300,
                        size=1000)
    tag_body = {"doc": {"possible_duplicated": True}}
    # title+releaser -> the document currently kept for that key; only the id
    # and play_count are needed for the comparisons below.
    kept_by_key = {}
    for hit in renming_hits:
        source = hit['_source']
        doc_id = hit['_id']
        play_count = source['play_count']
        key = source['title'] + (source.get("releaser") or "")
        current = {'_id': doc_id, 'play_count': play_count}

        if key not in kept_by_key:
            kept_by_key[key] = current
            continue

        kept_id = kept_by_key[key]['_id']
        kept_play_count = kept_by_key[key]['play_count']

        if play_count == kept_play_count:
            es.delete(index=index_new, doc_type=doc_type_new,
                      id=doc_id, request_timeout=300)
        elif (play_count != 0
              and abs(kept_play_count - play_count) / play_count <= 0.03):
            if play_count < kept_play_count:
                es.delete(index=index_new, doc_type=doc_type_new,
                          id=doc_id, request_timeout=300)
            else:
                es.delete(index=index_new, doc_type=doc_type_new,
                          id=kept_id, request_timeout=300)
                kept_by_key[key] = current
        else:
            # Reached when the difference exceeds 3%, or when play_count is 0
            # while the kept one is not (equality was handled above).
            es.update(index=index_new, doc_type=doc_type_new,
                      id=kept_id,
                      body=tag_body, request_timeout=300)
            es.update(index=index_new, doc_type=doc_type_new,
                      id=doc_id,
                      body=tag_body, request_timeout=300)
            if play_count > kept_play_count:
                kept_by_key[key] = current


def monthly_aggregated_daily_url_general(index_pre, doc_type_pre, index_new, doc_type_new,
                                         threads_num=5, platform_list=None):
    """Copy documents from one index/doc type into another, keyed by cal_doc_id.

    Steps:
      1. Query ``index_pre/doc_type_pre`` for documents whose ``release_time``
         lies between 2000-01-01 and 2020-01-01 (parsed as naive local time),
         with a per-day histogram.
      2. Split the histogram into contiguous ``release_time`` ranges of roughly
         equal document count and copy each range in its own thread.
      3. After all threads finish, de-duplicate documents of platform
         人民日报 in the target index.

    Progress is appended to a new log file under the hard-coded log directory.

    Args:
        index_pre: source index name.
        doc_type_pre: source doc type.
        index_new: target index name.
        doc_type_new: target doc type (also part of the log file name).
        threads_num: desired number of worker threads; values below 1 are
            treated as 1. The actual number equals the number of ranges the
            histogram was split into.
        platform_list: optional iterable of ``platform.keyword`` values. When
            non-empty, only documents of these platforms are counted AND
            copied. ``None`` or empty means all platforms.

    Raises:
        SystemExit: when the histogram query returns no buckets.
    """
    import threading

    hosts = '192.168.17.11'
    port = 9200
    es = elasticsearch.Elasticsearch(hosts=hosts, port=port)

    log_path = ('/home/hanye/project_data/Python/Projects/proj-short-videos/write-data-into-es/log/')
    log_fn = ('update_general_daily-url_short-video_'
              + doc_type_new + '_'
              + datetime.datetime.now().isoformat().replace(':', '-'))
    log_lock = threading.Lock()

    with open(log_path+log_fn, 'a', encoding='gb18030') as f_log:

        def log(*args):
            # One lock for every writer so lines from different threads are
            # never interleaved inside the shared log file.
            with log_lock:
                print(*args, file=f_log)

        log('log starts at', datetime.datetime.now())
        log('Aggregate data from %s into %s' % (doc_type_pre, doc_type_new))
        log('doc_type_new:', doc_type_new)

        # A single `terms` clause matches ANY of the platforms; separate
        # `term` clauses inside bool.filter would all have to match at once.
        platform_values = list(platform_list) if platform_list else []

        def build_filters(range_clause):
            filters = [{"range": {"release_time": range_clause}}]
            if platform_values:
                filters.append(
                    {"terms": {"platform.keyword": list(platform_values)}})
            return filters

        iso_format = '%Y-%m-%dT%H:%M:%S'
        release_time_start_ts = int(datetime.datetime.strptime(
            '2000-01-01T00:00:00', iso_format).timestamp()*1e3)
        release_time_end_ts = int(datetime.datetime.strptime(
            '2020-01-01T00:00:00', iso_format).timestamp()*1e3)

        # 1 total hits and their per-day distribution over release_time
        overview_body = {
            "query": {
                "bool": {
                    "filter": build_filters({"gte": release_time_start_ts,
                                             "lte": release_time_end_ts}),
                }
            },
            "size": 2,
            "aggs": {
                "release_time_distribution": {
                    "date_histogram": {
                        "field": "release_time",
                        "interval": "day"
                    }
                }
            },
        }
        search_resp = es.search(index=index_pre,
                                doc_type=doc_type_pre,
                                body=overview_body,
                                request_timeout=300)
        search_hits = search_resp['hits']['total']
        log('search hits', search_hits)

        # 2 one release_time range per worker thread
        buckets = search_resp['aggregations']['release_time_distribution']['buckets']
        release_time_range_Lst = _split_release_time_ranges(
            buckets, search_hits, threads_num)
        if not release_time_range_Lst:
            sys.exit('Fetch time out of boundary, system exit.')
        log('release_time_range_Lst:\n', release_time_range_Lst)

        def run_worker(thread_id, query_body):
            # Whatever happens inside the worker is logged here, so a failing
            # thread is visible in the log and never blocks the main thread.
            try:
                _copy_release_time_range(es, thread_id, query_body,
                                         index_pre, doc_type_pre,
                                         index_new, doc_type_new, log)
            except Exception as exc:
                log('[', thread_id, ']', 'thread aborted by unhandled error:',
                    repr(exc), datetime.datetime.now())
            finally:
                log('[', thread_id, '] thread exits.', datetime.datetime.now())

        # 3 start one worker per range and wait for all of them
        workers = []
        for thread_id, range_seg in enumerate(release_time_range_Lst):
            search_body_in_thread = {
                "query": {
                    "bool": {
                        # Same platform restriction as the overview query, so
                        # only the requested platforms are copied.
                        "filter": build_filters({"gte": range_seg['start'],
                                                 "lt": range_seg['end']}),
                    }
                }
            }
            # daemon=True: workers do not keep the process alive if the main
            # thread is interrupted.
            worker = threading.Thread(target=run_worker,
                                      args=(thread_id, search_body_in_thread),
                                      daemon=True)
            worker.start()
            workers.append(worker)

        for worker in workers:
            worker.join()

        log('Main thread exiting.', datetime.datetime.now())

        # 4 deal with 人民日报 after multi-threads bulk write
        _dedupe_renming_documents(es, index_new, doc_type_new, log)
