# -*- coding: utf-8 -*-
"""
Created on Tue Oct 10 10:56:17 2017

Search distinctly from daily-url in date range of 2017-09-01 to 2017-09-30,
and write into a new type named 'all-time-url-2017-09-30'

Calculate date_str by system date.
This task is supposed to run every month on the first of current month.
It will calculate / write / build the daily-url for last whole month.

Modified the _id of the monthly daily-url to be the url ONLY, on Nov 07 2017.
The previous mechanism could not ensure that the latest data was always the one saved.

Nov 07 2017
Commented out the part that deletes historical data,
because with the _id mechanism elasticsearch takes care of unique selection.

Nov 08 2017
Push data by day, rather than push a whole month together

Nov 15 2017
Bug fix: it used to iterate from the first day of the month EVERY DAY, which wasted time.
Changed to iterate one day at a time.

Added multi-threading when writing data within one fetch_time day.

@author: hanye
"""

import _thread
import datetime
import json
import sys
import threading
import time

import elasticsearch
from elasticsearch.helpers import scan

from func_cal_doc_id import cal_doc_id
from ReleaserMeta import ReleaseMeta
#from func_calculate_toutiao_video_id import calculate_toutiao_video_id
#from func_filter_duplicated_data_on_renming import filter_duplicated_data_on_renming
#from func_calculate_newTudou_video_id import calculate_newTudou_video_id

def define_index(year, month, day):
    """Return the name of the yearly index that a fetch date is written to.

    A fetch date of January 1st is assigned to the index of the previous
    year; every other date uses the index of its own year.
    """
    is_new_years_day = (month == 1 and day == 1)
    index_year = year - 1 if is_new_years_day else year
    return 'short-video-production-{}'.format(index_year)
    
def define_monthly_data_slice_type(year, month, day):
    """Work out which monthly slice a fetch date belongs to.

    The slice is named after the last day of a month: the month of the fetch
    date itself, or the previous month when the fetch date is the 1st.

    Returns:
        A tuple ``(doc_type_monthly_slice, last_day_of_the_month_T)`` where
        the first item is 'daily-url-YYYY-MM-DD' and the second is the
        datetime (at 00:00) of that last day.
    """
    # Start from the month that follows (year, month).
    if month == 12:
        boundary_year, boundary_month = year + 1, 1
    else:
        boundary_year, boundary_month = year, month + 1

    # Data fetched on the first day of a month goes into the monthly slice of
    # the previous month: data fetched on day-N is gathered into daily-url
    # only when the fetch time is around 00:00 of day-N, which reflects the
    # 'fact' of day-(N-1). So data fetched on, say, May 01 00:00 (actually
    # 00:00 - 07:00 a.m.) is the fact of Apr 30.
    if day == 1:
        if boundary_month > 1:
            boundary_month -= 1
        elif boundary_month == 1:
            boundary_year, boundary_month = boundary_year - 1, 12

    # The day before the first of the boundary month is the slice's last day.
    one_day = datetime.timedelta(days=1)
    last_day_of_the_month_T = (
        datetime.datetime(year=boundary_year, month=boundary_month, day=1)
        - one_day)
    doc_type_monthly_slice = 'daily-url-{}'.format(
        last_day_of_the_month_T.date().isoformat())
    return doc_type_monthly_slice, last_day_of_the_month_T

def find_last_day_in_the_month(year, month):
    """Return the last day of the given month as a datetime at 00:00.

    Computed as the first day of the following month minus one day, so month
    lengths and leap years are handled by ``datetime`` itself.
    """
    rolls_into_next_year = (month == 12)
    first_of_following_month = datetime.datetime(
        year + 1 if rolls_into_next_year else year,
        1 if rolls_into_next_year else month + 1,
        1)
    return first_of_following_month - datetime.timedelta(days=1)

def mapping_from_pre_type(es, index, doc_type_pre, doc_type_new, f_log,
                          template_index='short-video-production-2019',
                          template_doc_type='daily-url-2019-01-31',
                          template_concrete_index='short-video-production-2019-v4'):
    """Copy the mapping of a template monthly slice onto ``doc_type_new``.

    The mapping is read from ``template_index`` / ``template_doc_type`` and
    put on ``doc_type_new`` in ``index``.

    Args:
        es: Elasticsearch client.
        index: index that receives the mapping.
        doc_type_pre: not used by this function; kept so existing callers
            keep working.
        doc_type_new: doc type that receives the mapping.
        f_log: open text file that the put-mapping response is printed to.
        template_index: index (or alias) the template mapping is read from.
        template_doc_type: doc type whose mapping is copied.
        template_concrete_index: key under which the template is expected in
            the get-mapping response. If that key is missing, the first
            index in the response that carries ``template_doc_type`` is used
            instead, so the lookup does not break when the key differs.

    Raises:
        KeyError: if no index in the response carries ``template_doc_type``.
    """
    es_indices = elasticsearch.client.IndicesClient(es)
    resp_get_mapping = es_indices.get_mapping(index=template_index,
                                              doc_type=template_doc_type)
    print(resp_get_mapping)

    # The response is keyed by index name; prefer the expected key and only
    # search the other entries when it is absent.
    index_entry = resp_get_mapping.get(template_concrete_index)
    if index_entry is None:
        index_entry = next(
            (entry for entry in resp_get_mapping.values()
             if template_doc_type in entry.get('mappings', {})),
            None)
    if index_entry is None:
        raise KeyError('no mapping for doc type %r found via index %r'
                       % (template_doc_type, template_index))
    mapping_propertity_dict = index_entry['mappings'][template_doc_type]

    put_mapping_resp = es_indices.put_mapping(doc_type=doc_type_new,
                                              body=mapping_propertity_dict,
                                              index=index)
    print('mapping result\n', put_mapping_resp, file=f_log)


def monthly_aggregated_daily_url_for_one_fetch_day(year, month, day, threads_num=5):
    """Copy one fetch day of daily-url documents into its monthly slice type.

    Documents of type 'daily-url' whose fetch_time falls on the given day are
    read through the 'short-video-production' index name and bulk-indexed
    into the index returned by define_index(), under the doc type returned by
    define_monthly_data_slice_type(). The copy is split over several threads
    by release_time range. When ``day == 1``, documents released on or after
    that same day are additionally written into the slice named after the
    last day of the fetch day's own month.

    Progress is written to a new log file in the hard-coded log directory;
    the file is closed on every exit path.

    Args:
        year: year of the fetch day.
        month: month of the fetch day.
        day: day of month of the fetch day.
        threads_num: desired number of worker threads. It only steers how the
            release_time range is split; the number of threads actually
            started equals the number of segments found.

    Raises:
        SystemExit: if the search for the fetch day returns no histogram
            buckets.
        RuntimeError: if at least one worker thread failed. It is raised
            after all remaining work, so that a failed worker no longer
            leaves the caller waiting forever.
    """
    hosts = '192.168.17.11'
    port = 9200
    es = elasticsearch.Elasticsearch(hosts=hosts, port=port)
    # The index alias of daily-url is always short-video-production; there is
    # one slice per year, e.g. short-video-prod-v3.
    read_index = 'short-video-production'
    doc_type_pre = 'daily-url'
    write_index = define_index(year, month, day)
    doc_type_new, last_day_of_the_month_T = define_monthly_data_slice_type(year, month, day)

    log_path = ('/home/hanye/project_data/Python/Projects/proj-short-videos/'
                'write-data-into-es/log/')
    log_fn = ('update_monthly_daily-url_' + write_index + '_'
              + doc_type_new + '_'
              + datetime.datetime.now().isoformat().replace(':', '-'))
    # The context manager closes the log even when a step raises or exits.
    with open(log_path+log_fn, 'a', encoding='gb18030') as f_log:
        _aggregate_one_fetch_day(es, f_log, year, month, day, threads_num,
                                 read_index, doc_type_pre, write_index,
                                 doc_type_new, last_day_of_the_month_T)


def _split_release_time_ranges(buckets, total_hits, threads_num):
    """Group day buckets into contiguous release_time segments of similar size.

    ``buckets`` is the bucket list of the release_time date_histogram. A
    segment is closed once it holds more than 90% of
    ``total_hits // threads_num`` documents, or at the last bucket. Each
    segment is a dict with 'start' (inclusive) and 'end' (exclusive) taken
    from the bucket keys, plus 'end_idx' and 'data_num'. Returns an empty
    list when ``buckets`` is empty.
    """
    average_data_num = total_hits // threads_num
    last_idx = len(buckets) - 1

    # First pass: decide at which bucket each segment ends.
    segments = []
    pending_docs = 0
    for idx, bucket in enumerate(buckets):
        pending_docs += bucket['doc_count']
        if pending_docs > average_data_num * 0.9 or idx == last_idx:
            segments.append({'start': None, 'end': None,
                             'end_idx': idx,
                             'data_num': pending_docs})
            pending_docs = 0
    if not buckets:
        return segments

    # Second pass: each segment starts where the previous one ended and ends
    # at the key of the next bucket (or one day after the last bucket's key).
    next_start = buckets[0]['key']
    for segment in segments:
        following_idx = segment['end_idx'] + 1
        if following_idx > last_idx:
            segment['end'] = int(buckets[segment['end_idx']]['key'] + 24*3600*1e3)
        else:
            segment['end'] = buckets[following_idx]['key']
        segment['start'] = next_start
        next_start = segment['end']
    return segments


def _bulk_write(es, write_index, doc_type, bulk_lines):
    """Index accumulated action/source line pairs with one bulk request.

    Returns True when the bulk response reports per-item errors.
    """
    resp = es.bulk(index=write_index,
                   doc_type=doc_type,
                   body='\n'.join(bulk_lines) + '\n',
                   request_timeout=300)
    return isinstance(resp, dict) and bool(resp.get('errors'))


def _aggregate_one_fetch_day(es, f_log, year, month, day, threads_num,
                             read_index, doc_type_pre, write_index,
                             doc_type_new, last_day_of_the_month_T):
    """Run the copy for one fetch day, logging to the already open ``f_log``."""
    log_lock = threading.Lock()
    failed_threads = []

    def log(*parts):
        # One writer at a time, so lines from different threads do not mix.
        with log_lock:
            print(*parts, file=f_log)

    log('log starts at', datetime.datetime.now())
    log('doc_type_new', doc_type_new)

    # 1 put the mapping on the new type every time before writing data
    mapping_from_pre_type(es, write_index, doc_type_pre, doc_type_new, f_log)

    # 2 search data from daily-url
    # release_time window: the 100 days up to the end of the slice's month.
    release_time_end_T = last_day_of_the_month_T + datetime.timedelta(days=1)
    release_time_start_T = release_time_end_T - datetime.timedelta(days=100)
    release_time_start_ts = int(release_time_start_T.timestamp()*1e3)
    release_time_end_ts = int(release_time_end_T.timestamp()*1e3)

    # fetch_time window: the whole fetch day, upper bound inclusive.
    this_day_T = datetime.datetime(year=year, month=month, day=day)
    fetch_time_start_T = this_day_T
    fetch_time_end_T = this_day_T + datetime.timedelta(days=1)
    fetch_time_start_ts = int(fetch_time_start_T.timestamp()*1e3)
    fetch_time_end_ts = int(fetch_time_end_T.timestamp()*1e3-1)
    log('will write fetch time from ', fetch_time_start_T, "to", fetch_time_end_T,
        "will write release time from", release_time_start_T, "to", release_time_end_T)

    # 2.0 total hits of the fetch day and their distribution over release_time
    search_body = {
        "query": {
            "bool": {
                "must": [
                    {"range": {"fetch_time": {"gte": fetch_time_start_ts,
                                              "lte": fetch_time_end_ts}}},
                    {"range": {"release_time": {"gte": release_time_start_ts,
                                                "lt": release_time_end_ts}}}
                ],
            }
        },
        "size": 2,
        "aggs": {
            "release_time_distribution": {
                "date_histogram": {
                    "field": "release_time",
                    "interval": "day"
                }
            }
        },
    }
    search_resp = es.search(index=read_index,
                            doc_type=doc_type_pre,
                            body=search_body,
                            request_timeout=300)
    search_hits = search_resp['hits']['total']
    log('search hits', search_hits)

    # add releaser_name into index: releaser_meta 2019-12-16
    # Best effort: a failure is reported on stdout and does not stop the copy.
    try:
        scan_re = scan(client=es, index=read_index, doc_type=doc_type_pre, query=search_body)
        test = ReleaseMeta(scan=scan_re, index='releaser_meta', doc_type='doc')
        test.func_write_release_meta_into_es()
    except Exception as e:
        print("[error]", e, " in add releaser_name into index: releaser_meta")

    # 2.1 split the release_time range into one segment per worker thread
    data_distr_by_release_time_Lst = search_resp['aggregations']['release_time_distribution']['buckets']
    release_time_range_Lst = _split_release_time_ranges(
        data_distr_by_release_time_Lst, search_hits, threads_num)
    if not data_distr_by_release_time_Lst:
        sys.exit('Fetch time out of boundary, system exit.')
    log('release_time_range_Lst:\n', release_time_range_Lst)
    # NOTE(review): segment bounds come from the histogram bucket keys, not
    # from release_time_start_ts / release_time_end_ts, so the per-segment
    # queries below are not clamped to the release_time window of
    # search_body. Confirm whether that is intended.

    # The split may yield more or fewer segments than requested.
    threads_num = len(release_time_range_Lst)

    # 2.2 sub-functions that do the actual copy
    def copy_docs(thread_id, search_body_in_thread, target_doc_type):
        # Scan every hit of the query and bulk-index it into target_doc_type.
        count_resp = es.search(index=read_index,
                               doc_type=doc_type_pre,
                               body=search_body_in_thread,
                               size=0,
                               request_timeout=300)
        hits_total_in_threads = count_resp['hits']['total']
        with log_lock:
            print('[', thread_id, ']', 'search_body_in_threads',
                  search_body_in_thread, file=f_log)
            print('[', thread_id, ']', 'hits_total_in_threads',
                  hits_total_in_threads, file=f_log)
        if hits_total_in_threads <= 0:
            return

        scan_resp = scan(client=es,
                         index=read_index,
                         doc_type=doc_type_pre,
                         query=search_body_in_thread,
                         scroll='5m',
                         size=1000,
                         request_timeout=300)
        line_counter = 0
        bulk_lines = []
        for line in scan_resp:
            line_counter += 1
            line_dict = line['_source']
            d_id = cal_doc_id(platform=line_dict['platform'],
                              url=line_dict['url'],
                              doc_id_type='all-time-url',
                              data_dict=line_dict)
            if not isinstance(d_id, str):
                # Same failure mode as concatenating a non-string id.
                raise TypeError('cal_doc_id returned a non-string _id: %r'
                                % (d_id,))

            # update timestamp
            line_dict['timestamp'] = int(datetime.datetime.now().timestamp()
                                         * 1e3)

            # json.dumps escapes quotes and backslashes inside the id, which
            # plain string concatenation would leave as invalid JSON.
            bulk_lines.append(json.dumps({"index": {"_id": d_id}},
                                         ensure_ascii=False))
            bulk_lines.append(json.dumps(line_dict, ensure_ascii=False))
            if line_counter % 1000 == 0:
                t_s = datetime.datetime.now()
                had_errors = _bulk_write(es, write_index, target_doc_type,
                                         bulk_lines)
                t_delta = datetime.datetime.now() - t_s
                log('[', thread_id, ']', 'bulk write', line_counter,
                    '/', hits_total_in_threads,
                    'time spent', t_delta)
                if had_errors:
                    log('[', thread_id, ']',
                        'bulk response reported item errors at line',
                        line_counter)
                bulk_lines = []

        if bulk_lines:
            had_errors = _bulk_write(es, write_index, target_doc_type,
                                     bulk_lines)
            log('[', thread_id, ']', 'bulk write done',
                line_counter, '/', hits_total_in_threads)
            if had_errors:
                log('[', thread_id, ']',
                    'bulk response reported item errors in the last batch')

    def aggs_daily_url_in_threads(thread_id, search_body_in_thread, doc_type_new,
                                  is_multi_thread=True):
        if not is_multi_thread:
            # Called directly: errors propagate to the caller.
            copy_docs(thread_id, search_body_in_thread, doc_type_new)
            log('Done.', datetime.datetime.now())
            return
        try:
            copy_docs(thread_id, search_body_in_thread, doc_type_new)
        except Exception as exc:
            # Record the failure so that it is reported after the join,
            # then re-raise so the traceback is still printed.
            failed_threads.append(thread_id)
            log('[', thread_id, ']', 'thread failed:', repr(exc))
            raise
        finally:
            log('[', thread_id, '] thread exits.', datetime.datetime.now())

    # 2.3 start one worker per release_time segment of the fetch day
    log('\n****** fetch_time range:',
        datetime.datetime.fromtimestamp(int(fetch_time_start_ts/1e3)),
        'to',
        datetime.datetime.fromtimestamp(int(fetch_time_end_ts/1e3)),
        datetime.datetime.now(), '******\n')

    workers = []
    for thread_id, segment in enumerate(release_time_range_Lst):
        search_body_in_thread = {
            "query": {
                "bool": {
                    "must": [
                        {"range": {"release_time": {
                            "gte": segment['start'],
                            "lt": segment['end']}}
                        },
                        {"range": {"fetch_time": {
                            "gte": fetch_time_start_ts,
                            "lte": fetch_time_end_ts}}
                        }
                    ]
                }
            }
        }
        # Daemon threads: they do not keep the process alive when the main
        # thread is interrupted.
        worker = threading.Thread(
            target=aggs_daily_url_in_threads,
            args=(thread_id, search_body_in_thread, doc_type_new),
            daemon=True)
        worker.start()
        workers.append(worker)

    # join() returns when a worker ends for any reason, including an error.
    for worker in workers:
        worker.join()

    log('Multi-threads exiting.', datetime.datetime.now())

    # if day==1 and there are data released on the same day
    # put them into the monthly slice of the next month
    if day == 1:
        release_time_lower_bdr = int(
            datetime.datetime(year, month, day).timestamp()*1e3)
        day1_new_released_body = {
            "query": {
                "bool": {
                    "must": [
                        {"range": {"fetch_time": {"gte": fetch_time_start_ts,
                                                  "lte": fetch_time_end_ts}}},
                        {"range": {"release_time": {"gte": release_time_lower_bdr}}}
                    ],
                }
            },
            "size": 0,
        }
        day1_new_released_resp = es.search(index=read_index, doc_type=doc_type_pre,
                                           body=day1_new_released_body,
                                           request_timeout=100)
        day1_new_released_hits = day1_new_released_resp['hits']['total']
        if day1_new_released_hits > 0:
            last_day_in_the_month_according_to_fetch_day_T = find_last_day_in_the_month(year, month)
            doc_type_next_month = (
                'daily-url-%s'
                % last_day_in_the_month_according_to_fetch_day_T.isoformat()[:10])
            log('There are %d hits released on the fetch day for %s,'
                % (day1_new_released_hits, this_day_T),
                ('\nwill process these data in an extra thread with '
                 'the same function like multi-thread part,'),
                '\nand write data into _type: %s' % doc_type_next_month,
                datetime.datetime.now())
            # mapping
            mapping_from_pre_type(es, write_index, doc_type_pre, doc_type_next_month, f_log)
            # Runs in the calling thread; the label only tags its log lines
            # (previously the leftover loop index was reused here).
            aggs_daily_url_in_threads('day1-extra', day1_new_released_body,
                                      doc_type_next_month,
                                      is_multi_thread=False)
        else:
            log('No data are released on the fetch day for %s,' % this_day_T,
                'will proceed,', datetime.datetime.now())
    log('Done processing with day==1 and there are data released on the same day situation',
        datetime.datetime.now())

    # NOTE: a post-processing pass for platform "人民日报" (added Nov 28 2017)
    # used to sit here as commented-out code and has been removed. It scanned
    # the freshly written slice and, for documents sharing title + releaser,
    # deleted the one with the smaller play_count when the counts differed by
    # at most 3% (or were equal), and otherwise tagged both documents with
    # possible_duplicated=True.

    if failed_threads:
        log('Finished with failed threads:', failed_threads,
            datetime.datetime.now())
        raise RuntimeError('%d worker thread(s) failed: %s'
                           % (len(failed_threads), failed_threads))
    log('All done.', datetime.datetime.now())
