# -*- coding: utf-8 -*-
"""
Created on Mon Sep 25 14:34:45 2017

Calculate monthly net increase values for daily-url data.
1 If a video is released at or after 00:00 of the first day of the month, its
accumulated values are used as the monthly_net_inc values directly.
2 If a video is released before 00:00 of the first day of the month, the
accumulated values recorded on the day before the first day of the month are
subtracted from the current accumulated values.
2.1 If a video is released before the first day of the month but its historical
data for that day is not available, the earliest record fetched within the
month is used as the baseline instead; if no such record exists, the
monthly_net_inc values are left empty.

migrate to be a function on Oct 03 2017

Nov 18 2017
Add multi-threads.
Auto distribute data by release_time into threads.

Modify the release_time range to start 60 days before the passed-in date
(NOTE: the code below currently uses 90 days).
The reason is that data released earlier has little chance of having
historical data available for the net increase calculation. Searching for it
only adds search time, which slows down the whole calculation process.

@author: hanye
"""
#from esCCR import EsUtils
import datetime
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch import exceptions
import json
import _thread
import threading
from func_calculate_toutiao_video_id import calculate_toutiao_video_id
from func_calculate_newTudou_video_id import calculate_newTudou_video_id



def _pending_docs_query(release_time_range, fetch_time_range):
    """Build the query selecting docs that still need monthly net increase values.

    A fresh dict is returned on every call so that each thread gets its own
    query object.

    Args:
        release_time_range: range clause for ``release_time``,
            e.g. ``{"gte": ..., "lt": ...}`` (epoch milliseconds).
        fetch_time_range: range clause for ``fetch_time`` (epoch milliseconds).

    Returns:
        The value for the ``"query"`` key of a search body. Docs that already
        carry ``MNI_hist_data_id`` or are tagged ``accumulated_values`` are
        excluded.
    """
    return {
        "bool": {
            "must": [
                {"range": {"release_time": release_time_range}},
                {"range": {"fetch_time": fetch_time_range}},
            ],
            "must_not": [
                {"exists": {"field": "MNI_hist_data_id"}},
                {"term": {"monthly_cal_base.keyword": {"value": "accumulated_values"}}},
            ],
        }
    }


def _split_release_time_ranges(buckets, hits_total, threads_num):
    """Split daily release_time histogram buckets into per-thread segments.

    Consecutive buckets are grouped until a group holds more than 90% of
    ``hits_total // threads_num`` docs; the last bucket always closes a group.
    The number of segments may therefore differ from ``threads_num``.

    Args:
        buckets: non-empty list of date_histogram buckets, each with ``key``
            (epoch milliseconds of the bucket start) and ``doc_count``.
        hits_total: total number of docs covered by the buckets.
        threads_num: requested number of threads.

    Returns:
        A list of dicts with keys ``start``, ``end`` (segment covers
        ``start <= release_time < end``), ``end_idx`` and ``data_num``.
    """
    average_data_num = hits_total // threads_num
    last_idx = len(buckets) - 1
    segments = []
    docs_in_segment = 0
    for idx, bucket in enumerate(buckets):
        docs_in_segment += bucket['doc_count']
        if docs_in_segment > average_data_num * 0.9 or idx == last_idx:
            segments.append({'start': None, 'end': None,
                             'end_idx': idx, 'data_num': docs_in_segment})
            docs_in_segment = 0

    # Each segment ends where the next bucket starts; the last segment ends
    # one day after the start of the last bucket.
    segment_start = buckets[0]['key']
    for seg in segments:
        if seg['end_idx'] + 1 > last_idx:
            seg['end'] = int(buckets[seg['end_idx']]['key'] + 24*3600*1e3)
        else:
            seg['end'] = buckets[seg['end_idx'] + 1]['key']
        seg['start'] = segment_start
        segment_start = seg['end']
    return segments


def _run_monthly_net_inc(fetch_year, fetch_month, fetch_day, doc_type_target,
                         threads_num, f_progress_log, f_err_log):
    """Run the whole calculation, writing to the two already-open log files.

    The caller owns ``f_progress_log`` and ``f_err_log`` and is responsible
    for closing them.
    """
    date_passed_in = datetime.datetime(year=fetch_year, month=fetch_month, day=fetch_day)
    first_day_of_the_month_T = datetime.datetime(year=fetch_year, month=fetch_month, day=1)
    the_day_before_first_day_of_the_month_str = (
        first_day_of_the_month_T - datetime.timedelta(days=1)).isoformat()[:10]

    hosts = '192.168.17.11'
    port = 9200
    es_read = Elasticsearch(hosts=hosts, port=port)
    es_write = Elasticsearch(hosts=hosts, port=port)

    index = 'short-video-production'
    doc_type_daily_url = 'daily-url'
    batch_size = 1000

    print('************ log starts at', datetime.datetime.now(), file=f_progress_log)
    print('calculate monthly net_inc values', file=f_progress_log)

    t_s_cal = datetime.datetime.now()

    # 1 Select the docs to process.
    # The fetch_time window starts one day before the passed-in day and the
    # query skips docs that were already handled, so that docs which arrived
    # after the previous scheduled run are picked up by this run.
    fetch_time_start_ts = int((date_passed_in - datetime.timedelta(days=1)).timestamp()*1e3)
    fetch_time_end_ts = int((date_passed_in + datetime.timedelta(days=1)).timestamp()*1e3 - 1)
    fetch_time_start_iso = datetime.datetime.fromtimestamp(int(fetch_time_start_ts/1e3)).isoformat()
    fetch_time_end_iso = datetime.datetime.fromtimestamp(int(fetch_time_end_ts/1e3)).isoformat()

    # release_time is limited to 90 days before .. 360 days after the
    # passed-in date: older videos rarely have usable historical data and
    # only slow the run down.
    release_time_start_ts = int((date_passed_in - datetime.timedelta(days=90)).timestamp()*1000)
    release_time_end_ts = int((date_passed_in + datetime.timedelta(days=360)).timestamp()*1000)

    search_body = {
        "query": _pending_docs_query(
            {"gte": release_time_start_ts, "lte": release_time_end_ts},
            {"gte": fetch_time_start_ts, "lte": fetch_time_end_ts}),
        "size": 2,
        "aggs": {
            "release_time_distribution": {
                "date_histogram": {
                    "field": "release_time",
                    "interval": "day",
                    "time_zone": "Asia/Shanghai"
                }
            }
        }
    }
    search_response = es_read.search(index=index,
                                     doc_type=doc_type_target,
                                     body=search_body,
                                     request_timeout=100)
    hits_total = search_response['hits']['total']
    print('find total hits in _type', doc_type_target, hits_total, file=f_progress_log)
    if hits_total == 0:
        return

    # 1.1 Distribute the docs over threads by release_time.
    release_time_range_Lst = _split_release_time_ranges(
        search_response['aggregations']['release_time_distribution']['buckets'],
        hits_total, threads_num)
    # the split may yield more or fewer segments than requested
    threads_num = len(release_time_range_Lst)
    print('Actual threads_num', threads_num, file=f_progress_log)
    print('release_time_range_Lst:\n', release_time_range_Lst, file=f_progress_log)

    # global lock so that log lines from different threads do not interleave
    stdout_lock = threading.Lock()

    def log_line(thread_id, log_file, *fields):
        """Write one line prefixed with the thread id to log_file, under the lock."""
        with stdout_lock:
            print('[', thread_id, ']', *fields, file=log_file)

    def find_history_baseline(thread_id, url, platform, fetch_time):
        """Find the doc whose counters are the baseline for this month.

        Returns ``(monthly_cal_base, hit)`` where ``hit`` is the Elasticsearch
        hit (with ``_id`` and ``_source``) used as baseline, or None when
        ``monthly_cal_base`` is 'historical_data_absent'.
        """
        # the doc _id is built from a platform specific video id, not
        # necessarily from the raw url
        if platform == 'toutiao':
            videoID = calculate_toutiao_video_id(url)
            id_str = 'toutiao_' + videoID
        elif platform == 'new_tudou':
            videoID = calculate_newTudou_video_id(url)
            id_str = videoID
        else:
            videoID = url
            id_str = url
        history_id_in_daily_url = id_str + '_' + the_day_before_first_day_of_the_month_str

        # First choice: the doc fetched on the day before the first day of
        # the month.
        try:
            hit = es_read.get(index=index,
                              doc_type=doc_type_daily_url,
                              id=history_id_in_daily_url,
                              request_timeout=100)
        except exceptions.NotFoundError:
            log_line(thread_id, f_err_log, url,
                     '\ndoesn\'t found historical data on',
                     the_day_before_first_day_of_the_month_str)
        else:
            return 'historical_complete', hit

        # Fallback: the earliest doc of this url fetched between the first
        # day of the month and the fetch_time of the current doc. This is an
        # approximation that treats the earliest value inside the month as
        # the boundary value.
        range_clause = {"range": {"fetch_time": {
            "gte": int(first_day_of_the_month_T.timestamp()*1e3),
            "lte": fetch_time}}}
        if platform in ('toutiao', 'new_tudou'):
            bool_query = {"must": [range_clause,
                                   {"match_phrase": {"url": videoID}}]}
        else:
            bool_query = {"filter": [range_clause,
                                     {"term": {"url.keyword": url}}]}
        search_body_hist_uncomp = {
            "query": {"bool": bool_query},
            "sort": [{"fetch_time": {"order": "asc"}}]
        }
        search_resp = es_read.search(index=index,
                                     doc_type=doc_type_daily_url,
                                     body=search_body_hist_uncomp,
                                     sort='fetch_time:asc',
                                     size=10,
                                     request_timeout=100)
        # NOTE(review): at least two hits are required, presumably because
        # the doc being processed matches this query itself - confirm.
        if search_resp['hits']['total'] >= 2:
            return 'historical_uncomplete', search_resp['hits']['hits'][0]

        log_line(thread_id, f_err_log, url,
                 'found no historical data in present survey date range',
                 fetch_time_start_iso, 'to', fetch_time_end_iso)
        return 'historical_data_absent', None

    def process_batch(thread_id, batch, line_counter, hits_total_in_threads):
        """Calculate net increase values for one batch and bulk write them back."""
        counts = {'accumulated_values': 0,
                  'historical_complete': 0,
                  'historical_uncomplete': 0,
                  'historical_data_absent': 0}
        log_line(thread_id, f_progress_log, 'processing', line_counter, '/',
                 hits_total_in_threads, datetime.datetime.now())
        t_search_and_calculate_start = datetime.datetime.now()
        bulk_lines = []
        for raw_line in batch:
            current_id_in_daily_url = raw_line['_id']
            line_data = raw_line['_source']
            url = line_data['url']
            play_count_acc = int(line_data['play_count'])
            comment_count_acc = int(line_data['comment_count'])
            favorite_count_acc = int(line_data['favorite_count'])
            release_time = int(line_data['release_time'])
            release_time_T = datetime.datetime.fromtimestamp(int(release_time/1e3))
            fetch_time = int(line_data['fetch_time'])

            # 2 calculate monthly_net_inc values for this doc
            monthly_cal_base_fetch_time = None
            hist_data_id = None
            if release_time_T >= first_day_of_the_month_T:
                # 2.1 released within the month: the accumulated values are
                # the monthly net increase
                monthly_cal_base = 'accumulated_values'
                net_inc = (play_count_acc, comment_count_acc, favorite_count_acc)
            else:
                # 2.2 released before the month: subtract the baseline values
                monthly_cal_base, baseline_hit = find_history_baseline(
                    thread_id, url, line_data['platform'], fetch_time)
                if baseline_hit is None:
                    # no baseline: the net increase fields are written as null
                    net_inc = (None, None, None)
                else:
                    baseline = baseline_hit['_source']
                    hist_data_id = baseline_hit['_id']
                    net_inc = (play_count_acc - int(baseline['play_count']),
                               comment_count_acc - int(baseline['comment_count']),
                               favorite_count_acc - int(baseline['favorite_count']))
                    monthly_cal_base_fetch_time = baseline['fetch_time']
            counts[monthly_cal_base] += 1

            # 3 form action and body for bulk write
            # 3.1 update the doc; 'timestamp' records when it was calculated
            line_data.update({'timestamp': int(datetime.datetime.now().timestamp()*1e3)})
            line_data.update({'monthly_cal_base': monthly_cal_base})
            line_data.update({'monthly_net_inc_play_count': net_inc[0],
                              'monthly_net_inc_comment_count': net_inc[1],
                              'monthly_net_inc_favorite_count': net_inc[2]})
            if monthly_cal_base_fetch_time is not None and monthly_cal_base_fetch_time != '':
                line_data.update({'monthly_cal_base_fetch_time': monthly_cal_base_fetch_time})
            if hist_data_id is not None:
                line_data['MNI_hist_data_id'] = hist_data_id

            # 3.2 action line and source line; json.dumps escapes any quote
            # or backslash inside the _id
            bulk_lines.append(json.dumps({"index": {"_id": current_id_in_daily_url}},
                                         ensure_ascii=False))
            bulk_lines.append(json.dumps(line_data, ensure_ascii=False))

        log_line(thread_id, f_progress_log,
                 'lines check: lines_of_accumulated_values+lines_of_historical_complete+lines_of_historical_uncomplete+lines_of_historical_data_absent=',
                 sum(counts.values()),
                 '\nlines_of_accumulated_values', counts['accumulated_values'],
                 'lines_of_historical_complete', counts['historical_complete'],
                 'lines_of_historical_uncomplete', counts['historical_uncomplete'],
                 'lines_of_historical_data_absent', counts['historical_data_absent'])

        # 3.3 time spent for search and calculation
        t_search_and_calculate_delta = datetime.datetime.now() - t_search_and_calculate_start
        log_line(thread_id, f_progress_log,
                 'search and calculation for one data batch time spent',
                 t_search_and_calculate_delta)

        # 3.4 bulk write; every line of the body is newline terminated
        t_s = datetime.datetime.now()
        bulk_resp = es_write.bulk(body='\n'.join(bulk_lines) + '\n',
                                  index=index,
                                  doc_type=doc_type_target,
                                  request_timeout=100)
        if isinstance(bulk_resp, dict) and bulk_resp.get('errors'):
            # the request succeeded but some items were rejected
            log_line(thread_id, f_err_log,
                     'bulk write reported failed items in a batch of',
                     len(batch), 'docs')
        log_line(thread_id, f_progress_log, 'update', doc_type_target,
                 'with', len(batch), 'docs.')
        t_delta = datetime.datetime.now() - t_s
        log_line(thread_id, f_progress_log,
                 'bulk write for one data batch time spent', t_delta)
        log_line(thread_id, f_progress_log, 'overall line_count:',
                 line_counter, datetime.datetime.now())

    # 1.2 sub-function to run in each thread
    def cal_monthly_net_inc_in_thread(thread_id, search_body_in_thread):
        """Scan the docs of one release_time segment and process them in batches."""
        log_line(thread_id, f_progress_log,
                 'search_body_in_thread', search_body_in_thread)
        count_resp = es_read.search(index=index,
                                    doc_type=doc_type_target,
                                    body=search_body_in_thread,
                                    size=0,
                                    request_timeout=100)
        hits_total_in_threads = count_resp['hits']['total']
        log_line(thread_id, f_progress_log,
                 'hits_total_in_threads', hits_total_in_threads)
        scan_response = scan(client=es_read,
                             query=search_body_in_thread,
                             index=index,
                             doc_type=doc_type_target,
                             scroll='5m',
                             size=1000,
                             request_timeout=100)

        batch = []
        line_counter = 0
        for line in scan_response:
            batch.append(line)
            line_counter += 1
            if len(batch) >= batch_size:
                process_batch(thread_id, batch, line_counter, hits_total_in_threads)
                batch.clear()
        # Flush whatever is left. The number of scanned docs is not
        # guaranteed to equal hits_total_in_threads, so the remainder must
        # not depend on that count.
        if batch:
            process_batch(thread_id, batch, line_counter, hits_total_in_threads)

        log_line(thread_id, f_progress_log, 'Thread exits.', datetime.datetime.now())

    # One query object is built per thread so that no object is shared
    # between threads.
    waitfor = []
    for i, range_seg in enumerate(release_time_range_Lst):
        search_body_in_thread = {
            "query": _pending_docs_query(
                {"gte": range_seg['start'], "lt": range_seg['end']},
                {"gte": fetch_time_start_ts, "lte": fetch_time_end_ts})
        }
        thread = threading.Thread(target=cal_monthly_net_inc_in_thread,
                                  args=(i, search_body_in_thread))
        waitfor.append(thread)
        thread.start()

    for thread in waitfor:
        thread.join()

    t_delta_cal = datetime.datetime.now() - t_s_cal
    print('for all data of date', date_passed_in.isoformat()[:10],
          'total time cost:', t_delta_cal, file=f_progress_log)
    print('Main thread exiting.', file=f_progress_log)


def cal_monthly_net_inc(fetch_year, fetch_month, fetch_day,
                        doc_type_target='daily-url',
                        threads_num=5):
    """Calculate and store monthly net increase values for one fetch date.

    Docs of ``doc_type_target`` in the 'short-video-production' index are
    selected when their fetch_time lies between 00:00 of the day before the
    given date and the last millisecond of the given date (local time), their
    release_time lies between 90 days before and 360 days after the given
    date, and they neither carry ``MNI_hist_data_id`` nor are tagged
    ``accumulated_values``. Each selected doc is re-indexed with
    ``monthly_cal_base`` and the three ``monthly_net_inc_*`` fields:

    * released on or after the first day of the month: the accumulated values
      are used directly ('accumulated_values');
    * otherwise the values of the doc fetched on the day before the first day
      of the month are subtracted ('historical_complete');
    * if that doc does not exist, the earliest doc fetched within the month
      is used instead ('historical_uncomplete');
    * if there is none either, the fields are left null
      ('historical_data_absent').

    The work is split by release_time over several threads. Progress and
    error logs are appended to files under a fixed log directory; both files
    are closed when the function returns or raises.

    Args:
        fetch_year: year of the fetch date.
        fetch_month: month of the fetch date.
        fetch_day: day of the fetch date.
        doc_type_target: Elasticsearch doc type to read from and write to.
        threads_num: requested number of threads; the number actually used
            depends on how the docs are distributed over release_time.

    Returns:
        None.
    """
    log_path = '/home/hanye/project_data/Python/Projects/proj-short-videos/write-data-into-es/log/'
    log_name_prefix = ('calculate_monthly_net_inc_for_' + str(fetch_year)
                       + '_' + str(fetch_month) + '_' + str(fetch_day))
    progress_log_filename = (log_name_prefix + '_log_'
                             + datetime.datetime.now().isoformat().replace(':', '-'))
    err_log_filename = log_name_prefix + '_error_log'

    # the context manager closes both logs on every exit path, including the
    # early return when nothing has to be processed
    with open(log_path + progress_log_filename, 'a', encoding='gb18030') as f_progress_log, \
            open(log_path + err_log_filename, 'a', encoding='gb18030') as f_err_log:
        _run_monthly_net_inc(fetch_year, fetch_month, fetch_day,
                             doc_type_target, threads_num,
                             f_progress_log, f_err_log)
    



