# -*- coding: utf-8 -*-
"""
Created on Wed Aug 23 19:08:43 2017
@author: hanye

Oct 05 2017
remove the calculation of daily net_inc values

Oct 23 2017
Add back calculating for daily net_inc values, using GET from daily-url.
Tested.

Dec 08 2017
Update all-time-url at the same time when writing daily-url

Dec 27 2017
Add a new field 'fetchT_minus_releaseT' holding the distance in milliseconds
from release_time to fetch_time. This field is written into daily-url when
retrieving data from source.

Jan 11 2018
use scan method to retrieve data.
add retries.

"""

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import datetime
import time
import json
import copy
from elasticsearch import exceptions
from func_calculate_toutiao_video_id import calculate_toutiao_video_id
from func_calculate_newTudou_video_id import calculate_newTudou_video_id
from func_calculate_txxw_video_id import calculate_txxw_video_id



def _now_ms():
    """Return the current local time as integer epoch milliseconds."""
    return int(datetime.datetime.now().timestamp()*1e3)


def _accumulated_net_inc(data_dict):
    """Return net_inc_* fields that simply mirror the accumulated counters."""
    return {'net_inc_play_count': data_dict['play_count'],
            'net_inc_favorite_count': data_dict['favorite_count'],
            'net_inc_comment_count': data_dict['comment_count']}


def _build_bulk_body(docs_by_id):
    """Serialize ``{_id: source}`` into an Elasticsearch bulk ``index`` body.

    The action line is produced with ``json.dumps`` so that ids containing
    quotes or backslashes (the fallback id is a raw url) are escaped properly.
    Returns an empty string when ``docs_by_id`` is empty.
    """
    bulk_lines = []
    for doc_id, source in docs_by_id.items():
        bulk_lines.append(json.dumps({'index': {'_id': doc_id}}, ensure_ascii=False))
        bulk_lines.append(json.dumps(source, ensure_ascii=False))
    return ''.join(bulk_line + '\n' for bulk_line in bulk_lines)


def _load_platform_releasers(mapping_path):
    """Read the releaser-mapping CSV and return ``{platform: [releaser, ...]}``.

    The first line of the file is the header. The columns '序号' and '栏目'
    are skipped; every other column name is used as a platform and each
    non-empty cell in it as a releaser. Releasers are de-duplicated per
    platform and returned sorted so the processing order is deterministic.
    """
    releasers_by_platform = {}
    with open(mapping_path, 'r', encoding='gb18030') as f_releaser_mapping:
        header = f_releaser_mapping.readline().strip().split(',')
        for row in f_releaser_mapping:
            row_dict = dict(zip(header, row.strip().split(',')))
            for column_name, releaser in row_dict.items():
                if column_name in ('序号', '栏目') or releaser == '':
                    continue
                releasers_by_platform.setdefault(column_name, set()).add(releaser)
    return {platform: sorted(releasers)
            for platform, releasers in releasers_by_platform.items()}


def cal_daily_net_inc_for_missed_values(fetch_time_start_ts, fetch_time_end_ts,
                                          release_time_start_ts=1262275200000,
                                          release_time_end_ts=2524579200000):
    """Recalculate daily net_inc_* values for daily-url docs in a time window.

    For every (platform, releaser) pair listed in the releaser-mapping CSV,
    the daily-url docs whose fetch_time and release_time fall in the given
    half-open ranges are scanned from Elasticsearch, processed in batches and
    written back to the same index/doc_type. Progress and missing
    previous-day ids are appended to two log files named after the start date.

    Parameters
    ----------
    fetch_time_start_ts, fetch_time_end_ts : int
        Range [start, end) for the ``fetch_time`` field, epoch milliseconds.
    release_time_start_ts, release_time_end_ts : int
        Range [start, end) for the ``release_time`` field, epoch milliseconds.
        The defaults are 2010-01-01 and 2050-01-01 at UTC+8.

    Raises
    ------
    Whatever the file system or the Elasticsearch client raises; only
    ``NotFoundError`` on the previous-day lookup is handled here.
    """
    hosts = '192.168.17.11'
    port = 9200
    index = 'short-video-production'
    doc_type = 'daily-url'
    batch_size = 1000

    cal_base_dict = {'history': 'historical_data',
                     # cal_base 'no_need' changed to 'accumulated_values', May 28 2018
                     'no_need': 'accumulated_values',
                     'empirical': 'empirial_data',
                     'historical_absent': 'historical_data_absent',
                     'wait_for_tomorrow': 'wait_for_tomorrow'}

    def write_es_bulk(raw_data_collector):
        """Compute net_inc_* for one batch of daily-url sources and write it back.

        Each source dict is mutated in place. Depending on the case, today's
        doc and/or the previous day's doc is (re)indexed. Uses ``es``,
        ``f_proc_log`` and ``f_exception_log`` from the enclosing scope, which
        are bound before the first call.
        """
        # latest doc per today's _id, and previous-day docs to be re-indexed
        latest_line_for_TODAY_daily_url = {}
        line_to_update_for_PREVIOUS_DAY_daily_url = {}

        # counters of how many lines were handled by which rule (Oct 23 2017)
        counter_no_need = 0
        counter_history = 0
        counter_empirical = 0

        toutiao_videoID_set = set()

        chk_dict = {
                'total_hits_from_yunhe': 0,
                'toutiao_url_num': 0,
                'toutiao_videoID_num': 0,
                'lines_written_into_daily_url_today': 0,
                'lines_written_into_daily_url_previous_day': 0,
                'lines_written_into_all_time_url': 0,
                'lines_cal_no_need': 0,
                'lines_cal_history': 0,
                'lines_cal_empirical': 0,
                }
        chk_dict['total_hits_from_yunhe'] = len(raw_data_collector)

        for data_dict in raw_data_collector:
            data_dict['timestamp'] = _now_ms()

            # added on 2017-12-27
            fetch_time_ts = data_dict['fetch_time']
            release_time_ts = data_dict['release_time']
            try:
                fetchT_minus_releaseT = fetch_time_ts - release_time_ts
            except TypeError:
                # one of the two values is not a number (e.g. None)
                fetchT_minus_releaseT = None
            data_dict['fetchT_minus_releaseT'] = fetchT_minus_releaseT

            fetch_time_T = datetime.datetime.fromtimestamp(fetch_time_ts/1e3)
            fetch_time_day_str = fetch_time_T.isoformat()[:10]
            fetch_time_previous_day_T = fetch_time_T - datetime.timedelta(days=1)
            fetch_time_previous_day_str = fetch_time_previous_day_T.isoformat()[:10]

            release_time_T = datetime.datetime.fromtimestamp(release_time_ts/1e3)

            # lines without a url cannot be given an _id, so they are dropped
            if data_dict['url'] == '':
                continue

            # The _id in daily-url is '<prefix>_<YYYY-MM-DD>'. The prefix is a
            # platform-specific video id where one can be calculated, so that
            # data from different sources merge onto the same doc; otherwise
            # the url itself is used.
            platform = data_dict['platform']
            if platform == 'toutiao':
                video_ID = calculate_toutiao_video_id(data_dict['url'])
                chk_dict['toutiao_url_num'] += 1
                toutiao_videoID_set.add(video_ID)
                id_prefix = 'toutiao_' + video_ID
            elif platform == 'new_tudou':
                id_prefix = calculate_newTudou_video_id(data_dict['url'])
            elif platform == '腾讯新闻':
                id_prefix = 'txxw_' + calculate_txxw_video_id(data_dict)
            else:
                id_prefix = data_dict['url']
            id_daily_url = id_prefix + '_' + fetch_time_day_str
            id_daily_url_previous_day = id_prefix + '_' + fetch_time_previous_day_str

            if release_time_T.date() == fetch_time_T.date():
                # Released and fetched on the same calendar day: the
                # accumulated values ARE the daily net increase for today.
                data_dict.update(_accumulated_net_inc(data_dict))
                data_dict.update({'cal_base': cal_base_dict['no_need'],
                                  'timestamp': _now_ms()})
                counter_no_need += 1
            else:
                # Fetched on a later day than released: look for the doc of
                # the previous day under the same video/url.
                try:
                    doc_in_daily_url_on_previous_day = es.get(index=index,
                                                              doc_type=doc_type,
                                                              id=id_daily_url_previous_day,
                                                              request_timeout=100)
                except exceptions.NotFoundError:
                    # No doc on the previous day although the video was
                    # released on or before it. Example: fetched on
                    # 2017-10-23, released on or before 2017-10-22, nothing
                    # stored under 2017-10-22.
                    # - We do not make up a doc for the missing day; the data
                    #   between release and first catch is simply lost.
                    # - If another fetch arrives on the following day, the
                    #   'previous day found' rule will then write corrected
                    #   net_inc values tagged 'history' onto today's doc, so
                    #   whatever is written now may be overwritten later.
                    print('\ndoesn\'t found _id in daily-url:', id_daily_url_previous_day, file=f_exception_log)

                    # Added on May 28 2018: if the video is fetched less than
                    # 24 hours after its release (e.g. released May 25th
                    # 21:05:33, fetched May 26th 07:30:15), the accumulated
                    # values are still used as the daily net increase instead
                    # of tagging the line historical_data_absent.
                    if (fetch_time_T-release_time_T).total_seconds() < 24*3600:
                        data_dict.update(_accumulated_net_inc(data_dict))
                        data_dict.update({'cal_base': cal_base_dict['no_need'],
                                          'timestamp': _now_ms()})
                        counter_no_need += 1
                    else:
                        data_dict.update({'cal_base': cal_base_dict['historical_absent'],
                                          'timestamp': _now_ms()})
                        counter_empirical += 1
                else:
                    # Previous day found: the difference between today's and
                    # the previous day's accumulated values is stored on the
                    # PREVIOUS day's doc, tagged 'history'.
                    previous_day_source = doc_in_daily_url_on_previous_day['_source']
                    previous_day_source.update({
                        'net_inc_play_count': data_dict['play_count']-previous_day_source['play_count'],
                        'net_inc_favorite_count': data_dict['favorite_count']-previous_day_source['favorite_count'],
                        'net_inc_comment_count': data_dict['comment_count']-previous_day_source['comment_count'],
                        'cal_base': cal_base_dict['history'],
                        'timestamp': _now_ms()})
                    counter_history += 1
                    line_to_update_for_PREVIOUS_DAY_daily_url[id_daily_url_previous_day] = previous_day_source

                    # added on Dec 25 2017: mark cal_base on today's doc even
                    # if the previous day has a hit. Only counter_history is
                    # incremented for this line; counter_empirical is reserved
                    # for lines whose history is absent, as the summary log
                    # below states.
                    data_dict.update({'cal_base': cal_base_dict['wait_for_tomorrow'],
                                      'timestamp': _now_ms()})

            # Dec 08 2017: whatever branch was taken above, today's line is
            # written back. When several lines map to the same _id, keep the
            # one with the greatest fetch_time.
            present_line = latest_line_for_TODAY_daily_url.get(id_daily_url)
            if present_line is None or fetch_time_ts > present_line['fetch_time']:
                latest_line_for_TODAY_daily_url[id_daily_url] = data_dict

        print('build data dict list done', datetime.datetime.now(), file=f_proc_log)
        print('For all', len(raw_data_collector), 'lines,',
              'there are', counter_no_need,'ones which are released on the calculation date,',
              'so that the daily net_inc_* use accumulated values directly, and they are taged with \'no_need\'',
              '\nthere are', counter_history, 'ones which are released before the calculation date,',
              'and found their historical data, they are taged with \'history\'',
              '\nthere are', counter_empirical, 'ones which are released before the calculation date,',
              'and DIDNOT found their histrorical data, they are taged with \'historical_absent\'\n',
              file=f_proc_log)

        chk_dict['lines_written_into_daily_url_today'] = len(latest_line_for_TODAY_daily_url)
        chk_dict['lines_written_into_daily_url_previous_day'] = len(line_to_update_for_PREVIOUS_DAY_daily_url)

        # write daily-url FOR TODAY first, then FOR THE PREVIOUS DAY
        for label, docs_by_id in (('TODAY', latest_line_for_TODAY_daily_url),
                                  ('PREVIOUS DAY', line_to_update_for_PREVIOUS_DAY_daily_url)):
            bulk_body = _build_bulk_body(docs_by_id)
            if not bulk_body:
                continue
            t_s = datetime.datetime.now()
            es.bulk(body=bulk_body, index=index, doc_type=doc_type,
                    request_timeout=100)
            print('write %s\'s' % label, doc_type, 'with', len(docs_by_id), 'docs.', file=f_proc_log)
            print('time spent', datetime.datetime.now()-t_s, file=f_proc_log)

        chk_dict['toutiao_videoID_num'] = len(toutiao_videoID_set)
        chk_dict['lines_cal_no_need'] = counter_no_need
        chk_dict['lines_cal_history'] = counter_history
        chk_dict['lines_cal_empirical'] = counter_empirical

        print('\n****** chk_dict: ******\n', chk_dict, '\n###\n', file=f_proc_log)

    def flush_batch(batch, retrieved_lines, total_hits):
        """Log progress and hand one batch of sources to write_es_bulk."""
        print('retrieved lines', retrieved_lines, '/', total_hits,
              'and write into es at', datetime.datetime.now(), file=f_proc_log)
        write_es_bulk(batch)

    resource_path = '/home/hanye/project_data/Python/Projects/CCI/resource/'
    releaser_mapping_filename = '融合传播指数-短视频账号列表.csv'
    platform_releaser_dict = _load_platform_releasers(resource_path+releaser_mapping_filename)

    # define log files
    path = '/home/hanye/project_data/Python/Projects/proj-short-videos/write-data-into-es/log/'
    fetch_time_start_iso = datetime.datetime.fromtimestamp(fetch_time_start_ts/1e3).isoformat()[:10]
    proc_log_filename = 'cal_daily_net_inc_for_missed_values_'+fetch_time_start_iso+'-log'
    exception_log_filename = 'cal_daily_net_inc_for_missed_values_'+fetch_time_start_iso+'-exception-log'

    # One client is enough: searching, reading and writing all target the
    # same host, and building a client per batch only adds overhead.
    es = Elasticsearch(hosts=hosts, port=port)

    # `with` guarantees both logs are flushed and closed even if a request fails
    with open(path+proc_log_filename, 'a', encoding='gb18030') as f_proc_log, \
         open(path+exception_log_filename, 'a', encoding='gb18030') as f_exception_log:

        for platform, releasers in platform_releaser_dict.items():
            for releaser in releasers:
                search_body = {
                    "query": {
                        "bool": {
                            "filter": [
                                {"range": {"fetch_time": {"gte": fetch_time_start_ts,
                                                          "lt": fetch_time_end_ts}}},
                                {"range": {"release_time": {"gte": release_time_start_ts,
                                                            "lt": release_time_end_ts}}},
                                {"term": {"platform.keyword": platform}},
                                {"term": {"releaser.keyword": releaser}}
                            ]
                        }
                    }
                }

                # size=0: only the hit count is wanted, for the progress log
                search_resp = es.search(index=index, doc_type=doc_type, body=search_body,
                                        size=0,
                                        request_timeout=100)
                total_hits = search_resp['hits']['total']
                print('total_hits: %d %s - %s' % (total_hits, platform, releaser), datetime.datetime.now(), file=f_proc_log)

                scan_resp = scan(client=es, index=index, doc_type=doc_type, query=search_body,
                                 scroll=u'5m',
                                 request_timeout=200)

                data_collector = []
                line_counter = 0
                for hit in scan_resp:
                    line_counter += 1
                    data_collector.append(hit['_source'])
                    if len(data_collector) >= batch_size:
                        flush_batch(data_collector, line_counter, total_hits)
                        data_collector = []

                # The scan may yield a different number of docs than the count
                # above, so the tail is flushed unconditionally rather than
                # only when line_counter happens to equal total_hits.
                if data_collector:
                    flush_batch(data_collector, line_counter, total_hits)

        print('ALL DONE',file=f_proc_log)

