# -*- coding: utf-8 -*-
"""
Created on Fri Sep  1 12:18:27 2017

 Note that the all-time-url body is built from the same data as the one for
 daily-url. This imposes a restriction: using this program to update
 all-time-url is only correct when the csv data is the latest one, or at
 least newer than the data already in es.
 
Oct 05 2017
Remove the code for calculating daily net_inc values for daily-url

Oct 23 2017
Add back calculating for daily net_inc values, using GET from daily-url.
Tested.

Dec 27 2017
Add a new field 'fetchT_minus_releaseT' representing the distance in milliseconds
from release_time to fetch_time (fetch_time minus release_time). Write this field
into daily-url when retrieving data from source.


@author: hanye
"""

#import os
from elasticsearch import Elasticsearch
#from esCCR import EsUtils
import datetime
import json
import sys
from elasticsearch import exceptions
from func_calculate_toutiao_video_id import calculate_toutiao_video_id
#import re
from func_form_dict_from_csv_line import form_dict_from_csv_line


def write_es_from_qingbo_url_csv(csv_file_path, csv_filename_Lst, f_log=sys.stdout):
    """Load qingbo url csv exports into the 'short-video-production' es index.

    Every csv file is read record by record (a record ends with '"\\n' and may
    span several physical lines), parsed with form_dict_from_csv_line and
    written to es in batches of 1000 records. Each batch produces:

    * 'daily-url' docs for the fetch day; when several records map to the
      same _id, the one with the greatest fetch_time is kept,
    * rewritten 'daily-url' docs of the previous day, carrying the daily
      net_inc_* values (today's accumulated counts minus the previous day's),
    * 'all-time-url' docs, one index action per successfully parsed record.

    Note that the all-time-url body is built from the same data as the
    daily-url one, so updating all-time-url is only correct when the csv data
    is newer than the data already in es.

    Args:
        csv_file_path: directory prefix; it is concatenated as-is with each
            file name, so it has to end with a path separator.
        csv_filename_Lst: iterable of csv file names to load.
        f_log: writable text stream receiving progress and error messages.

    Returns:
        None.
    """
    index = 'short-video-production'
    doc_type_daily_url = 'daily-url'
    doc_type_all_time_url = 'all-time-url'
    batch_size = 1000

    cal_base_dict = {'history': 'historical_data',
                     'no_need': 'no_need',
                     'empirical': 'empirial_data',
                     'historical_absent': 'historical_data_absent',
                     'wait_for_tomorrow': 'wait_for_tomorrow'}

    # es clients are created lazily on the first non-empty batch and then
    # reused, instead of building two new clients for every batch.
    es_read = None
    es_write = None

    def empirical_ratio(fetch_days_after_release):
        """Return the share of accumulated counts attributed to the fetch day.

        Currently unused: calculating net_inc_* by empirical ratio was stopped
        on 2017-12-25 (asked by Ms Tianli). Kept so that it can be re-enabled.
        """
        if fetch_days_after_release <= 1:
            return 0.4  # 1-0.6
        elif fetch_days_after_release <= 2:
            return 0.2  # 1-0.8
        elif fetch_days_after_release <= 3:
            return 0.1  # 1-0.9
        else:
            return 0

    def now_ms():
        """Return the current local time as epoch milliseconds."""
        return int(datetime.datetime.now().timestamp()*1e3)

    def to_epoch_ms(time_str):
        """Convert 'YYYY-mm-dd HH:MM:SS' to epoch ms; too short strings give 0.

        Raises ValueError (handled by the caller) on a malformed time string.
        """
        if len(time_str) >= 14:
            parsed_T = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
            return int(parsed_T.timestamp()*1e3)
        return 0

    def bulk_lines_for(doc_id, source_json_str):
        """Return the action + source lines of one bulk 'index' operation.

        The action line is serialized with json.dumps so that ids holding
        quotes or backslashes (ids are built from urls) stay valid JSON.
        """
        action_json_str = json.dumps({'index': {'_id': doc_id}}, ensure_ascii=False)
        return action_json_str+'\n'+source_json_str+'\n'

    def bulk_write(doc_type, bulk_lines):
        """Send one bulk request and report item-level failures to f_log."""
        response = es_write.bulk(body=''.join(bulk_lines),
                                 index=index, doc_type=doc_type,
                                 request_timeout=200)
        # bulk() answers with HTTP 200 even when single items failed; the
        # failures are only visible through the 'errors' flag of the response.
        if isinstance(response, dict) and response.get('errors'):
            print('bulk write into', doc_type, 'reported item errors', file=f_log)

    def write_es(data_Lst):
        """Write one batch of parsed csv records into daily-url and all-time-url."""
        nonlocal es_read, es_write

        # Nothing to do for an empty batch. Without this guard the bulk call
        # for all-time-url got an empty body, which the es client rejects, so
        # the final flush failed whenever the number of records was a multiple
        # of the batch size.
        if not data_Lst:
            return

        if es_read is None:
            # NOTE(review): host and credentials are hard-coded in source;
            # they should be moved to configuration / environment.
            ccr_host = '192.168.17.11'
            ccr_port = 9200
            user = 'elastic'
            passwd = 'hy_csm_16'
            http_auth = (user, passwd)
            es_read = Elasticsearch(hosts=ccr_host, port=ccr_port, http_auth=http_auth)
            es_write = Elasticsearch(hosts=ccr_host, port=ccr_port, http_auth=http_auth)

        # latest data dict per _id for TODAY's daily-url
        latest_line_for_TODAY_daily_url = {}
        # previous day's docs to be rewritten with daily net_inc values
        line_to_update_for_PREVIOUS_DAY_daily_url = {}
        # bulk lines for all-time-url, in csv order
        bulk_lines_all_time_url = []

        # counters to see how many lines have been calculated by what means
        # (added on Oct 23 2017)
        counter_no_need = 0
        counter_history = 0
        counter_historical_absent = 0
        counter_all_time_url = 0

        platform_by_site_name = {'今日头条': 'toutiao', '快手': 'kwai'}
        video_type_by_id = {'1': 'live-video', '0': 'video'}

        for line in data_Lst:
            if len(line) < 10 or line['url'] == '':
                print('wrong format,', line, file=f_log)
                continue

            try:
                ts_every_line = now_ms()
                fetch_time_ts = to_epoch_ms(line['update_time'])
                release_time_ts = to_epoch_ms(line['posttime'])

                site_name = line['site_name']
                platform_name = platform_by_site_name.get(site_name, site_name)
                video_type_str = video_type_by_id.get(line.get('video_type'), '')

                data_dict = {
                    'data_provider': 'qingbo',
                    'duration': int(line['play_time']),
                    'favorite_count': int(line['favourites_count']),
                    'fetch_time': fetch_time_ts,
                    'platform': platform_name,
                    'play_count': int(line['play_count']),
                    'release_time': release_time_ts,
                    'releaser': line['author'],
                    'title': line.get('title', ''),
                    'url': line['url'],
                    'comment_count': int(line['comments_count']),
                    'channel': line['subdomain'],
                    'timestamp': ts_every_line,
                    'repost_count': int(line['reposts_count']),
                    'vid': line['vid'],
                }
                if video_type_str != '':
                    data_dict['video_type'] = video_type_str
                # added on 2017-12-27
                data_dict['fetchT_minus_releaseT'] = fetch_time_ts - release_time_ts

                fetch_time_T = datetime.datetime.fromtimestamp(fetch_time_ts/1e3)
                release_time_T = datetime.datetime.fromtimestamp(release_time_ts/1e3)
                fetch_time_day_str = fetch_time_T.date().isoformat()
                fetch_time_previous_day_str = (
                    fetch_time_T - datetime.timedelta(days=1)).date().isoformat()

                # build the all-time-url json body before calculating net
                # increase values, all-time-url does not need them
                data_json_str_line_all_time_url = json.dumps(data_dict, ensure_ascii=False)

                # use the video id as _id base for data from toutiao, in order
                # to merge with data from other providers in daily-url;
                # every other platform uses the url itself
                if platform_name == 'toutiao':
                    # switched to independent function on Nov 06 2017
                    id_all_time_url = 'toutiao_'+calculate_toutiao_video_id(data_dict['url'])
                else:
                    id_all_time_url = data_dict['url']
                id_daily_url = id_all_time_url+'_'+fetch_time_day_str
                id_daily_url_previous_day = id_all_time_url+'_'+fetch_time_previous_day_str

                if release_time_T.date() == fetch_time_T.date():
                    # fetch date is the same as release date: the accumulated
                    # values are the daily net_inc values FOR TODAY
                    counter_no_need += 1
                    data_dict.update({'net_inc_play_count': data_dict['play_count'],
                                      'net_inc_favorite_count': data_dict['favorite_count'],
                                      'net_inc_comment_count': data_dict['comment_count'],
                                      'cal_base': cal_base_dict['no_need'],
                                      'timestamp': now_ms()})
                else:
                    # fetch date is later than release date: look for the
                    # same video in daily-url on the previous day
                    try:
                        doc_in_daily_url_on_previous_day = es_read.get(
                            index=index,
                            doc_type=doc_type_daily_url,
                            id=id_daily_url_previous_day,
                            request_timeout=100)
                    except exceptions.NotFoundError:
                        # No data on the previous day. The video was released
                        # on or before the previous day, but we should not
                        # make up a whole piece of data for that day; the
                        # values between release and first fetch are lost.
                        # If another piece of data arrives tomorrow, today's
                        # doc is rewritten by the 'found on the previous day'
                        # rule and gets correct net_inc values tagged
                        # 'history'. Filling in empirical values instead was
                        # stopped on 2017-12-25, asked by Ms Tianli.
                        print('\ndoesn\'t found _id in daily-url:',
                              id_daily_url_previous_day, file=f_log)
                        counter_historical_absent += 1
                        data_dict.update({'cal_base': cal_base_dict['historical_absent'],
                                          'timestamp': now_ms()})
                    else:
                        previous_day_source = doc_in_daily_url_on_previous_day['_source']
                        play_count_previous_day = int(previous_day_source['play_count'])
                        favorite_count_previous_day = int(previous_day_source['favorite_count'])
                        comment_count_previous_day = int(previous_day_source['comment_count'])

                        # only counted once, as 'history'; the summary below
                        # reports absent history separately
                        counter_history += 1
                        previous_day_source.update({
                            'net_inc_play_count': data_dict['play_count']-play_count_previous_day,
                            'net_inc_favorite_count': data_dict['favorite_count']-favorite_count_previous_day,
                            'net_inc_comment_count': data_dict['comment_count']-comment_count_previous_day,
                            'cal_base': cal_base_dict['history'],
                            'timestamp': now_ms()})
                        line_to_update_for_PREVIOUS_DAY_daily_url[id_daily_url_previous_day] = previous_day_source

                        # added on Dec 26 2017
                        # mark cal_base of today's doc even if previous day has a hit
                        data_dict.update({'cal_base': cal_base_dict['wait_for_tomorrow'],
                                          'timestamp': now_ms()})

                # Whatever happened above, data fetched today is written into
                # today's daily-url (Dec 26 2017). For one _id only the record
                # with the greatest fetch_time is kept.
                present_line = latest_line_for_TODAY_daily_url.get(id_daily_url)
                if present_line is None or fetch_time_ts > present_line['fetch_time']:
                    latest_line_for_TODAY_daily_url[id_daily_url] = data_dict

            except ValueError:
                print('catched ValueError,', line, file=f_log)
                continue

            # only records processed without error reach all-time-url
            bulk_lines_all_time_url.append(
                bulk_lines_for(id_all_time_url, data_json_str_line_all_time_url))
            counter_all_time_url += 1

        # write daily-url FOR TODAY
        if latest_line_for_TODAY_daily_url:
            bulk_lines_today = [
                bulk_lines_for(ids, json.dumps(doc, ensure_ascii=False))
                for ids, doc in latest_line_for_TODAY_daily_url.items()]
            t_s = datetime.datetime.now()
            print('write TODAY\'s', doc_type_daily_url, 'with',
                  len(latest_line_for_TODAY_daily_url), 'docs. Starts at',
                  t_s, file=f_log)
            bulk_write(doc_type_daily_url, bulk_lines_today)
            print('time spent', datetime.datetime.now()-t_s, file=f_log)

        # write daily-url FOR THE PREVIOUS DAY
        if line_to_update_for_PREVIOUS_DAY_daily_url:
            bulk_lines_previous_day = [
                bulk_lines_for(ids_p, json.dumps(doc_p, ensure_ascii=False))
                for ids_p, doc_p in line_to_update_for_PREVIOUS_DAY_daily_url.items()]
            t_s = datetime.datetime.now()
            print('write PREVIOUS DAY\'s', doc_type_daily_url, 'with',
                  len(line_to_update_for_PREVIOUS_DAY_daily_url), 'docs. Starts at',
                  t_s, file=f_log)
            bulk_write(doc_type_daily_url, bulk_lines_previous_day)
            print('time spent', datetime.datetime.now()-t_s, file=f_log)

        # write all-time-url; skipped when no record of the batch was valid
        if bulk_lines_all_time_url:
            t_s = datetime.datetime.now()
            bulk_write(doc_type_all_time_url, bulk_lines_all_time_url)
            print('write', doc_type_all_time_url, 'with', counter_all_time_url, 'docs.', file=f_log)
            print('time spent', datetime.datetime.now()-t_s, file=f_log)

        print('*** In all the', len(latest_line_for_TODAY_daily_url)+len(line_to_update_for_PREVIOUS_DAY_daily_url),
              'lines,\nthere are', counter_no_need, 'ones which are released on the calculation date,',
              'so that the daily net_inc_* use accumulated values directly, and they are taged with \'no_need\'',
              '\nthere are', counter_history, 'ones which are released before the calculation date,',
              'and found their historical data, they are taged with \'history\'',
              '\nthere are', counter_historical_absent, 'ones which are released before the calculation date,',
              'and DIDNOT found their histrorical data, they are taged with \'historical_absent\'\n',
              file=f_log)

    for fn in csv_filename_Lst:
        # 'with' closes the file even when parsing or writing raises
        with open(csv_file_path+fn, 'r', encoding='utf-8') as f_csv:
            print('write es from', fn, file=f_log)

            header_Lst_c = []
            for hh in f_csv.readline().strip().split(','):
                hh_c = hh.replace(' ', '').replace('"', '')
                # replace 'id' with 'vid' to be consistent with previous codes
                if hh_c == 'id':
                    hh_c = 'vid'
                header_Lst_c.append(hh_c)

            data_Lst = []
            line_counter = 0
            ill_line_counter = 0
            line_cache = ''
            for line_str in f_csv:
                # a record is complete only when the physical line ends with
                # '"\n'; until then the pieces are accumulated
                line_cache += line_str
                if not line_str.endswith('"\n'):
                    continue
                line_str, line_cache = line_cache, ''

                data_line_dict = form_dict_from_csv_line(line_str, header_Lst_c)
                if data_line_dict is not None:
                    data_Lst.append(data_line_dict)
                else:
                    print('Ill-formed lines found:', line_str,
                          datetime.datetime.now(), file=f_log)
                    ill_line_counter += 1

                line_counter += 1

                if line_counter % batch_size == 0:
                    print('write es at line counter', line_counter, 'with',
                          len(data_Lst), 'lines, starts at', datetime.datetime.now(), file=f_log)
                    write_es(data_Lst)
                    data_Lst.clear()

            # Data after the last complete record (e.g. a final record without
            # trailing newline) is not parsed; report it instead of dropping
            # it silently.
            if line_cache.strip() != '':
                print('Incomplete trailing data dropped:', line_cache,
                      datetime.datetime.now(), file=f_log)

            write_es(data_Lst)

            print('Total ill lines are:', ill_line_counter,
                  datetime.datetime.now(), file=f_log)
        print(fn, 'total lines', line_counter, file=f_log)

