# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 09:08:34 2018

Write data from FHTech

@author: hanye
"""

from elasticsearch import Elasticsearch
import datetime, time
import zipfile
from func_transfer_from_ftp import transfer_from_ftp
import sys, os
import logging, threading
import json
from func_cal_doc_id import cal_doc_id
from func_get_releaser_id import *


def func_write_es_from_fh_zip_file(main_logger_name, date_str, batch):
    """
    Fetch FHTech zip files from FTP, unzip them and bulk-write their records
    into Elasticsearch.

    Steps:
        1. Poll the FTP (via transfer_from_ftp) once a minute, for at most
           12 hours, until files matching the pattern of `batch` are returned.
        2. Unzip every transferred file into the local data directory.
        3. Start one thread per unzipped data file. Each thread reads the
           file line by line (one JSON document per line), screens the lines
           and writes them, roughly 1000 input lines at a time, into both
           short-video-all-time-url/all-time-url and
           short-video-time-track/time-track.

    No net-increase-value calculation is done here.

    Parameters
    ----------
    main_logger_name : str
        Name of the parent logger; child loggers are derived from it.
    date_str : str
        Date embedded in the zip file names, like '2018-04-11'.
    batch : str
        Must be one of the keys of pattern_dict, which are
        '00', '12', '18' (each optionally suffixed with '_M', '_T', '_P' or
        '_H' to select a single file prefix), 'All' and 'except_00'.
        Any other value (including the retired '08') raises KeyError before
        any I/O is done.

    Screening rules
    ---------------
    A line is dropped (and reported) when
        * it is not valid JSON,
        * it lacks any of 'url', 'platform', 'fetch_time', 'release_time',
        * its url is empty,
        * its timestamps do not satisfy 0<fetch_time<=fmt*1e3 and
          0<release_time<=fetch_time, where fmt stands for the data file
          modification time as returned by os.stat.

    Returns
    -------
    0 if no file showed up within the waiting limit, otherwise None.
    """

    # create logger
    logger = logging.getLogger('%s.fetch_and_unzip' % main_logger_name)

    f_log = sys.stdout

    # 1 read from ftp
    # File name patterns per batch. Raw strings keep the regex escapes (\. and
    # \w) from being parsed as (invalid) string escapes.
    #     '00' is the 00:00 batch (suffix N), '12' maps to U_1, '18' to U_2.
    # NOTE(review): the combined '00'/'12'/'18' patterns match prefixes M, T
    # and P only, while 'All' and 'except_00' also match H -- kept as found.
    pattern_dict = {
        '00': r'[MTP]_%s_N\.zip' % date_str,
        '00_M': r'M_%s_N\.zip' % date_str,
        '00_T': r'T_%s_N\.zip' % date_str,
        '00_P': r'P_%s_N\.zip' % date_str,
        '00_H': r'H_%s_N\.zip' % date_str,

        '12': r'[MTP]_%s_U_1\.zip' % date_str,
        '12_M': r'M_%s_U_1\.zip' % date_str,
        '12_T': r'T_%s_U_1\.zip' % date_str,
        '12_P': r'P_%s_U_1\.zip' % date_str,
        '12_H': r'H_%s_U_1\.zip' % date_str,

        '18': r'[MTP]_%s_U_2\.zip' % date_str,
        '18_M': r'M_%s_U_2\.zip' % date_str,
        '18_T': r'T_%s_U_2\.zip' % date_str,
        '18_P': r'P_%s_U_2\.zip' % date_str,
        '18_H': r'H_%s_U_2\.zip' % date_str,

        'All': r'[MTPH]_%s_[NU]\w{0,2}\.zip' % date_str,
        'except_00': r'[MTPH]_%s_U_[1-3]\.zip' % date_str,
    }

    # An unknown batch raises KeyError here, before anything is transferred.
    pattern = pattern_dict[batch]

    local_path = '/data0/fhtech-ftp-files/'
    logger.info('[%s] Transfer files from FTP ...', batch)
    trf_retry_wait_sec_limit = 3600*12
    trf_deadline_T = (datetime.datetime.now()
                      + datetime.timedelta(seconds=trf_retry_wait_sec_limit))
    while True:
        # transfer_from_ftp is treated as returning None while there is
        # nothing to process, otherwise a collection of file names located
        # under local_path -- presumably; verify against func_transfer_from_ftp.
        trf = transfer_from_ftp(pattern, 'FHTech', local_path, f_log=f_log)
        if trf is not None:
            logger.info('[%s] Got target files.', batch)
            break
        if datetime.datetime.now() >= trf_deadline_T:
            logger.info('[%s] Have waited for %d seconds, program exits.',
                        batch, trf_retry_wait_sec_limit)
            return 0
        logger.info('[%s] There is no file to be processed, wait ...', batch)
        time.sleep(60)

    logger.info('[%s] Transfer done. Got %d files:', batch, len(trf))
    logger.info('[%s] %s', batch, trf)

    # 2 unzip
    # Must use absolute path to make sure being safe in crontab task
    print('unzip zip file...', file=f_log)
    logger.info('[%s] unzip zip file...', batch)
    files_unziped = []
    for filen in trf:
        absolut_zip_f_path = local_path + filen
        t_s = datetime.datetime.now()
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file=absolut_zip_f_path) as zip_f:
            zip_f.extractall(path=local_path)
            member_names = zip_f.namelist()
        t_delta = datetime.datetime.now() - t_s
        print('unzip %s done, time spent' % filen, t_delta, file=f_log)
        logger.info('[%s] unzip %s done, time spent %s', batch, filen, t_delta)
        files_unziped.extend(member_names)
    logger.info('[%s] Got unziped files:', batch)
    logger.info('[%s] %s', batch, files_unziped)

    # 3 write into es
    # Every accepted line is written into all-time-url and time-track
    # simultaneously.

    def write_ATU_TK(thread_id, loggerName, data_Lst):
        """
        Bulk-index data_Lst into both target indices.

        ATU stands for all-time-url, which means data in
        short-video-all-time-url/all-time-url.
        TK stands for time-track, which means data in
        short-video-time-track/time-track.

        Passed-in data_Lst should be a list of python dict; the dicts are
        modified in place. Field check should be performed before passing in
        data_Lst. Each dict must carry 'video_channel' and 'video_describe'
        (renamed to 'channel' and 'describe'), otherwise KeyError is raised.
        """

        # create logger
        logger = logging.getLogger('%s.write_ATU_TK' % loggerName)

        print('Got input line %d from offline file input round.' % len(data_Lst))
        logger.info('[ %d ] Got input line %d from offline file input round.',
                    thread_id, len(data_Lst))

        es = Elasticsearch(host='192.168.17.11', port=9200)
        index_pro = 'short-video-all-time-url'
        doc_type_all = 'all-time-url'
        index_time_track = 'short-video-time-track'
        doc_type_time_track = 'time-track'

        # Bulk bodies are collected as lists and joined once, instead of
        # growing two big strings with += for every line.
        bulk_lines_all = []
        bulk_lines_time_track = []
        write_counter = 0

        for line_dict in data_Lst:
            line_dict['timestamp'] = int(datetime.datetime.now().timestamp()*1e3)
            line_dict['data_provider'] = 'fhtech'
            # releaser_id_str is best effort: a line without it is still written.
            try:
                line_dict['releaser_id_str'] = (
                    line_dict['platform'] + "_"
                    + get_releaser_id(platform=line_dict['platform'],
                                      releaserUrl=line_dict['releaserUrl']))
            except Exception as exc:
                logger.debug('[ %d ] releaser_id_str not set: %r', thread_id, exc)

            line_dict['channel'] = line_dict.pop('video_channel')
            line_dict['describe'] = line_dict.pop('video_describe')

            # Normalise 0/1 flags (int or str) into booleans.
            if 'isOriginal' in line_dict:
                if line_dict['isOriginal'] in (0, '0'):
                    line_dict['isOriginal'] = False
                elif line_dict['isOriginal'] in (1, '1'):
                    line_dict['isOriginal'] = True

            url = line_dict['url']
            if url == '':
                continue

            write_counter += 1
            platform = line_dict['platform']
            fetch_time_ts = line_dict['fetch_time']
            if platform == "haokan":
                # For haokan the url field may hold a bare numeric video id
                # instead of an url; rebuild the url and flag the line.
                try:
                    int(url)
                except (TypeError, ValueError):
                    pass
                else:
                    line_dict['video_id'] = url
                    url = "https://haokan.baidu.com/v?vid={0}".format(url)
                    line_dict['url'] = url
                    line_dict['wrong_url'] = "true"

            id_all = cal_doc_id(platform, url=url, data_dict=line_dict,
                                doc_id_type='all-time-url')
            id_time_track = cal_doc_id(platform, url=url, data_dict=line_dict,
                                       doc_id_type='time-track',
                                       fetch_time_ts=fetch_time_ts)
            data_str = json.dumps(line_dict, ensure_ascii=False)

            bulk_lines_time_track.append(
                '{"index": {"_id":"%s"}}' % id_time_track + '\n' + data_str + '\n')
            bulk_lines_all.append(
                '{"index": {"_id":"%s"}}' % id_all + '\n' + data_str + '\n')

        def _bulk_write(bulk_lines, index, doc_type):
            # Send one bulk request (skipped when there is nothing to send)
            # and report how many lines went to index/doc_type.
            if bulk_lines:
                es.bulk(body=''.join(bulk_lines),
                        index=index, doc_type=doc_type,
                        request_timeout=200)
            print('From all %d input lines, write %s/%s with %d lines.'
                  % (len(data_Lst), index, doc_type, write_counter))
            logger.info('[ %d ] From all %d input lines, write %s/%s with %d lines.',
                        thread_id, len(data_Lst), index, doc_type, write_counter)

        _bulk_write(bulk_lines_time_track, index_time_track, doc_type_time_track)
        _bulk_write(bulk_lines_all, index_pro, doc_type_all)

    def read_file_check_line_call_write_es_func(thread_id, loggerName, local_path, data_fn):
        """
        Read one unzipped data file (one JSON document per line), screen each
        line and pass the accepted ones to write_ATU_TK every 1000 input lines.
        """
        # create logger
        logger = logging.getLogger('%s.read_file_check_line_call_write_es_func' % loggerName)

        print('Writing file', data_fn)
        logger.info('[ %d ] Writing file %s', thread_id, data_fn)

        data_path = local_path + data_fn
        # File modification time; compared against fetch_time after *1e3.
        fmt = os.stat(data_path).st_mtime
        mush_have_fields = ['url', 'platform', 'fetch_time', 'release_time']
        data_Lst = []
        line_counter = 0
        # Explicit encoding: do not depend on the locale of the (cron) environment.
        with open(data_path, 'r', encoding='utf-8') as dataf:
            for line_counter, linestr in enumerate(dataf, start=1):
                # check each line before appending into list
                try:
                    # json.loads lost its `encoding` keyword in Python 3.9;
                    # linestr is already a decoded str.
                    fline_d = json.loads(linestr)
                except ValueError:
                    # A single broken line must not abort the whole file.
                    print('%s Line: [%d] Invalid JSON!' % (data_fn, line_counter))
                    logger.warning('[ %d ] %s Line: [%d] Invalid JSON!',
                                   thread_id, data_fn, line_counter)
                else:
                    if not all(fds in fline_d for fds in mush_have_fields):
                        print('%s Line: [%d] Important field absent!' % (data_fn, line_counter),
                              mush_have_fields)
                        logger.info('[ %d ] %s Line: [%d] Important field absent!',
                                    thread_id, data_fn, line_counter)
                    elif len(fline_d['url']) == 0:
                        print('%s Line: [%d] URL is empty!' % (data_fn, line_counter))
                        logger.info('[ %d ] %s Line: [%d] URL is empty!',
                                    thread_id, data_fn, line_counter)
                    else:
                        fts = fline_d['fetch_time']
                        rts = fline_d['release_time']
                        # check if timestamps are legal
                        if 0 < fts <= fmt*1e3 and 0 < rts <= fts:
                            data_Lst.append(fline_d)
                        else:
                            print('%s Line: [%d] fetch_time or release_time illegal!'
                                  % (data_fn, line_counter))
                            logger.info('[ %d ] %s Line: [%d] fetch_time or release_time illegal!',
                                        thread_id, data_fn, line_counter)

                # Flush on every 1000th input line, accepted or not.
                if line_counter % 1000 == 0:
                    print('Passing 1000 lines to write, line_counter %d' % line_counter, file=f_log)
                    logger.info('[ %d ] %s Passing 1000 lines to write, line_counter %d',
                                thread_id, data_fn, line_counter)
                    write_ATU_TK(thread_id, loggerName, data_Lst)
                    data_Lst.clear()

        if data_Lst:
            print('Passing the rest lines to write, line_counter %d' % line_counter, file=f_log)
            logger.info('[ %d ] %s Passing the rest lines to write, line_counter %d',
                        thread_id, data_fn, line_counter)
            write_ATU_TK(thread_id, loggerName, data_Lst)
            data_Lst.clear()

        print('Done writing file %s into es.' % data_fn)
        logger.info('[ %d ] %s Done writing, total line number is %d, thread exits.',
                    thread_id, data_fn, line_counter)

    # will start one thread for each data file
    threads_Lst = []
    for i, data_fn in enumerate(files_unziped):
        threadi = threading.Thread(target=read_file_check_line_call_write_es_func,
                                   args=(i, main_logger_name, local_path, data_fn))
        logger.info('[%s] Starting read_file_check_line_call_write_es threads: [ %d ]', batch, i)
        threadi.start()
        threads_Lst.append(threadi)

    # make sure all threads exits
    for threadi in threads_Lst:
        threadi.join()
    logger.info('[%s] All threads exit.', batch)
    logger.info('[%s] Batch process exits.', batch)


if __name__=='__main__':
    # Manual run for a fixed date and the 00:00 batch.
    # The function takes the parent logger name as its first argument; the
    # previous two-argument call raised TypeError.
    # basicConfig makes the INFO-level progress messages visible on the console.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    func_write_es_from_fh_zip_file('writeES_from_fhtech', '2018-04-09', '00')

