# -*- coding: utf-8 -*-
"""
Created on Wed Feb 20 13:53:27 2019

@author: zhouyujiang

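Daily task: search Elasticsearch for every keyword group listed in
key_word_list.ini and export the matching short-video data to CSV files.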
"""
import argparse
import configparser
import copy
import datetime
import pandas as pd 
import os
import elasticsearch
from elasticsearch.helpers import scan
from func_cal_NI_by_redis import func_cal_increment
from func_cal_NI_by_redis import func_cal_new_released_NI

# Elasticsearch connection settings shared by every query in this module.
# Each value can be overridden through an environment variable; the fallbacks
# are the values this script has always used, so existing deployments keep
# working without any configuration.
hosts = os.environ.get('ES_HOSTS', '192.168.17.11')
port = int(os.environ.get('ES_PORT', '80'))
user = os.environ.get('ES_USER', 'zhouyujiang')
# SECURITY: a password committed to source control should be treated as
# leaked. Supply ES_PASSWD from the environment, rotate this credential and
# then drop the literal fallback.
passwd = os.environ.get('ES_PASSWD', '8tM9JDN2LVxM')
http_auth = (user, passwd)
es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

def select_key_word_data(date_str, time_start_str,
                         keyword_config_file = r'/home/hanye/project_data/Python/Projects/proj-short-videos/write_data_into_es_new/config',
                         csv_path = r'/home/hanye/project_data/Python/Projects/Key-word/rzf'):
    """Export one day's short-video data for every configured keyword group.

    Keyword groups are read from the ``key_words`` option of the
    ``[key_word_list]`` section in ``key_word_list.ini`` (comma separated).
    Inside a group the words are separated by single spaces: the first word
    must appear in the title, and when further words are present at least one
    of them must appear as well.

    For each group the function

    1. calls ``func_cal_increment`` and ``func_cal_new_released_NI`` with the
       queries for the day before ``date_str`` and for ``date_str`` itself
       (presumably to refresh ``net_inc_play_count`` -- see
       ``func_cal_NI_by_redis``; not verifiable from this file),
    2. writes every matching document fetched on ``date_str`` to
       ``<keyword>-短视频数据-<date_str>.csv``, grouped by platform and sorted
       by ``play_count`` (descending) inside each platform,
    3. writes per-platform sums plus a grand total to
       ``<keyword>-短视频数据-<date_str>summary.csv``.

    Args:
        date_str: Fetch day to export, in ``%Y-%m-%d`` format.
        time_start_str: Earliest release date (``%Y-%m-%d``) a video may have
            to be included.
        keyword_config_file: Directory that contains ``key_word_list.ini``.
        csv_path: Directory under which the ``date_str`` output folder is
            created.

    Returns:
        The path of the output folder (``csv_path + '//' + date_str``), or
        ``None`` when ``date_str`` cannot be converted to a timestamp.

    Raises:
        ValueError: ``time_start_str`` is not a valid ``%Y-%m-%d`` date.
        OSError: The configuration file cannot be read or the output folder
            cannot be created.
        KeyError: The configuration file lacks the ``key_word_list`` section
            or its ``key_words`` option.
    """

    def func_turn_timestamp_to_str(data_dict):
        """Add readable ``*_H`` time/duration strings to one document (in place)."""
        # fetch_time / release_time are divided by 1000 before conversion, so
        # they are presumably epoch milliseconds -- TODO confirm against the
        # index mapping. Conversion uses the machine's local time zone.
        fetch_time_int = int(data_dict['fetch_time'] / 1000)
        fetch_time_H = datetime.datetime.fromtimestamp(fetch_time_int).isoformat(sep=' ')
        release_time_int = int(data_dict['release_time'] / 1000)
        release_time_H = datetime.datetime.fromtimestamp(release_time_int).isoformat(sep=' ')
        # duration is formatted as HH:MM:SS, i.e. it is treated as seconds.
        duration = int(data_dict['duration'])
        duration_H = '%02d:%02d:%02d' % (duration // 3600, duration % 3600 // 60, duration % 60)
        data_dict.update({'fetch_time_H': fetch_time_H,
                          "release_time_H": release_time_H,
                          "duration_H": duration_H})
        return data_dict

    # Columns (and their order) written to the per-keyword detail CSV.
    columns_list = ['platform', 'releaser', 'title', 'play_count', 'net_inc_play_count', 'comment_count',
                    'favorite_count', 'release_time_H', 'fetch_time_H', 'url']

    # Lower bound on release_time, in epoch milliseconds. An invalid
    # time_start_str raises ValueError, as it always has.
    time_start_re_dt = datetime.datetime.strptime(time_start_str, '%Y-%m-%d')
    time_start_re_ts = int(time_start_re_dt.timestamp() * 1000)

    # Validate date_str *before* touching the file system so that a bad date
    # no longer leaves a stray output folder behind.
    try:
        time_dt = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        one_day = datetime.timedelta(1)
        fetch_time_start_ts = int(time_dt.timestamp() * 1000)
        fetch_time_end_ts = int((time_dt + one_day).timestamp() * 1000)
        fetch_time_start_ts_last = int((time_dt - one_day).timestamp() * 1000)
    except (TypeError, ValueError, OverflowError, OSError):
        print('error in 转换为时间戳', date_str)
        return None
    # The previous day's window ends exactly where today's begins.
    fetch_time_end_ts_last = fetch_time_start_ts

    # Output folder for this day. The doubled separator is kept so the
    # returned path string stays identical for existing callers.
    csv_path_date = csv_path + '//' + date_str
    os.makedirs(csv_path_date, exist_ok=True)

    # Read the keyword configuration.
    legal_platforms_config_fn = 'key_word_list.ini'
    config = configparser.ConfigParser()
    with open(os.path.join(keyword_config_file, legal_platforms_config_fn),
              'r', encoding='utf-8') as conf_file:
        config.read_file(conf_file)
    key_word_list = config['key_word_list']['key_words'].split(',')

    # Per-platform sums requested alongside each keyword query.
    platform_sum_aggs = {
        "term_plt": {
            "terms": {"field": "platform.keyword", "size": 20},
            "aggs": {
                "sum_nic": {"sum": {"field": "net_inc_play_count"}},
                "sum_play": {"sum": {"field": "play_count"}},
            },
        }
    }

    # Query each keyword group and write its CSV files.
    for one_key in key_word_list:
        # Drop empty tokens produced by stray spaces (e.g. "a, b c" yields
        # " b c") and skip groups that contain no word at all.
        one_key_list = [word for word in one_key.split(' ') if word]
        if not one_key_list:
            continue
        csv_file_name = one_key.replace(' ', '')
        csv_file_path = csv_path_date + '//' + csv_file_name + '-短视频数据-' + date_str + '.csv'

        # The first word of a group is mandatory; the remaining words are
        # alternatives of which at least one has to match.
        target_key = one_key_list[0]
        bonus_keys = one_key_list[1:]

        # Documents fetched on date_str. The fetch_time filter is applied to
        # every group, including single-word ones (it used to be skipped for
        # those, which also left the previous-day query undefined).
        search_keyword_body = {
            "query": {
                "bool": {
                    "filter": [
                        {"range": {"release_time": {"gte": time_start_re_ts}}},
                        {"match_phrase": {"title": target_key}},
                        {"range": {"fetch_time": {"gte": fetch_time_start_ts,
                                                  "lt": fetch_time_end_ts}}},
                    ],
                    "should": [{"match_phrase": {"title": one}} for one in bonus_keys],
                }
            }
        }
        if bonus_keys:
            search_keyword_body['query']['bool']['minimum_should_match'] = 1

        # Same query for the day before, needed for the daily increment.
        search_keyword_body_last = copy.deepcopy(search_keyword_body)
        search_keyword_body_last['query']['bool']['filter'][2] = {
            "range": {"fetch_time": {"gte": fetch_time_start_ts_last,
                                     "lt": fetch_time_end_ts_last}}}

        print('当天',search_keyword_body)
        print('前一天',search_keyword_body_last)

        # Daily increment for videos that are not newly released.
        func_cal_increment(index_last='short-video-production',
                           doc_type_last='daily-url',
                           index_now='short-video-production',
                           doc_type_now='daily-url',
                           search_body=search_keyword_body_last,
                           now_body=search_keyword_body,
                           cal_type='N'
                           )

        # Daily increment for newly released videos: same query, but the
        # release_time lower bound is moved to the start of the previous day.
        search_keyword_body_new_released = copy.deepcopy(search_keyword_body)
        search_keyword_body_new_released['query']['bool']['filter'][0][
            'range']['release_time']['gte'] = fetch_time_start_ts_last
        func_cal_new_released_NI(cal_type='N',
                                 index_now='short-video-production',
                                 doc_type_now='daily-url',
                                 search_body=search_keyword_body_new_released
                                 )

        # Export every matching document to the detail CSV.
        print('详细每条数据', search_keyword_body)
        scan_re = scan(client=es, index='short-video-production',
                       doc_type='daily-url', query=search_keyword_body)
        # Group rows by platform in first-seen order. The mapping is rebuilt
        # for every keyword so rows of earlier keywords cannot leak into this
        # keyword's file.
        platform_dict = {}
        for one_scan in scan_re:
            new_line = func_turn_timestamp_to_str(one_scan['_source'])
            platform_dict.setdefault(new_line['platform'], []).append(new_line)

        re_list = []
        for platform_name, one_list in platform_dict.items():
            print(platform_name)
            try:
                one_list = sorted(one_list, key=lambda x: x['play_count'], reverse=True)
            except (KeyError, TypeError):
                # Missing or non-comparable play_count: keep fetch order.
                pass
            re_list.extend(one_list)

        # Passing the columns to the constructor keeps the export working
        # when the query returned no rows or a column is absent everywhere.
        data = pd.DataFrame(re_list, columns=columns_list)
        data.to_csv(csv_file_path, columns=columns_list, encoding='gb18030', index=False)

        # Export the per-platform sums to the summary CSV.
        search_sum_body = copy.deepcopy(search_keyword_body)
        search_sum_body['aggs'] = copy.deepcopy(platform_sum_aggs)
        print('聚合',search_sum_body)
        search_sum_re = es.search(index='short-video-production',
                                  doc_type='daily-url',
                                  body=search_sum_body)
        # An empty result simply yields no buckets, so the totals below are
        # always defined and the summary then holds just the 0 totals.
        sumagg_list = (search_sum_re.get('aggregations', {})
                       .get('term_plt', {})
                       .get('buckets', []))
        sum_list = []
        heji_play = 0
        heji_ni = 0
        heji_num = 0
        for bucket in sumagg_list:
            one_sum_dict = {'platform': bucket['key'],
                            'sum_playcount': bucket['sum_play']['value'],
                            'sum_net_inc_play_count': bucket['sum_nic']['value'],
                            'video_num': bucket['doc_count']}
            heji_play = heji_play + one_sum_dict['sum_playcount']
            heji_ni = heji_ni + one_sum_dict['sum_net_inc_play_count']
            heji_num = heji_num + one_sum_dict['video_num']
            sum_list.append(one_sum_dict)

        # pearvideo is ranked as if its play count were 0 and its play figures
        # are left blank in the output.
        # NOTE(review): the grand total still includes pearvideo's sums, as it
        # always did -- confirm that this is intended.
        sum_list = sorted(
            sum_list,
            key=lambda x: 0 if x['platform'] == 'pearvideo' else x['sum_playcount'],
            reverse=True)
        for o in sum_list:
            if o['platform'] == 'pearvideo':
                o['sum_playcount'] = ''
                o['sum_net_inc_play_count'] = ''
        # Blank spacer row followed by the grand total row.
        sum_list.append({'platform': '', 'sum_playcount': '',
                         'sum_net_inc_play_count': '',
                         'video_num': ''})
        sum_list.append({'platform': '合计', 'sum_playcount': heji_play,
                         'sum_net_inc_play_count': heji_ni,
                         'video_num': heji_num})
        sum_file_path = csv_path_date + '//' + csv_file_name + '-短视频数据-' + date_str + 'summary.csv'
        print(sum_file_path)
        sum_data = pd.DataFrame(sum_list)
        sum_data.to_csv(sum_file_path, encoding='gb18030', index=False)

    # Folder that now holds the CSV files.
    return csv_path_date
    
    
    
if __name__ == '__main__':
    # Manual run with fixed dates: export what was fetched on 2019-2-21 for
    # videos released on or after 2019-02-19.
    select_key_word_data(date_str='2019-2-21', time_start_str='2019-02-19')
        
        
        
            