# -*- coding: utf-8 -*-
"""
Created on Tue Apr 24 17:08:48 2018

calculate stats_type:observed, based on this rule:
releasers' observed data values equal to those stats_type:new_released values,
plus MNI values from videos which are published before upper boundary.

@author: hanye
"""

import datetime, hashlib, json
import elasticsearch
from elasticsearch.helpers import scan
#import redis


# 1 connection, target doc_type and log file setup
# (the mapping step itself is the commented-out section further down)

# Elasticsearch connection.
# NOTE(review): credentials are hard-coded in source; consider moving them
# to an environment variable or a config file kept out of version control.
hosts='192.168.17.11'
port=9200
http_auth=('hanye', 'GHFrhsFjylw5')
es=elasticsearch.Elasticsearch(hosts=hosts,
                               port=port,
                               http_auth=http_auth)

es_indices=elasticsearch.client.IndicesClient(es)

# Period being summed up: months month_lower_bdry..month_upper_bdry
# (both inclusive) of `year`, labelled with `date_flag`.
year=2018
month_lower_bdry=1
month_upper_bdry=3
date_flag='Q1'
#doc_type_new='releasers-2018-Q1'
doc_type_new='releasers-{:d}-{}'.format(year, date_flag)
stats_type='observed'


# Log file: one file per run, named after the target doc_type and the start
# time (':' replaced by '-' in the ISO timestamp).
log_path='/home/hanye/project_data/Python/Projects/proj-short-videos/write-data-into-es/log/'
log_fn='_'.join(['sum_up_releaser_data',
                 doc_type_new,
                 datetime.datetime.now().isoformat().replace(':','-')])
f_log=open(log_path+log_fn, 'a', encoding='gb18030')
#import sys
#f_log=sys.stdout
print('log starts at', datetime.datetime.now(), file=f_log)
print('doc_type_new', doc_type_new, file=f_log)

# Index / doc_type names used by the queries below.
index_releaser='releaser'
doc_type_releaser_ori='releasers-2018-Q1'
index_SV='short-video-production'
doc_type_SV='daily-url-2018-Q1'

## mapping
#print('mapping from %s to %s' % (doc_type_ori, doc_type_new),
#      datetime.datetime.now(), file=f_log)
#
#resp_get_mapping=es_indices.get_mapping(index=index, doc_type=doc_type_ori)
#
#mapping_propertity_dict=resp_get_mapping['releaser-v2']['mappings'][doc_type_ori]
#field_data_quarter={
#    "data_quarter": {
#            "type": "text",
#            "fields": {
#              "keyword": {
#                "type": "keyword",
#                "ignore_above": 256
#              }
#            }
#          }
#  }
#mapping_propertity_dict['properties'].update(field_data_quarter)
#
#put_mapping_resp=es_indices.put_mapping(doc_type=doc_type_new,
#                                    body=mapping_propertity_dict,
#                                    index=index)
#
#print('mapping result\n', put_mapping_resp, file=f_log)


# 2 get all releasers of each platform

## 2.1 get all platforms
# Terms aggregation on platform.keyword over the docs of `year` whose
# data_month lies within [month_lower_bdry, month_upper_bdry].
# The stats_type filter is left disabled (commented out), so each bucket's
# doc_count covers docs of every stats_type.
aggs_platforms_bd={
  "query": {
    "bool": {
      "filter": [
        {"term": {"data_year": year}},
        {"range": {"data_month": {"gte":month_lower_bdry, "lte":month_upper_bdry}}},
#        {"term": {"stats_type.keyword": stats_type}}
        ]
    }
  },
  "size": 1,
  "aggs": {
    "platforms": {
      "terms": {"field": "platform.keyword", "size": 50}
    }
  }
}
get_platforms_resp=es.search(index=index_releaser,
                             doc_type=doc_type_releaser_ori,
                             body=aggs_platforms_bd,
                             request_timeout=100)
platforms_Lst_raw=get_platforms_resp['aggregations']['platforms']['buckets']
# One dict per platform bucket; 'releaser_num' holds the bucket's doc_count.
platforms=[{'platform': bucket['key'], 'releaser_num': bucket['doc_count']}
           for bucket in platforms_Lst_raw]

## 2.2 get all releasers for each platform, add up, write into new _type
#r = redis.StrictRedis(host='192.168.17.26', port=6379, db=0)

# Numeric fields that are summed over all monthly docs of one releaser.
sum_fields=('video_num', 'play_count_sum',
            'favorite_count_sum', 'comment_count_sum')
# Number of releasers sent to Elasticsearch in one bulk request.
bulk_batch_size=1000

for pltl in platforms:
    platform=pltl['platform']
    print('Processing', platform, datetime.datetime.now(), file=f_log)

    # All docs of this platform with the configured stats_type whose
    # data_month falls inside the period.
    get_plt_releasers_bd={
      "query": {
        "bool": {
          "filter": [
            {"term": {"data_year": year}},
            {"range": {"data_month": {"gte":month_lower_bdry, "lte":month_upper_bdry}}},
            {"term": {"stats_type.keyword": stats_type}},
            {"term": {"platform.keyword": platform}}
            ]
        }
      }
    }
    # size=0 search: only used to get the number of matching docs for the
    # progress log. This counts DOCS, not unique releasers.
    get_total_releaser_num_resp=es.search(index=index_releaser, doc_type=doc_type_releaser_ori,
                                          body=get_plt_releasers_bd, size=0,
                                          request_timeout=100)
    total_releaser_num=get_total_releaser_num_resp['hits']['total']
    scan_releasers_resp=scan(client=es, query=get_plt_releasers_bd,
                             index=index_releaser, doc_type=doc_type_releaser_ori,
                             request_timeout=300)

    # releaser name -> dict with the summed-up values for the whole period
    platform_releaser_data={}
    line_counter=0
    for line in scan_releasers_resp:
        line_d=line['_source']
        line_counter+=1
        if line_counter%1000==0 or line_counter==total_releaser_num:
            print('[', platform, ']', 'Porcessing %d/%d' % (line_counter, total_releaser_num),
                  datetime.datetime.now(), file=f_log)
        releaser=line_d['releaser']
        if releaser not in platform_releaser_data:
            # First doc seen for this releaser: it seeds the sums and
            # supplies the non-summed fields.
            platform_releaser_data[releaser]={
                'releaser': releaser,
                'platform': platform,
                'data_year': line_d['data_year'],
                'video_num': line_d['video_num'],
                'play_count_sum': line_d['play_count_sum'],
                'favorite_count_sum': line_d['favorite_count_sum'],
                'comment_count_sum': line_d['comment_count_sum'],
                'is_consolidated': line_d['is_consolidated'],
                'parent_releaser': line_d['parent_releaser'],
                }
        else:
            summed=platform_releaser_data[releaser]
            for field in sum_fields:
                summed[field]+=line_d[field]

    # after all releasers have been iterated
    ## sort by play_count and assign rank no.
    releaser_dict_Lst=list(platform_releaser_data.values())
    print('[', platform, ']', 'start sorting ...',
          datetime.datetime.now(), file=f_log)
    tss=datetime.datetime.now()
    # pearvideo is ranked by favorite_count_sum instead of play_count_sum
    # (presumably because play counts are unavailable there - confirm);
    # the rank is still stored under 'ranking_by_play_count_sum'.
    sort_field='favorite_count_sum' if platform=='pearvideo' else 'play_count_sum'
    releaser_dict_Lst.sort(key=lambda d: d[sort_field], reverse=True)
    tse=datetime.datetime.now()
    tsd=tse-tss
    print('[', platform, ']', 'sorting done, takes', tsd,
          datetime.datetime.now(), file=f_log)
    for rank_no, line in enumerate(releaser_dict_Lst, start=1):
        line['ranking_by_play_count_sum']=rank_no

    ## bulk write into new _type
    # The last batch is detected against the number of UNIQUE releasers.
    # Comparing against total_releaser_num (a doc count) would leave the
    # final partial batch unsent whenever a releaser has several monthly docs.
    unique_releaser_num=len(releaser_dict_Lst)
    bulk_lines=[]
    for releaser_counter, releaser_dict in enumerate(releaser_dict_Lst, start=1):
        releaser_name=releaser_dict['releaser']
        releaser_name_md5=hashlib.md5(releaser_name.encode('utf-8')).hexdigest()
        # Deterministic _id, so re-running the script overwrites the same docs.
        doc_id='_'.join([releaser_name_md5, platform, str(year),
                         date_flag, stats_type])
        releaser_dict.update({'timestamp': int(datetime.datetime.now().timestamp()*1e3),
                              'field_data_quarter': date_flag,
                              'stats_type': stats_type})
        # json.dumps for the action line too, so the _id is always escaped
        # correctly.
        bulk_lines.append(json.dumps({'index': {'_id': doc_id}}, ensure_ascii=False))
        bulk_lines.append(json.dumps(releaser_dict, ensure_ascii=False))

        if releaser_counter%bulk_batch_size!=0 and releaser_counter!=unique_releaser_num:
            continue

        # The bulk body is newline-delimited JSON and must end with a newline.
        bulk_write_body='\n'.join(bulk_lines)+'\n'
        bulk_lines=[]
        t_bw_es_s=datetime.datetime.now()
        try:
            # index_releaser: the original referenced an undefined name `index`.
            bulk_resp=es.bulk(body=bulk_write_body,
                              index=index_releaser,
                              doc_type=doc_type_new,
                              request_timeout=300)
        except Exception as exc:
            # Best effort: log the failed batch and carry on with the next one.
            print('[', platform, ']', 'Caught Exception, the bulk_write_body is as follows:\n',
                  bulk_write_body, file=f_log)
            print('[', platform, ']', 'exception was:', repr(exc), file=f_log)
        else:
            # A bulk request can succeed as a whole while single items fail;
            # this is reported through the 'errors' flag of the response.
            if bulk_resp.get('errors'):
                print('[', platform, ']', 'bulk response reports item errors:\n',
                      bulk_resp, file=f_log)
        t_bw_es_e=datetime.datetime.now()
        t_bw_es_delta=t_bw_es_e-t_bw_es_s
        print('[', platform, ']', 'write into index', index_releaser,
              'doc_type', doc_type_new,
              releaser_counter, '/', unique_releaser_num,
              'time spent', t_bw_es_delta,
              'at', datetime.datetime.now(), file=f_log)



