# func_cal_doc_id.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jun 20 09:19:12 2018

@author: hanye
"""

import hashlib

from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id


def vid_cal_func(platform):
    """Look up the video-id calculator registered for ``platform``.

    Returns the platform-specific calculator when ``platform`` is one of the
    registered keys below; for any other platform a fallback is returned that
    hands its argument back unchanged.
    """
    def general_vid_cal_func(url):
        # Fallback for unregistered platforms: the input itself serves as id.
        return url

    calculators = {
        'toutiao': calculate_toutiao_video_id,
        'new_tudou': calculate_newTudou_video_id,
        '腾讯视频': calculate_v_qq_video_id,
        'kwai': calculate_kwai_video_id_by_data_by_url,
        '腾讯新闻': calculate_txxw_video_id,
        "网易新闻": calculate_wangyi_news_id,
        "抖音": calculate_douyin_id,
        "haokan": calculate_haokan_id,
    }
    return calculators.get(platform, general_vid_cal_func)


def hash_name(name):
    """Return the hexadecimal MD5 digest of ``name`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(name.encode('utf-8'))
    return hasher.hexdigest()

def _doc_id_prefix(platform, vid_bare, doc_id_type):
    """Return the platform prefix for a doc id, or None to leave it unprefixed."""
    if platform in ('toutiao', 'new_tudou'):
        return platform
    if platform == '腾讯视频':
        return 'v_qq'
    if platform == '网易新闻':
        # Only 'time-track' skips the prefix when no bare id was resolved;
        # the other id types always prefix this platform.
        if doc_id_type == 'time-track' and vid_bare is None:
            return None
        return 'wyxw'
    # The remaining platforms are prefixed only when a bare id was resolved.
    if vid_bare is None:
        return None
    if platform == 'kwai':
        return 'kwai'
    if platform == '腾讯新闻':
        return 'txxw'
    # haokan ids are prefixed for 'all-time-url' only.
    if platform == 'haokan' and doc_id_type == 'all-time-url':
        return 'haokan'
    return None


def _join_doc_id(platform, vid_bare, data_dict, doc_id_type, suffix=None):
    """Join prefix, bare id, optional releaser id and optional suffix with '_'."""
    parts = []
    prefix = _doc_id_prefix(platform, vid_bare, doc_id_type)
    if prefix is not None:
        parts.append(prefix)
    # A missing bare id is rendered as the text 'None', as '%s' formatting does.
    parts.append(str(vid_bare))
    if platform == '腾讯视频':
        # data_dict may be absent when the caller passes only a url.
        releaser_id_str = (data_dict.get("releaser_id_str")
                           if data_dict is not None else None)
        if releaser_id_str:
            # The first five characters of releaser_id_str are dropped.
            parts.append(str(releaser_id_str[5:]))
    if suffix is not None:
        parts.append(str(suffix))
    return '_'.join(parts)


def cal_doc_id(platform, url=None,
               fetch_day_str=None,
               fetch_time_ts=None,
               data_dict=None,
               extra_str=None,
               doc_id_type='daily-url',
               **kwargs):
    """Build a document id for the given platform and id type.

    Parameters:
        platform: platform name. For doc_id_type 'releaser' it may be None,
            in which case data_dict['platform'] is used.
        url: value handed to the platform's id calculator. When None,
            data_dict['url'] is used if available.
        fetch_day_str: suffix required by doc_id_type 'daily-url'.
        fetch_time_ts: suffix required by doc_id_type 'time-track'.
        data_dict: source record. It is passed whole to the calculator for
            platforms '腾讯新闻' and 'haokan', and is read for
            'releaser_id_str' on platform '腾讯视频'.
        extra_str: optional trailing part, used by doc_id_type 'releaser' only.
        doc_id_type: one of 'releaser', 'daily-url', 'all-time-url',
            'time-track' or 'bare'.
        **kwargs: accepted and ignored.

    Returns:
        'releaser':     '<md5 of releaser name>_<platform>[_<extra_str>]',
                        or None when data_dict is None.
        'daily-url':    '[<prefix>_]<bare id>[_<releaser id>]_<fetch_day_str>'.
        'all-time-url': '[<prefix>_]<bare id>[_<releaser id>]'.
        'time-track':   '[<prefix>_]<bare id>[_<releaser id>]_<fetch_time_ts>'.
        'bare':         the bare id exactly as the platform calculator returns it.
        None (after printing a message) when the required suffix argument is
        missing or doc_id_type is unknown.
    """
    if doc_id_type == 'releaser':
        if data_dict is None:
            return None
        releaser_name_md5 = hash_name(data_dict['releaser'])
        if platform is None:
            platform = data_dict['platform']
        vid = releaser_name_md5 + '_' + platform
        if extra_str is not None:
            vid += '_%s' % extra_str
        return vid

    if url is None:
        # Best effort: a missing data_dict or missing 'url' key leaves url None.
        try:
            url = data_dict['url']
        except (KeyError, TypeError):
            url = None

    calculator = vid_cal_func(platform)
    if platform == '腾讯新闻' or platform == 'haokan':
        # These calculators take the whole record rather than a url.
        vid_bare = calculator(data_dict)
    else:
        vid_bare = calculator(url)

    if doc_id_type == 'bare':
        return vid_bare

    if doc_id_type == 'daily-url':
        if fetch_day_str is None:
            print('fetch_day_str is needed for doc_id_type: %s' % doc_id_type)
            return None
        suffix = fetch_day_str
    elif doc_id_type == 'all-time-url':
        suffix = None
    elif doc_id_type == 'time-track':
        if fetch_time_ts is None:
            print('fetch_time_ts is needed for doc_id_type: %s' % doc_id_type)
            return None
        suffix = fetch_time_ts
    else:
        print('Unknown doc_type: %s!' % doc_id_type)
        return None

    return _join_doc_id(platform, vid_bare, data_dict, doc_id_type, suffix)