# -*- coding: utf-8 -*-
"""
Created on Wed Jun 20 09:19:12 2018

@author: hanye

Helpers for building document ids from a platform name plus a
platform-specific bare id, optionally suffixed with a fetch day or a
fetch timestamp.
"""
import hashlib

from write_data_into_es.calculate_doc_id.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.calculate_doc_id.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.calculate_doc_id.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.calculate_doc_id.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.calculate_doc_id.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.calculate_doc_id.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id import calculate_haokan_id
from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id
from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_zhihu_id
from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id


# Platform name -> function that derives the bare id for that platform.
# Built once at import time instead of on every vid_cal_func() call.
_VID_CAL_FUNCS = {
    'toutiao': calculate_toutiao_video_id,
    'new_tudou': calculate_newTudou_video_id,
    '腾讯视频': calculate_v_qq_video_id,
    'kwai': calculate_kwai_video_id_by_data_by_url,
    '腾讯新闻': calculate_txxw_video_id,
    "网易新闻": calculate_wangyi_news_id,
    "抖音": calculate_douyin_id,
    "haokan": calculate_haokan_id,
    "weibo": calculate_weibo_id,
    "douban": calculate_douban_id,
    "zhihu": calculate_zhihu_id,
}

# Platform whose ids additionally embed a releaser id (see _url_id_stem).
_V_QQ_PLATFORM = '腾讯视频'

# Platform name -> prefix placed in front of the bare id in url-based ids.
_URL_ID_PREFIXES = {
    'toutiao': 'toutiao',
    'new_tudou': 'new_tudou',
    'kwai': 'kwai',
    '腾讯新闻': 'txxw',
    '网易新闻': 'wyxw',
}

# Platforms that only get their prefix when the bare id is not None;
# with a None bare id they fall back to the un-prefixed form. The sets
# differ per doc_id_type on purpose: they reproduce the historical id
# formats exactly, so previously written ids stay stable.
_NONE_GUARDED = frozenset(('kwai', '腾讯新闻'))
_NONE_GUARDED_TIME_TRACK = _NONE_GUARDED | frozenset(('网易新闻',))


def _general_vid_cal_func(url):
    """Fallback bare-id function: return the argument unchanged."""
    return url


def vid_cal_func(platform):
    """Return the bare-id function registered for *platform*.

    Platforms without a registered function get a fallback that returns
    its single argument unchanged.
    """
    return _VID_CAL_FUNCS.get(platform, _general_vid_cal_func)


def hash_name(name):
    """Return the hexadecimal MD5 digest of *name* encoded as UTF-8."""
    return hashlib.md5(name.encode('utf-8')).hexdigest()


def _releaser_doc_id(platform, data_dict, extra_str):
    """Build '<md5(releaser)>_<platform>[_<extra_str>]', or None without data."""
    if data_dict is None:
        return None
    # Hash first, then resolve the platform: same lookup order as before,
    # so a missing 'releaser' key is reported before a missing 'platform'.
    releaser_name_md5 = hash_name(data_dict['releaser'])
    if platform is None:
        platform = data_dict['platform']
    vid = releaser_name_md5 + '_' + platform
    if extra_str is not None:
        vid += '_%s' % extra_str
    return vid


def _url_id_stem(platform, vid_bare, data_dict, none_guarded, unprefixed_stem):
    """Return the id part that precedes any day / timestamp suffix.

    *none_guarded* holds the platforms that lose their prefix when
    *vid_bare* is None; *unprefixed_stem* is what is returned for those
    and for platforms without a registered prefix.
    """
    if platform == _V_QQ_PLATFORM:
        # data_dict may be None; treat that like a missing releaser id
        # instead of failing with AttributeError.
        releaser_id_str = (data_dict.get("releaser_id_str")
                           if data_dict is not None else None)
        if releaser_id_str:
            # Drops the first five characters of releaser_id_str --
            # presumably a 'v_qq_' prefix; confirm against the producer.
            return 'v_qq_%s_%s' % (vid_bare, releaser_id_str[5:])
        return 'v_qq_%s' % (vid_bare,)

    prefix = _URL_ID_PREFIXES.get(platform)
    if prefix is None or (vid_bare is None and platform in none_guarded):
        return unprefixed_stem
    return '%s_%s' % (prefix, vid_bare)


def cal_doc_id(platform, url=None, fetch_day_str=None, fetch_time_ts=None,
               data_dict=None, extra_str=None, doc_id_type='daily-url',
               **kwargs):
    """Compute a document id for the given platform and id type.

    Parameters:
        platform: platform name; selects the bare-id function and prefix.
            For doc_id_type 'releaser' it may be None, in which case
            data_dict['platform'] is used.
        url: url of the record. When None, data_dict['url'] is tried.
            It is only used as the bare id for platforms that have no
            registered bare-id function.
        fetch_day_str: suffix required by doc_id_type 'daily-url'.
        fetch_time_ts: suffix required by doc_id_type 'time-track'.
        data_dict: record dict; passed to the registered bare-id function
            and read for 'releaser', 'platform' and 'releaser_id_str'.
        extra_str: optional extra suffix for doc_id_type 'releaser'.
        doc_id_type: one of 'releaser', 'daily-url', 'all-time-url',
            'time-track' or 'bare'.
        **kwargs: accepted and ignored.

    Returns:
        The doc id, or None when the required input for the requested
        doc_id_type is missing or the doc_id_type is unknown (a message
        is printed in the latter two cases). For 'bare' the bare id is
        returned as produced by the bare-id function.

    Raises:
        KeyError: for doc_id_type 'releaser' when data_dict lacks
            'releaser' (or 'platform' while platform is None).
    """
    if doc_id_type == 'releaser':
        return _releaser_doc_id(platform, data_dict, extra_str)

    if url is None:
        try:
            url = data_dict['url']
        except (LookupError, TypeError):
            # data_dict is None / not subscriptable, or has no 'url'.
            url = None

    calc = vid_cal_func(platform)
    if calc is _general_vid_cal_func:
        # The generic fallback is meant to echo the url. It used to be
        # handed data_dict, which left the resolved url unused and put the
        # dict's repr into the id. Keep the old pass-through only when no
        # url could be resolved at all.
        vid_bare = calc(url if url is not None else data_dict)
    else:
        vid_bare = calc(data_dict)

    if doc_id_type == 'daily-url':
        if fetch_day_str is None:
            print('fetch_day_str is needed for doc_id_type: %s' % doc_id_type)
            return None
        stem = _url_id_stem(platform, vid_bare, data_dict,
                            _NONE_GUARDED, '%s' % (vid_bare,))
        return '%s_%s' % (stem, fetch_day_str)

    if doc_id_type == 'all-time-url':
        # Unlike the suffixed types, the un-prefixed form here still
        # carries the platform name in front of the bare id.
        return _url_id_stem(platform, vid_bare, data_dict,
                            _NONE_GUARDED, '%s_%s' % (platform, vid_bare))

    if doc_id_type == 'time-track':
        if fetch_time_ts is None:
            print('fetch_time_ts is needed for doc_id_type: %s' % doc_id_type)
            return None
        stem = _url_id_stem(platform, vid_bare, data_dict,
                            _NONE_GUARDED_TIME_TRACK, '%s' % (vid_bare,))
        return '%s_%s' % (stem, str(fetch_time_ts))

    if doc_id_type == 'bare':
        return vid_bare

    print('Unknown doc_type: %s!' % doc_id_type)
    return None