1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 20 09:19:12 2018
@author: hanye
"""
import hashlib
from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id
def vid_cal_func(platform):
    """Return the video-id extractor registered for ``platform``.

    Platforms without a dedicated extractor get a pass-through function
    that returns its argument (the url) unchanged.
    """
    def general_vid_cal_func(url):
        # Fallback: the url itself serves as the video id.
        return url

    extractors = {
        "toutiao": calculate_toutiao_video_id,
        "new_tudou": calculate_newTudou_video_id,
        "腾讯视频": calculate_v_qq_video_id,
        "kwai": calculate_kwai_video_id_by_data_by_url,
        "腾讯新闻": calculate_txxw_video_id,
        "网易新闻": calculate_wangyi_news_id,
        "抖音": calculate_douyin_id,
        "haokan": calculate_haokan_id,
    }
    return extractors.get(platform, general_vid_cal_func)
def hash_name(name):
    """Return the hexadecimal MD5 digest of ``name`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(name.encode('utf-8'))
    return hasher.hexdigest()
def _doc_id_prefix(platform, doc_id_type, vid_bare):
    """Return the platform prefix for a doc id, or None for an unprefixed id.

    The rules reproduce the historical id layout exactly, so ids generated
    earlier stay stable:

    * 'toutiao', 'new_tudou' and '腾讯视频' are always prefixed.
    * '网易新闻' is always prefixed, except for 'time-track' ids whose
      bare video id is None.
    * 'kwai' and '腾讯新闻' are prefixed only when the bare video id is
      not None.
    * 'haokan' is prefixed only for 'all-time-url' ids with a non-None
      bare video id.
    """
    if platform == 'toutiao':
        return 'toutiao'
    if platform == 'new_tudou':
        return 'new_tudou'
    if platform == '腾讯视频':
        return 'v_qq'
    if platform == '网易新闻':
        if doc_id_type == 'time-track' and vid_bare is None:
            return None
        return 'wyxw'
    # Every remaining prefix requires a usable bare video id.
    if vid_bare is None:
        return None
    if platform == 'kwai':
        return 'kwai'
    if platform == '腾讯新闻':
        return 'txxw'
    if platform == 'haokan' and doc_id_type == 'all-time-url':
        return 'haokan'
    return None


def cal_doc_id(platform, url=None,
               fetch_day_str=None,
               fetch_time_ts=None,
               data_dict=None,
               extra_str=None,
               doc_id_type='daily-url',
               **kwargs):
    """Build the document id for a video record or a releaser record.

    Args:
        platform: Platform name; selects the video-id extractor and the id
            prefix. For ``doc_id_type='releaser'`` it may be None, in which
            case ``data_dict['platform']`` is used.
        url: Video url handed to the extractor. When None, ``data_dict['url']``
            is used if it can be read.
        fetch_day_str: Suffix of 'daily-url' ids; required for that type.
        fetch_time_ts: Suffix of 'time-track' ids (converted with ``str``);
            required for that type.
        data_dict: Record dict. Keys read here: 'releaser', 'platform', 'url'
            and 'releaser_id_str'. It is passed whole to the extractor for
            the '腾讯新闻' and 'haokan' platforms.
        extra_str: Optional suffix appended to 'releaser' ids.
        doc_id_type: One of 'daily-url', 'all-time-url', 'time-track',
            'bare' or 'releaser'.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        The id string. For 'bare' the extractor's return value is passed
        through unchanged. None is returned when a 'releaser' id is requested
        without ``data_dict``, when the suffix required by the id type is
        missing, or when ``doc_id_type`` is unknown (the last two cases also
        print a message).
    """
    if doc_id_type == 'releaser':
        if data_dict is None:
            return None
        releaser_name_md5 = hash_name(data_dict['releaser'])
        if platform is None:
            platform = data_dict['platform']
        vid = releaser_name_md5 + '_' + platform
        if extra_str is not None:
            vid += '_%s' % extra_str
        return vid

    if url is None:
        # Best effort: data_dict may be None (TypeError) or lack 'url'
        # (KeyError); the extractor then receives None.
        try:
            url = data_dict['url']
        except (KeyError, TypeError):
            url = None

    extractor = vid_cal_func(platform)
    # These two extractors work on the whole record instead of the url.
    if platform == '腾讯新闻' or platform == 'haokan':
        vid_bare = extractor(data_dict)
    else:
        vid_bare = extractor(url)

    if doc_id_type == 'bare':
        return vid_bare

    if doc_id_type == 'daily-url':
        if fetch_day_str is None:
            print('fetch_day_str is needed for doc_id_type: %s' % doc_id_type)
            return None
        suffix = fetch_day_str
    elif doc_id_type == 'all-time-url':
        suffix = None
    elif doc_id_type == 'time-track':
        if fetch_time_ts is None:
            print('fetch_time_ts is needed for doc_id_type: %s' % doc_id_type)
            return None
        suffix = fetch_time_ts
    else:
        print('Unknown doc_type: %s!' % doc_id_type)
        return None

    parts = []
    prefix = _doc_id_prefix(platform, doc_id_type, vid_bare)
    if prefix is not None:
        parts.append(prefix)
    parts.append(vid_bare)

    if platform == '腾讯视频':
        # data_dict is optional when url is supplied directly; without it the
        # id is built as if the record carried no releaser id.
        releaser_id_str = None
        if data_dict is not None:
            releaser_id_str = data_dict.get("releaser_id_str")
        if releaser_id_str:
            # Drops the first five characters -- presumably a platform tag
            # in front of the raw releaser id; TODO confirm against producers.
            parts.append(releaser_id_str[5:])

    if suffix is not None:
        parts.append(suffix)
    # str() mirrors the previous '%s' formatting (a None part becomes 'None').
    return '_'.join(str(part) for part in parts)