1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 11 00:38:51 2018
@author: fangyucheng
"""
import json
#import time
import redis
# Module-level redis client shared by all helper functions in this file.
# The connection target (host 192.168.17.60, port 6379, db 13) is hard-coded.
rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=13)
#push and retrieve list url into redis
def push_list_url_to_redis(platform, result_lst):
    """Push every url in result_lst onto the platform's list-url redis list.

    The redis key is "<platform>_list_url"; the platform name '腾讯视频'
    is mapped to 'v_qq' before the key is built. Each element of
    result_lst (expected to be a plain url string) is pushed with LPUSH,
    then the resulting list length is printed.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = "%s_list_url" % name
    for url in result_lst:
        rds.lpush(redis_key, url)
    list_length = rds.llen(redis_key)
    print("the length of %s is %s" % (redis_key, list_length))
def retrieve_list_url_from_redis(platform, retrieve_count=30):
    """Pop up to retrieve_count urls from the platform's list-url redis list.

    Urls are taken with RPOP from the key "<platform>_list_url"
    ('腾讯视频' is mapped to 'v_qq'). Popping stops early when the list
    runs empty. Every popped value is decoded as utf-8 and its single
    quotes are replaced by double quotes.

    Returns the list of decoded urls (possibly shorter than
    retrieve_count, possibly empty) and prints how many were retrieved.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_list_url' % name
    urls = []
    while len(urls) < retrieve_count:
        raw = rds.rpop(redis_key)
        if raw is None:
            # the list is exhausted, return what has been collected so far
            break
        urls.append(raw.decode("utf-8").replace("'", '"'))
    print("retrieve %s list urls from redis" % len(urls))
    return urls
#end of pushing and retrieving list url into redis
#push and retrieve list page html
def push_list_page_html_to_redis(platform, result_lst):
    """Push downloaded list-page html documents onto a redis list.

    The redis key is "<platform>_list_page_html" ('腾讯视频' is mapped
    to 'v_qq'). According to the original author this is only used for
    v_qq for now. After all elements of result_lst have been pushed with
    LPUSH, the resulting list length is printed.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_list_page_html' % name
    for html in result_lst:
        rds.lpush(redis_key, html)
    list_length = rds.llen(redis_key)
    print("the length of lst_page_html is %s" % list_length)
def retrieve_list_page_html_from_redis(platform):
    """Pop one list-page html document from the platform's redis list.

    The value is taken with RPOP from the key "<platform>_list_page_html"
    ('腾讯视频' is mapped to 'v_qq'), decoded as utf-8, and its single
    quotes are replaced by double quotes.

    Returns:
        The html as str, or None when the redis list is empty or the
        key does not exist.
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_list_page_html' % platform
    lst_page_html_byte = rds.rpop(key)
    if lst_page_html_byte is None:
        # rpop yields None for an empty/missing list; calling .decode on
        # it would raise AttributeError, so report "nothing left" instead
        return None
    return lst_page_html_byte.decode("utf-8").replace("'", '"')
#end of pushing and retrieving list page html
#push and retrieve plain urls (url strings only) to and from a redis set
def push_video_url_to_redis_set(platform, url_lst):
    """Add every url in url_lst to the platform's video-url redis set.

    The redis key is "<platform>_video_url" ('腾讯视频' is mapped to
    'v_qq'). Per the original author, this is usually used when renewing
    the play_count of video pages and by list page crawlers of special
    platforms such as new_tudou. After all urls have been added with
    SADD, the cardinality of the set is printed.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_video_url' % name
    for url in url_lst:
        rds.sadd(redis_key, url)
    set_size = rds.scard(redis_key)
    print("the length of %s set is %s" % (redis_key, set_size))
def retrieve_video_url_from_redis_set(platform, retrieve_count=90):
    """Pop up to retrieve_count video urls from the platform's redis set.

    Urls are taken with SPOP from the key "<platform>_video_url"
    ('腾讯视频' is mapped to 'v_qq'). Popping stops early when the set
    runs empty. Every popped value is decoded as utf-8 and its single
    quotes are replaced by double quotes.

    Returns the list of decoded urls and prints how many were output
    for the (mapped) platform name.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_video_url' % name
    urls = []
    while len(urls) < retrieve_count:
        raw = rds.spop(redis_key)
        if raw is None:
            # the set is exhausted, return what has been collected so far
            break
        urls.append(raw.decode('utf-8').replace("'", '"'))
    print("total output platform %s %s urls" % (name, len(urls)))
    return urls
#end of above part
#push and retrieve url dict to redis
def push_url_dict_lst_to_redis_set(platform, result_lst):
    """Add every url dict in result_lst to the platform's url-dict redis set.

    The redis key is "<platform>_url_dict" ('腾讯视频' is mapped to
    'v_qq'). Per the original author, the dicts generally come from list
    pages. Each element is handed to SADD as-is, then the cardinality of
    the set is printed.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_url_dict' % name
    for url_dict in result_lst:
        rds.sadd(redis_key, url_dict)
    set_size = rds.scard(redis_key)
    print("the length of %s urldict set is %s" % (redis_key, set_size))
def retrieve_url_dict_from_redis_set(platform, retrieve_count=90):
    """Pop up to retrieve_count url dicts from the platform's redis set.

    Values are taken with SPOP from the key "<platform>_url_dict"
    ('腾讯视频' is mapped to 'v_qq'). Each value is decoded as utf-8,
    its single quotes are replaced by double quotes, and the result is
    parsed with json.loads. Popping stops early when the set runs empty.

    Returns the list of parsed dicts; a summary line is printed in
    either case (the wording differs between the two exits).
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_url_dict' % name
    url_dicts = []
    while len(url_dicts) < retrieve_count:
        raw = rds.spop(redis_key)
        if raw is None:
            # the set is exhausted before retrieve_count was reached
            print("total output %s url dicts" % len(url_dicts))
            return url_dicts
        json_text = raw.decode('utf-8').replace("'", '"')
        url_dicts.append(json.loads(json_text))
    print("total output %s from %s urldicts" % (len(url_dicts), name))
    return url_dicts
#end of push and retrieve url dict to redis
#push and retrieve video page html
def push_video_page_html_to_redis(platform, result_lst):
    """Push downloaded video-page html documents onto a redis list.

    The redis key is "<platform>_video_page_html" ('腾讯视频' is mapped
    to 'v_qq'). After all elements of result_lst have been pushed with
    LPUSH, the resulting list length is printed.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    redis_key = '%s_video_page_html' % name
    for html in result_lst:
        rds.lpush(redis_key, html)
    list_length = rds.llen(redis_key)
    print("the length of %s list is %s" % (redis_key, list_length))
def retrieve_video_page_html_from_redis(platform):
    """Pop one video-page html document from the platform's redis list.

    The value is taken with RPOP from the key
    "<platform>_video_page_html" ('腾讯视频' is mapped to 'v_qq') and
    decoded as utf-8. Unlike the list-page variant, quotes are left
    untouched.

    Returns:
        The html as str, or None when the redis list is empty or the
        key does not exist.
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_video_page_html' % platform
    video_html_bytes = rds.rpop(key)
    if video_html_bytes is None:
        # rpop yields None for an empty/missing list; calling .decode on
        # it would raise AttributeError, so report "nothing left" instead
        return None
    return video_html_bytes.decode("utf-8")
#end of push and retrieve video page html
def length_of_lst(key):
    """Return the number of elements in the redis list stored at key."""
    return rds.llen(key)
def push_error_html_to_redis(error_page):
    """Push a failed list page onto the redis list named 'error'.

    Per the original author, this is used by the asynchronous crawler.
    """
    error_key = 'error'
    rds.lpush(error_key, error_page)
"""
This part is for renewing video data.
"""
# platform name -> key of the redis set that holds the platform's urls
platform_redis_set_reg = {
    'toutiao': 'toutiao_url_set',
    '腾讯视频': 'v_qq_url_set',
    'youku': 'youku_url_set',
    'iqiyi': 'iqiyi_url_set',
}

# platform name -> key of the redis list that holds the platform's html
platform_redis_lst_reg = {
    'toutiao': 'toutiao_html_lst',
    '腾讯视频': 'v_qq_html_lst',
    'youku': 'youku_html_lst',
    'iqiyi': 'iqiyi_html_lst',
}
#def retrieve_video_url_from_redis_set(platform, retrieve_count=90):
# count = 0
# result_lst = []
# redis_key = platform_redis_set_reg[platform]
# while retrieve_count > count:
# url_bytes = rds.spop(redis_key)
# if url_bytes is None:
# print("total output %s urls" % len(result_lst))
# return result_lst
# else:
# url_str = url_bytes.decode('utf-8').replace("\'", "\"")
# count += 1
# result_lst.append(url_str)
# print("total output %s urls" % len(result_lst))
# return result_lst
def push_video_page_html_to_redis_renew(platform, result_lst):
    """Push video-page html documents onto the platform's renew html list.

    The redis key is looked up in platform_redis_lst_reg, so a platform
    that is not registered there raises KeyError. After all elements of
    result_lst have been pushed with LPUSH, the list length is printed.
    """
    html_lst_key = platform_redis_lst_reg[platform]
    for html in result_lst:
        rds.lpush(html_lst_key, html)
    list_length = rds.llen(html_lst_key)
    print("the length of %s is %s" % (html_lst_key, list_length))
def retrieve_video_html_from_redis_renew(platform):
    """Pop one video-page html document from the platform's renew html list.

    The redis key is looked up in platform_redis_lst_reg. The value is
    taken with RPOP, decoded as utf-8, and its single quotes are
    replaced by double quotes.

    Returns:
        The html as str, or None when the redis list is empty or the
        key does not exist.

    Raises:
        KeyError: if platform is not registered in platform_redis_lst_reg.
    """
    redis_key = platform_redis_lst_reg[platform]
    video_html_bytes = rds.rpop(redis_key)
    if video_html_bytes is None:
        # rpop yields None for an empty/missing list; calling .decode on
        # it would raise AttributeError, so report "nothing left" instead
        return None
    return video_html_bytes.decode("utf-8").replace("'", '"')
def length_of_set(key):
    """Return the number of members in the redis set stored at key."""
    return rds.scard(key)
#this is for iqiyi
def iqiyi_push(video_dict):
    """Push one video info record onto the 'iqiyi_video_info' redis list.

    video_dict is handed to LPUSH as-is; afterwards the length of the
    list is printed.
    """
    redis_key = 'iqiyi_video_info'
    rds.lpush(redis_key, video_dict)
    list_length = rds.llen(redis_key)
    print("the length of iqiyi list is %s" % list_length)
def iqiyi_retrieve(retrieve_count=90):
    """Pop up to retrieve_count video info dicts from 'iqiyi_video_info'.

    Values are taken with RPOP. Each value is decoded as utf-8, its
    single quotes are replaced by double quotes, and the result is
    parsed with json.loads. Popping stops early when the list runs empty.

    Returns the list of parsed dicts and prints how many were output.
    """
    redis_key = 'iqiyi_video_info'
    video_infos = []
    while len(video_infos) < retrieve_count:
        raw = rds.rpop(redis_key)
        if raw is None:
            # the list is exhausted, return what has been collected so far
            break
        json_text = raw.decode('utf-8').replace("'", '"')
        video_infos.append(json.loads(json_text))
    print("total output %s urls" % len(video_infos))
    return video_infos
#this is for test purposes
def push_pid_to_redis_set(platform, step, value):
    """Add value to the redis set "<platform>_<step>_pid".

    step names the stage the crawler is working on, such as
    parse_list_page, download_video_page or parse_video_page.
    '腾讯视频' is mapped to 'v_qq' before the key is built. A
    confirmation line is printed after the SADD.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    pid_key = '%s_%s_pid' % (name, step)
    rds.sadd(pid_key, value)
    print('push %s to redis set %s' % (value, pid_key))
def delete_pid_from_redis_set(platform, step, value):
    """Remove value from the redis set "<platform>_<step>_pid".

    step names the stage the crawler is working on, such as
    parse_list_page, download_video_page or parse_video_page.
    '腾讯视频' is mapped to 'v_qq' before the key is built. A
    confirmation line is printed after the SREM.
    """
    name = 'v_qq' if platform == '腾讯视频' else platform
    pid_key = '%s_%s_pid' % (name, step)
    rds.srem(pid_key, value)
    print('delete %s from redis set %s' % (value, pid_key))