1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 28 15:27:39 2018
@author: hanye
"""
import argparse
import datetime
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.redis_interact import rds
from crawler_sys.framework.platform_crawler_register import get_crawler
from crawler_sys.utils.output_results import output_result
# Command-line interface. Both options use argparse's 'append' action, so
# each may be repeated on the command line and the values are gathered
# into a list; when an option is never given its value is the empty list.
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument(
    '-p', '--platforms',
    action='append',
    default=[],
    help='Pass platform names, they will be assembled in python list.',
)
parser.add_argument(
    '-b', '--batch_str_Lst',
    action='append',
    default=[],
    help='Pass batch_str names, they will be assembled in python list.',
)
args = parser.parse_args()
# Module-level aliases read by the driver loop at the bottom of the script.
platform_Lst, batch_str_Lst = args.platforms, args.batch_str_Lst
def get_video_dict_by_url(platform, url):
    """Crawl a single video page with the crawler registered for a platform.

    Args:
        platform: Platform name, passed to ``get_crawler`` to look up the
            crawler class.
        url: Address of the video page, passed to the crawler's
            ``video_page`` method.

    Returns:
        Whatever ``video_page(url)`` returns (presumably a dict describing
        the video -- confirm against the crawler implementations), or
        ``None`` when ``get_crawler`` returns ``None`` for the platform.
    """
    Platform_crawler = get_crawler(platform)
    # Identity comparison: ``!= None`` would be routed through ``__ne__``,
    # which an arbitrary object is free to override.
    if Platform_crawler is None:
        print('Failed to get crawler for platform %s' % platform)
        return None
    # A fresh crawler instance is created on every call.
    crawler_instant = Platform_crawler()
    return crawler_instant.video_page(url)
def _output_video_batch(video_dict_Lst, platform):
    """Hand the collected video dicts to ``output_result`` and empty the list in place."""
    output_result(video_dict_Lst, platform,
                  output_to_es_raw=True,
                  output_to_es_register=False,
                  push_to_redis=False,
                  output_to_file=False)
    video_dict_Lst.clear()


def scrap_redis_urls(platform, batch_str):
    """Pop every url stored in redis for a platform/batch, crawl it and output the results.

    The redis key is resolved with ``get_redis_list_name(platform, batch_str)``.
    Urls are removed one at a time with ``rds.spop`` until it returns
    ``None``; each url is crawled with ``get_video_dict_by_url`` and the
    non-``None`` results are passed to ``output_result`` in batches of 100,
    plus one final partial batch.

    Args:
        platform: Platform name used to resolve both the redis key and the
            crawler.
        batch_str: Batch identifier forwarded to ``get_redis_list_name``
            (the script passes ``''`` when no batch is requested).

    Returns:
        Always ``None``. The function returns early, after printing a
        message, when no redis key can be resolved or when ``rds.scard``
        reports zero urls.
    """
    redis_list_name = get_redis_list_name(platform, batch_str)
    if redis_list_name is None:
        # The message needs a %s placeholder: without one, '% platform'
        # raises TypeError instead of reporting the problem.
        print('Failed to get correct redis list name '
              'in platform_redis_register for platform: %s'
              % platform)
        return None

    urls_total = rds.scard(redis_list_name)
    if urls_total == 0:
        print('Got %d urls to be processed for %s, program exits, %s'
              % (urls_total, redis_list_name, datetime.datetime.now()))
        return None
    print('Got %d urls to be processed for %s, %s'
          % (urls_total, redis_list_name,
             datetime.datetime.now()))

    batch_size = 100  # used for both the flush threshold and progress interval
    video_dict_Lst = []
    url_counter = 0   # number of urls fully processed so far
    url_bin = rds.spop(redis_list_name)
    while url_bin is not None:
        # NOTE(review): spop is expected to return bytes; a client created
        # with decode_responses=True would return str, so accept both.
        if isinstance(url_bin, bytes):
            url = url_bin.decode('utf-8')
        else:
            url = url_bin
        video_dict = get_video_dict_by_url(platform, url)
        if video_dict is not None:
            video_dict_Lst.append(video_dict)

        # Count the url only after it has been processed, so the progress
        # line reports completed work and ends at urls_total.
        url_counter += 1
        if url_counter % batch_size == 0 or url_counter == urls_total:
            print('%s: %d/%d, %s' % (redis_list_name,
                                     url_counter,
                                     urls_total,
                                     datetime.datetime.now()))
        if len(video_dict_Lst) >= batch_size:
            _output_video_batch(video_dict_Lst, platform)
        url_bin = rds.spop(redis_list_name)

    # Flush whatever is left over from the last, partial batch.
    if video_dict_Lst:
        _output_video_batch(video_dict_Lst, platform)
    return None
# Script driver: crawl every requested platform, once per requested batch.
if platform_Lst:
    for platform in platform_Lst:
        print('Scraping platform: %s' % platform)
        if batch_str_Lst:
            for batch_str in batch_str_Lst:
                print('platform: %s batch: %s' % (platform, batch_str))
                scrap_redis_urls(platform, batch_str)
        else:
            # No batch was given: run once with the empty batch string.
            batch_str = ''
            scrap_redis_urls(platform, batch_str)
else:
    print('No platform is given, program exits.')