1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 26 14:25:37 2018
@author: hanye
"""
import argparse
import configparser
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
# Command-line interface: which platforms to crawl, where the URL config
# lives, and (optionally) which channels of each platform to restrict to.
parser = argparse.ArgumentParser(description='Specify a platform name.')

# -p may be repeated; every occurrence is appended to one list.
parser.add_argument(
    '-p', '--platform',
    action='append',
    default=[],
    help='Pass platform names, they will be assembled in python list.',
)

# -c points at the ini file that is later indexed as config[platform][channel].
parser.add_argument(
    '-c', '--conf',
    type=str,
    default=('/home/hanye/crawlersNew/crawler'
             '/crawler_sys/framework/config'
             '/list_page_urls.ini'),
    help='absolute path of config file',
)

# -ch may be repeated; an empty list means "every channel in the section".
parser.add_argument(
    '-ch', '--channels',
    action='append',
    default=[],
    help=('Specify channel names, illegal channel names will be ignored, '
          'default to be all.'),
)

args = parser.parse_args()
# Decide which platforms to crawl.
# Names given with -p are checked against the crawler registry; an
# unregistered name is reported and dropped so that it is not handed to
# get_crawler() later on. Without -p, a fixed default list is used.
if args.platform:
    platforms = []
    for platform in args.platform:
        if platform in platform_crawler_reg:
            platforms.append(platform)
        else:
            print("%s is not a legal platform name" % platform)
else:
    platforms = [
        'iqiyi',
        'youku',
        '腾讯视频',
        'new_tudou',
        'toutiao',
    ]
# Load the ini file that is indexed below as config[platform][channel].
# RawConfigParser is used, so values are returned without interpolation.
config_file = args.conf
config = configparser.RawConfigParser()
# read() returns the list of files it managed to parse and silently skips
# missing ones, so an empty result means no configuration was loaded.
if not config.read(config_file):
    print("config file %s could not be read" % config_file)
# Channel names requested with -ch; an empty list selects all channels.
channel_Lst = args.channels
# For every selected platform: gather its list-page URLs from the config
# and pass them to that platform's crawler.
for platform in platforms:
    print('working on Platform %s' % platform)
    # A platform without a config section has no URLs to crawl; report it
    # and move on instead of aborting the remaining platforms on KeyError.
    if not config.has_section(platform):
        print("there is no section for platform %s in config file" % platform)
        continue
    section = config[platform]

    Platform_crawler = get_crawler(platform)
    crawler = Platform_crawler()

    if not channel_Lst:
        # No channel filter given: crawl every URL listed for the platform.
        task_list = list(section.values())
    else:
        task_list = []
        for channel in channel_Lst:
            try:
                task_list.append(section[channel])
            except KeyError:
                # Unknown channel names are reported and ignored.
                print("there is no channel %s in platform %s" % (channel, platform))
    crawler.start_list_page(task_list)