# -*- coding:utf-8 -*-
# @Time : 2020/4/24 14:15
# @Author : litao
"""
Created on Mon May 14 17:52:02 2018
Find urls in given releaser page, and write first batch data into es.
Everytime this program runs, two things will happen:
1 All video urls in given releaser page will be fetched and put into redis url pool,
2 All data related to 1 will be fetched and stored into es.
Data in es will be update when run this program once.
@author: hanye
"""
import sys
import json
import argparse

import redis
from redis.sentinel import Sentinel
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
# sentinel = Sentinel([('192.168.17.65', 26379),
#                      ('192.168.17.66', 26379),
#                      ('192.168.17.67', 26379)
#                      ], socket_timeout=0.5)
# Discover the master node
# master = sentinel.discover_master('ida_redis_master')
# Discover the slave nodes
# slave = sentinel.discover_slaves('ida_redis_master')
# Connect to the database
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
                    help=('Pass platform names, they will be assembled in a python list.'))
parser.add_argument('-pj', '--project_tags', default=[], action='append',
                    help=('Pass project tags, they will be assembled in a python list.'))
parser.add_argument('-n', '--max_page', default=2, type=int,
                    help=('The max number of pages to scroll for each releaser url, '
                          'must be an int value, defaults to 2.'))
parser.add_argument('-fre', '--frequency', default=1, type=int,
                    help=('Frequency at which to retrieve releaserUrls; '
                          '1, 3 or 9 is a legal value, defaults to 1.'))
parser.add_argument('-proxies', '--proxies', default=0, type=int,
                    help=('Number of proxies to use for the crawler.'))
parser.add_argument('-d', '--date', default=3, type=int,
                    help=('Time window to backtrack when crawling data.'))
parser.add_argument('-s', '--processes_num', default=5, type=int,
                    help=('Number of processes to be used in multiprocessing.'))
parser.add_argument('-article', '--article', default=0, type=int,
                    help=('Whether the target pages are article pages (0 or 1).'))
args = parser.parse_args()
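# Example invocation (hypothetical script name and platform values, for illustration only):
#   python push_releaser_urls_to_redis.py -p toutiao -p kwai -n 5 -fre 1 -proxies 2 -d 3 -s 5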
if args.platform != []:
    platforms = args.platform
else:
    print('platform must be input')
    sys.exit(0)
releaser_page_num_max = args.max_page
processes_num = args.processes_num
frequency = args.frequency
print(frequency)
# Treat 0 as "no frequency filter" (pass None downstream).
if frequency == 0:
    frequency = None
# Shared task parameters attached to every releaser url task.
kwargs_dict = {
    "proxies_num": 0,
    "date": args.date,
}
if frequency:
    if frequency >= 3:
        kwargs_dict["proxies_num"] = 3
if args.proxies:
    kwargs_dict["proxies_num"] = args.proxies
is_article = args.article
def write_project_to_redis(platform, data):
    # Push one serialized task onto the Redis list keyed by platform name.
    rds.rpush(platform, data)


def write_releaserUrl_to_redis(data_dic):
    # Serialize the task dict to JSON and push it under its platform key.
    write_project_to_redis(data_dic["platform"], json.dumps(data_dic))
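# For reference, each task pushed to Redis is a JSON object shaped like
# (illustrative values only):
#   {"releaserUrl": "https://...", "releaser": "some_releaser",
#    "platform": "toutiao", "proxies_num": 0, "date": 3}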
for platform in platforms:
    # Get the releaserUrl list for this platform from the target-releasers index.
    releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency, target_index="target_releasers")
    if is_article:
        platform = platform + "_article"
    rds.hset("process_num", platform, processes_num)
    if releaserUrl_Lst == []:
        print('Get empty releaserUrl_Lst for platform %s' % platform)
        continue
    # Push each releaser url as a crawl task into the Redis list for this platform.
    for releaserUrl, releaser in releaserUrl_Lst:
        push_dic = {
            "releaserUrl": releaserUrl,
            "releaser": releaser,
            "platform": platform,
        }
        push_dic.update(kwargs_dict)
        write_releaserUrl_to_redis(push_dic)
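# Downstream consumption is outside this script; a worker would typically pop tasks from
# the per-platform Redis list, e.g. (hypothetical consumer sketch):
#   raw = rds.lpop(platform)          # or blpop for blocking consumption
#   task = json.loads(raw)
#   # task["releaserUrl"], task["releaser"], task["proxies_num"], task["date"] ...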