1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:18:09 2018
@author: hanye
"""
#import redis
#from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler.crawler_sys.framework.es_crawler import scan_crawler_url_register
#rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=0)
def pull_url_from_es(platform, release_time_lower_bdr=None):
    """
    Pull the urls of one platform from the es index crawler-url-register.

    Urls are returned exactly as stored in the index. Any url reforming
    is left to the function responsible for pushing urls into redis.

    Args:
        platform: value matched exactly against the ``platform.keyword``
            field of the indexed documents.
        release_time_lower_bdr: inclusive lower bound applied to the
            ``release_time`` field. ``None`` (the default) is replaced
            by 0.
            NOTE(review): presumably a timestamp in milliseconds --
            confirm against the index mapping.

    Returns:
        A list holding the ``url`` field of every hit produced by
        ``scan_crawler_url_register``. The list is empty when the
        reported total hit count is not positive. Only urls are
        returned; the platform is not included.

    Raises:
        KeyError: if a hit lacks ``_source`` or its ``_source`` lacks
            ``url``.
    """
    if release_time_lower_bdr is None:
        release_time_lower_bdr = 0

    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"release_time":
                               {"gte": release_time_lower_bdr}}},
                    {"term": {"platform.keyword": platform}},
                ]
            }
        }
    }
    total_hit, scan_resp = scan_crawler_url_register(search_body)

    # Only touch the scan response when hits were reported; the shape of
    # scan_resp for an empty result is not visible from this module.
    if total_hit > 0:
        return [line['_source']['url'] for line in scan_resp]
    return []
#def url_reformer(platform, url):
# """
# to reform url according to platform, in the future.
# Say, a url of http://www.toutiao.com/group/1234567890123456789
# as a string is different from http://www.365yg.com/u/1234567890123456789,
# but they point to the same resource. They should be reformed
# to one unique url before pushing into redis for further crawling.
# """
# reformed_url = url
# return reformed_url
#
#def feed_url_into_redis(dict_Lst, platform,
# release_time_lower_bdr=None,
# batch_str=None):
# """
# release_time_lower_bdr, if given, must be an int value representing
# a timestamp in milliseconds.
# Any url that was released before release_time_lower_bdr
# will not be pushed into redis. If the argument release_time_lower_bdr
# is not given when calling this function, all urls will be
# pushed into redis.
# """
# redis_list_name = get_redis_list_name(platform, batch_str)
# if redis_list_name is None:
# print('Failed to get correct redis list name '
# 'in platform_redis_register for platform: '
# % platform)
# return (None, None)
# else:
# print('Feeding url into redis list %s ...' % redis_list_name)
# url_counter = 0
# for data_dict in dict_Lst:
# try:
# url = data_dict['url']
# url_reformed = url_reformer(platform, url)
# if release_time_lower_bdr is None:
# sadd_c = rds.sadd(redis_list_name, url_reformed)
# url_counter += sadd_c
# else:
# url_release_time = data_dict['release_time']
# if url_release_time >= release_time_lower_bdr:
# sadd_c = rds.sadd(redis_list_name, url_reformed)
# url_counter += sadd_c
# except:
# print('Failed to push url into redis, '
# 'might because of lack of url field '
# 'or lack of release_time field, or '
# 'has wrong typed release_time value. '
# 'The failed data dict is: \n %s' % data_dict)
# print('Pushed %d urls into redis' % url_counter)
# return (redis_list_name, url_counter)