# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:18:09 2018
@author: hanye
"""
import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
def redis_path(redis_type=""):
    """
    Build a StrictRedis connection for the requested environment.

    Parameters
    ----------
    redis_type : str, optional
        "on_line" selects the production host; any other value
        (including the default "") selects the test host.

    Returns
    -------
    redis.StrictRedis
        A client bound to db 19 on the chosen host.
    """
    if redis_type == "on_line":
        # Production instance requires authentication.
        return redis.StrictRedis(host='172.16.40.164',
                                 port=6379,
                                 db=19,
                                 password='ReDis!GmTx*0aN12')
    # Default: unauthenticated test instance.
    return redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0, rds=None):
    """
    Push url dicts into redis, best-effort per item.

    For each dict, the full JSON-serialized dict is pushed onto the
    redis list keyed by its 'doc_id', and the doc_id itself is pushed
    onto the shared "doc_id" list. Items that fail (e.g. missing
    'doc_id', unserializable values) are logged and skipped.

    Parameters
    ----------
    dict_Lst : iterable of dict
        Url dicts to push; each is expected to contain a 'doc_id' key.
    expire : int, optional
        TTL in seconds applied to each per-doc_id list; 0 (default)
        means no expiry is set.
    rds : redis.StrictRedis, optional
        Redis connection. Defaults to redis_path() — the original
        default was the redis_path *function itself*, which crashed
        with AttributeError on first use; calling it is the fix.

    Returns
    -------
    bool
        Always True (per-item failures are only logged).
    """
    if rds is None:
        rds = redis_path()
    for data_dict in dict_Lst:
        try:
            doc_id = data_dict['doc_id']
            rds.lpush(doc_id, json.dumps(data_dict))
            rds.lpush("doc_id", doc_id)
            if expire:
                rds.expire(doc_id, expire)
        except Exception as err:
            # Best-effort by design: keep going, but surface the cause
            # instead of the original bare except's silent message.
            print('Failed to push data into redis: %s' % err)
    print('Pushed data into redis')
    return True
def pull_url_from_es(platform, release_time_lower_bdr=None):
    """
    Pull urls for one platform from es index crawler-url-register.

    Url reforming things are done by the caller responsible for
    pushing urls into redis; this function only fetches.

    Parameters
    ----------
    platform : str
        Matched exactly against the 'platform.keyword' field.
    release_time_lower_bdr : int, optional
        Lower bound (gte) on 'release_time'; urls released earlier are
        excluded. Presumably a timestamp in milliseconds — TODO confirm
        against the index mapping. Defaults to 0, i.e. no lower bound.

    Returns
    -------
    list of dict
        The '_source' dict of every matching register document; empty
        list when nothing matches.
    """
    if release_time_lower_bdr is None:
        release_time_lower_bdr = 0
    search_body = {
        "query": {"bool": {"filter": [
            {"range": {"release_time": {"gte": release_time_lower_bdr}}},
            {"term": {"platform.keyword": platform}},
        ]}}
    }
    total_hit, scan_resp = scan_crawler_url_register(search_body)
    if total_hit > 0:
        # Replaces the original manual append loop (and its unused
        # line counter) with a comprehension over the scan response.
        return [hit['_source'] for hit in scan_resp]
    return []