# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:18:09 2018

@author: hanye
"""
import json

import redis

from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register


def redis_path(redis_type=""):
    """Build a redis client for the requested environment.

    Parameters
    ----------
    redis_type : str
        ``"on_line"`` selects the password-protected instance; any other
        value (including the default empty string) selects the other,
        password-less instance.

    Returns
    -------
    redis.StrictRedis
        A client bound to db 19 of the selected host.
    """
    if redis_type == "on_line":
        # NOTE(review): credentials are hard-coded in source control; this
        # should be moved to configuration or the environment.
        return redis.StrictRedis(host='172.16.40.164', port=6379, db=19,
                                 password='ReDis!GmTx*0aN12')
    return redis.StrictRedis(host='172.18.51.10', port=6379, db=19)


def feed_url_into_redis(dict_Lst, expire=0, rds=redis_path):
    """Push each dict of ``dict_Lst`` into redis, keyed by its ``doc_id``.

    For every item, the JSON-serialised dict is pushed onto the list stored
    at key ``<doc_id>``, and the doc_id itself is pushed onto the list
    stored at the literal key ``"doc_id"``. Items that fail (for example a
    missing ``'doc_id'`` key or a connection error) are reported on stdout
    and skipped, so one bad item does not abort the batch.

    Parameters
    ----------
    dict_Lst : iterable of dict
        Items to push; each must contain a ``'doc_id'`` key and be
        JSON-serialisable.
    expire : int
        If truthy, passed to ``rds.expire`` for the ``<doc_id>`` key after
        the push. ``0`` (default) sets no expiry.
    rds : redis client or zero-argument factory
        Either a ready redis client or a callable returning one. The
        default is the ``redis_path`` factory, which is invoked here to
        obtain the default client.

    Returns
    -------
    bool
        Always ``True``.
    """
    # The default argument is the factory function itself, not a client.
    # Resolve it (or any other factory the caller passed) to a real client
    # before use; objects that already expose ``lpush`` are used as-is.
    if not hasattr(rds, 'lpush') and callable(rds):
        rds = rds()

    for data_dict in dict_Lst:
        try:
            doc_id = data_dict['doc_id']
            rds.lpush(doc_id, json.dumps(data_dict))
            rds.lpush("doc_id", doc_id)
            if expire:
                rds.expire(doc_id, expire)
        except Exception:
            # Best-effort: report and continue with the next item, but do
            # not swallow KeyboardInterrupt / SystemExit.
            print('Failed to push data into redis')
    print('Pushed data into redis')
    return True


def pull_url_from_es(platform, release_time_lower_bdr=None):
    """Pull url documents from the es index crawler-url-register.

    Url reforming is left to whichever method is responsible for pushing
    the urls into redis.

    Parameters
    ----------
    platform : str
        Value matched against the ``platform.keyword`` field.
    release_time_lower_bdr : int or None
        Lower bound (inclusive) for the ``release_time`` field; per the
        original author's note this is a timestamp in milliseconds.
        ``None`` (default) is treated as ``0``, i.e. no lower bound for
        non-negative timestamps.

    Returns
    -------
    list
        The ``_source`` of every hit, or an empty list when the reported
        total hit count is not positive.
    """
    if release_time_lower_bdr is None:
        release_time_lower_bdr = 0

    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"release_time":
                               {"gte": release_time_lower_bdr}}},
                    {"term": {"platform.keyword": platform}},
                ]
            }
        }
    }

    total_hit, scan_resp = scan_crawler_url_register(search_body)
    if total_hit > 0:
        return [line['_source'] for line in scan_resp]
    return []