# -*- coding: utf-8 -*-
"""
Created on Wed Jun  6 18:18:09 2018

@author: hanye
"""

import redis

from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register

# Redis connection shared by all feed operations in this module.
rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=0)


def url_reformer(platform, url):
    """
    Reform a url according to its platform (to be implemented in the future).
    For example, http://www.toutiao.com/group/1234567890123456789 and
    http://www.365yg.com/u/1234567890123456789 are different strings but
    point to the same resource. They should be reformed into one unique
    url before being pushed into redis for further crawling.
    """
    reformed_url = url
    return reformed_url


def feed_url_into_redis(dict_Lst, platform,
                        release_time_lower_bdr=None,
                        batch_str=None):
    """
    Push urls from a list of data dicts into the redis set registered
    for the given platform.

    release_time_lower_bdr, if given, must be an int timestamp in
    milliseconds; any url released before it will not be pushed into
    redis. If release_time_lower_bdr is not given, all urls are pushed.

    Returns a tuple (redis_list_name, url_counter), or (None, None) if
    no redis list name is registered for the platform.
    """
    redis_list_name = get_redis_list_name(platform, batch_str)
    if redis_list_name is None:
        print('Failed to get correct redis list name '
              'in platform_redis_register for platform: %s' % platform)
        return (None, None)

    print('Feeding url into redis list %s ...' % redis_list_name)
    url_counter = 0
    for data_dict in dict_Lst:
        try:
            url = data_dict['url']
            url_reformed = url_reformer(platform, url)
            if release_time_lower_bdr is None:
                # No time filter: push every url.
                sadd_c = rds.sadd(redis_list_name, url_reformed)
                url_counter += sadd_c
            else:
                # Only push urls released on or after the lower boundary.
                url_release_time = data_dict['release_time']
                if url_release_time >= release_time_lower_bdr:
                    sadd_c = rds.sadd(redis_list_name, url_reformed)
                    url_counter += sadd_c
        except Exception:
            print('Failed to push url into redis, possibly because the '
                  'url or release_time field is missing, or release_time '
                  'has a wrong type. The failed data dict is: \n %s'
                  % data_dict)
    print('Pushed %d urls into redis' % url_counter)
    return (redis_list_name, url_counter)


def pull_url_from_es(platform, release_time_lower_bdr=None):
    """
    Pull urls from the es index crawler-url-register. Url reforming is
    left to the function responsible for pushing urls into redis.
    """
    if release_time_lower_bdr is None:
        release_time_lower_bdr = 0
    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"release_time": {"gte": release_time_lower_bdr}}},
                    {"term": {"platform.keyword": platform}}
                ]
            }
        }
    }
    total_hit, scan_resp = scan_crawler_url_register(search_body)
    batch_url_Lst = []
    if total_hit > 0:
        for line in scan_resp:
            line_d = line['_source']
            batch_url_Lst.append(line_d)
    return batch_url_Lst
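

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumption, not part of the original module): shows
# how pull_url_from_es and feed_url_into_redis are expected to be chained.
# The platform name 'toutiao' and the millisecond timestamp below are
# illustrative placeholders only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    example_platform = 'toutiao'        # hypothetical platform name
    example_lower_bdr = 1528243200000   # illustrative ms timestamp

    # Pull registered urls for the platform from es, filtered by release_time.
    url_dict_Lst = pull_url_from_es(example_platform,
                                    release_time_lower_bdr=example_lower_bdr)

    # Feed the pulled urls into the redis set for that platform/batch.
    list_name, pushed = feed_url_into_redis(url_dict_Lst,
                                            example_platform,
                                            release_time_lower_bdr=example_lower_bdr,
                                            batch_str=None)
    print('redis list: %s, pushed: %s' % (list_name, pushed))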