# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:18:09 2018
@author: hanye
"""
import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
def redis_path(redis_type=""):
    """
    Build a StrictRedis connection for the requested environment.

    Parameters
    ----------
    redis_type : str, optional
        "on_line" selects the production host; any other value
        (including the default "") selects the test host.

    Returns
    -------
    redis.StrictRedis
        A client bound to db 19 on the chosen host.
    """
    if redis_type == "on_line":
        # Production instance requires authentication.
        return redis.StrictRedis(host='172.16.40.164',
                                 port=6379,
                                 db=19,
                                 password='ReDis!GmTx*0aN12')
    # Default: unauthenticated test instance.
    return redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0, rds=None):
    """
    Push url dicts into redis, best-effort per item.

    For each dict, the full JSON-serialized dict is pushed onto the
    redis list keyed by its 'doc_id', and the doc_id itself is pushed
    onto the shared "doc_id" list. Items that fail (e.g. missing
    'doc_id', unserializable values) are logged and skipped.

    Parameters
    ----------
    dict_Lst : iterable of dict
        Url dicts to push; each is expected to contain a 'doc_id' key.
    expire : int, optional
        TTL in seconds applied to each per-doc_id list; 0 (default)
        means no expiry is set.
    rds : redis.StrictRedis, optional
        Redis connection. Defaults to redis_path() — the original
        default was the redis_path *function itself*, which crashed
        with AttributeError on first use; calling it is the fix.

    Returns
    -------
    bool
        Always True (per-item failures are only logged).
    """
    if rds is None:
        rds = redis_path()
    for data_dict in dict_Lst:
        try:
            doc_id = data_dict['doc_id']
            rds.lpush(doc_id, json.dumps(data_dict))
            rds.lpush("doc_id", doc_id)
            if expire:
                rds.expire(doc_id, expire)
        except Exception as err:
            # Best-effort by design: keep going, but surface the cause
            # instead of the original bare except's silent message.
            print('Failed to push data into redis: %s' % err)
    print('Pushed data into redis')
    return True
def pull_url_from_es(platform, release_time_lower_bdr=None):
    """
    Pull urls for one platform from es index crawler-url-register.

    Url reforming things are done by the caller responsible for
    pushing urls into redis; this function only fetches.

    Parameters
    ----------
    platform : str
        Matched exactly against the 'platform.keyword' field.
    release_time_lower_bdr : int, optional
        Lower bound (gte) on 'release_time'; urls released earlier are
        excluded. Presumably a timestamp in milliseconds — TODO confirm
        against the index mapping. Defaults to 0, i.e. no lower bound.

    Returns
    -------
    list of dict
        The '_source' dict of every matching register document; empty
        list when nothing matches.
    """
    if release_time_lower_bdr is None:
        release_time_lower_bdr = 0
    search_body = {
        "query": {"bool": {"filter": [
            {"range": {"release_time": {"gte": release_time_lower_bdr}}},
            {"term": {"platform.keyword": platform}},
        ]}}
    }
    total_hit, scan_resp = scan_crawler_url_register(search_body)
    if total_hit > 0:
        # Replaces the original manual append loop (and its unused
        # line counter) with a comprehension over the scan response.
        return [hit['_source'] for hit in scan_resp]
    return []