connect_with_redis.py 8.44 KB
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 11 00:38:51 2018

@author: fangyucheng
"""

import json
#import time
import redis

rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=13)

#push and retrieve list url into redis
def push_list_url_to_redis(platform, result_lst):
    """push a list of url(only url, type str) into a redis list
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = "%s_list_url" % platform
    for line in result_lst:
        rds.lpush(key, line)
    print("the length of %s is %s" % (key, rds.llen(key)))

def retrieve_list_url_from_redis(platform, retrieve_count=30):
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_list_url' % platform
    count = 0
    result_lst = []
    while retrieve_count > count:
        url_bytes = rds.rpop(key)
        if url_bytes is None:
            print("retrieve %s list urls from redis" % len(result_lst))
            return result_lst
        else:
            url = url_bytes.decode("utf-8").replace("\'", "\"")
            result_lst.append(url)
            count += 1
    print("retrieve %s list urls from redis" % len(result_lst))
    return result_lst
#end of pushing and retrieving list url into redis

#push and retrieve list page html
def push_list_page_html_to_redis(platform, result_lst):
    """
    push download list page html to redis,
    it only used for v_qq now
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_list_page_html' % platform
    for line in result_lst:
        rds.lpush(key, line)
    print("the length of lst_page_html is %s" % rds.llen(key))

def retrieve_list_page_html_from_redis(platform):
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_list_page_html' % platform
    lst_page_html_byte = rds.rpop(key)
    lst_page_html_str = lst_page_html_byte.decode("utf-8").replace("\'", "\"")
    return lst_page_html_str
#end of pushing and retrieving list page html


#this is to push url into redis set only url
def push_video_url_to_redis_set(platform, url_lst):
    """
    push url to redis set
    it usually be used in renewing video page play_count 
    and special platform list page crawler such as new_tudou
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_video_url' % platform
    for line in url_lst:
        rds.sadd(key, line)
    print("the length of %s set is %s" % (key, rds.scard(key)))

def retrieve_video_url_from_redis_set(platform, retrieve_count=90):
    """
    retrieve video url from redis set
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_video_url' % platform
    url_list = []
    count = 0
    while retrieve_count > count:
        url_byte = rds.spop(key)
        if url_byte is None:
            print("total output platform %s %s urls"
                  % (platform, len(url_list)))
            return url_list
        else:
            url_str = url_byte.decode('utf-8').replace("\'", "\"")
            url_list.append(url_str)
            count += 1
    print("total output platform %s %s urls" % (platform, len(url_list)))
    return url_list
#end of above part

#push and retireve url dict to redis
def push_url_dict_lst_to_redis_set(platform, result_lst):
    """
    push a dict with url to redis,
    generally, the dict is from list page
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_url_dict' % platform
    for line in result_lst:
        rds.sadd(key, line)
    print("the length of %s urldict set is %s" % (key, rds.scard(key)))

def retrieve_url_dict_from_redis_set(platform, retrieve_count=90):
    """
    retrieve a dict with url from redis
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_url_dict' % platform
    count = 0
    result_lst = []
    while retrieve_count > count:
        url_dic_bytes = rds.spop(key)
        if url_dic_bytes is None:
            print("total output %s url dicts" % len(result_lst))
            return result_lst
        else:
            url_dic_str = url_dic_bytes.decode('utf-8').replace("\'", "\"")
            url_dic = json.loads(url_dic_str)
            count += 1
            result_lst.append(url_dic)
    print("total output %s from %s urldicts" % (len(result_lst), platform))
    return result_lst
#end of push and retireve url dict to redis

#push and retrieve video page html
def push_video_page_html_to_redis(platform, result_lst):
    """
    push download video page html to redis
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_video_page_html' % platform
    for line in result_lst:
        rds.lpush(key, line)
    print("the length of %s list is %s" % (key, rds.llen(key)))

def retrieve_video_page_html_from_redis(platform):
    """
    retrieve html both video page and list page from redis
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_video_page_html' % platform
    video_html_bytes = rds.rpop(key)
    video_html_str = video_html_bytes.decode("utf-8")
    return video_html_str
#end of push and retrieve video page html

def length_of_lst(key):
    """
    To get the length of a redis list
    """
    length = rds.llen(key)
    return length


def push_error_html_to_redis(error_page):
    """
    Used for asynchronous crawler, to push error list page into redis
    """
    rds.lpush('error', error_page)



"""
this part is for renew video data
"""
#set for url
platform_redis_set_reg = {'toutiao': 'toutiao_url_set',
                          '腾讯视频': 'v_qq_url_set',
                          'youku': 'youku_url_set',
                          'iqiyi': 'iqiyi_url_set',
                          }
#list for html
platform_redis_lst_reg = {'toutiao': 'toutiao_html_lst',
                          '腾讯视频': 'v_qq_html_lst',
                          'youku': 'youku_html_lst',
                          'iqiyi': 'iqiyi_html_lst'}


#def retrieve_video_url_from_redis_set(platform, retrieve_count=90):
#    count = 0
#    result_lst = []
#    redis_key = platform_redis_set_reg[platform]
#    while retrieve_count > count:
#        url_bytes = rds.spop(redis_key)
#        if url_bytes is None:
#            print("total output %s urls" % len(result_lst))
#            return result_lst
#        else:
#            url_str = url_bytes.decode('utf-8').replace("\'", "\"")
#            count += 1
#            result_lst.append(url_str)
#    print("total output %s urls" % len(result_lst))
#    return result_lst

def push_video_page_html_to_redis_renew(platform, result_lst):
    redis_key = platform_redis_lst_reg[platform]
    for line in result_lst:
        rds.lpush(redis_key, line)
    print("the length of %s is %s" % (redis_key, rds.llen(redis_key)))

def retrieve_video_html_from_redis_renew(platform):
    redis_key = platform_redis_lst_reg[platform]
    video_html_bytes = rds.rpop(redis_key)
    video_html_str = video_html_bytes.decode("utf-8").replace("\'", "\"")
    return video_html_str

def length_of_set(key):
    length = rds.scard(key)
    return length

#this is for iqiyi
def iqiyi_push(video_dict):
    key = 'iqiyi_video_info' 
    rds.lpush(key, video_dict)
    print("the length of iqiyi list is %s" % rds.llen(key))

def iqiyi_retrieve(retrieve_count=90):
    key = 'iqiyi_video_info'
    info_list = []
    count = 0
    while retrieve_count > count:
        info_dict_byte = rds.rpop(key)
        if info_dict_byte is None:
            print("total output %s urls" % len(info_list))
            return info_list
        else:
            info_str = info_dict_byte.decode('utf-8').replace("\'", "\"")
            info_dict = json.loads(info_str)
            info_list.append(info_dict)
            count += 1
    print("total output %s urls" % len(info_list))
    return info_list

#this is a test purpose
def push_pid_to_redis_set(platform, step, value):
    """step is to determine which step the crawler is working on,
    such as parse_list_page, download_video_page, parse_video_page
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_%s_pid' % (platform, step)
    rds.sadd(key, value)
    print('push %s to redis set %s' % (value, key))

def delete_pid_from_redis_set(platform, step, value):
    """step is to determine which step the crawler is working on,
    such as parse_list_page, download_video_page, parse_video_page
    """
    if platform == '腾讯视频':
        platform = 'v_qq'
    key = '%s_%s_pid' % (platform, step)
    rds.srem(key, value)
    print('delete %s from redis set %s' % (value, key))