# check_high_play_count_data_source_v_qq.py 3.64 KB
# -*- coding: utf-8 -*-
"""
Created on Wed Sep  5 17:52:53 2018

@author: fangyucheng
"""

from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq
from crawler_sys.utils.output_results import output_result
from crawler_sys.utils import Metaorphosis as meta
from crawler_sys.utils.output_log import output_log

# Module-level logger object built by the project's output_log helper;
# get_data_source calls its .info() once per crawled URL.
# NOTE(review): the name `logging` would shadow the stdlib module if that
# were ever imported in this file.
logging = output_log(page_category='video_page',
                     program_info='tencent')

def tran_input_data_to_lst(file_name, file_category='csv'):
    """Load a list of video URLs from ``file_name``.

    Args:
        file_name: Path of the input file; handed unchanged to the ``meta``
            loader selected by ``file_category``.
        file_category: ``'csv'`` reads rows with
            ``meta.csv_to_lst_whth_headline`` and keeps the ``'url'`` value of
            every row whose ``'data_provider'`` equals ``'CCR'``.
            ``'file'`` returns the result of ``meta.str_file_to_lst`` as is.

    Returns:
        The list of URLs, or ``None`` when ``file_category`` is neither
        ``'csv'`` nor ``'file'``.
    """
    if file_category == 'csv':
        video_info_lst = meta.csv_to_lst_whth_headline(file_name)
        url_lst = []
        for line in video_info_lst:
            try:
                if line['data_provider'] == 'CCR':
                    url_lst.append(line['url'])
            except (KeyError, TypeError):
                # Best effort: skip rows that lack the expected columns or
                # that cannot be indexed by column name. Only these lookup
                # errors are swallowed, so KeyboardInterrupt/SystemExit
                # still propagate.
                continue
        return url_lst
    if file_category == 'file':
        return meta.str_file_to_lst(file_name)
    # Unknown category: keep the historical behaviour of returning None.
    return None

# Build the URL list at import time; this list object is bound as the default
# ``url_lst`` argument of get_data_source below.
# NOTE(review): the hard-coded R: path means merely importing this module
# invokes the loader on that location -- confirm this is intended.
url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file')

# Single shared crawler instance; get_video_page is its video_page attribute,
# called once per URL with url=<page url>.
crawler = Crawler_v_qq()
get_video_page = crawler.video_page

def get_data_source(url_lst=url_lst,
                    output_to_file=False,
                    filepath=None,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    push_to_redis=False,
                    output_es_index=None,
                    output_doc_type=None):
    """Crawl each URL's video page and hand the results to ``output_result``.

    Results are accumulated and flushed in batches of 100; whatever is left
    after the last URL is flushed once more. Nothing is flushed when there
    are no pending results.

    Args:
        url_lst: Iterable of video page URLs. Defaults to the module-level
            list loaded at import time.
        output_to_file: Forwarded to ``output_result``.
        filepath: Accepted for interface compatibility but currently not
            forwarded to ``output_result``.
            NOTE(review): confirm whether ``output_result`` takes a file path
            and whether it should be passed along with ``output_to_file``.
        output_to_es_raw: Forwarded to ``output_result``.
        output_to_es_register: Forwarded to ``output_result``.
        push_to_redis: Forwarded to ``output_result``.
        output_es_index: Forwarded as ``es_index``, but only when
            ``output_doc_type`` is also given.
        output_doc_type: Forwarded as ``doc_type``, but only when
            ``output_es_index`` is also given.

    Returns:
        None.
    """
    batch_size = 100

    # Build the keyword arguments once instead of repeating the call four
    # times. es_index/doc_type are only passed when BOTH are provided, so
    # output_result's own defaults apply otherwise.
    output_kwargs = {
        'platform': '腾讯视频',
        'output_to_file': output_to_file,
        'output_to_es_raw': output_to_es_raw,
        'output_to_es_register': output_to_es_register,
        'push_to_redis': push_to_redis,
    }
    if output_es_index is not None and output_doc_type is not None:
        output_kwargs['es_index'] = output_es_index
        output_kwargs['doc_type'] = output_doc_type

    result_lst = []
    for url in url_lst:
        video_info = get_video_page(url=url)
        result_lst.append(video_info)
        logging.info('get_data at page %s' % url)
        if len(result_lst) >= batch_size:
            output_result(result_lst, **output_kwargs)
            result_lst.clear()

    # Flush the remainder. The original test `len(result_lst) != []` compared
    # an int with a list and was always true, so an empty batch was emitted
    # whenever the URL count was 0 or an exact multiple of the batch size.
    if result_lst:
        output_result(result_lst, **output_kwargs)
        result_lst.clear()

if __name__ == '__main__':
    # Script entry point: crawl the module-level url_lst (the default
    # argument) with ES raw output enabled, passing index 'test2' and
    # doc type 'fyc' through to output_result.
    get_data_source(output_to_es_raw=True,
                    output_es_index='test2',
                    output_doc_type='fyc')