add_releasers_video_data.py 2.28 KB
# -*- coding: utf-8 -*-
"""
Created on Thu Sep  6 09:22:24 2018

@author: fangyucheng
"""

import time
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format

def get_target_releaser_video_info(file_name,
                                   output_to_es_raw=True,
                                   es_index=None,
                                   doc_type=None,
                                   releaser_page_num_max=10000):
    start_time = int(time.time()*1e3)
    task_lst = trans_format.csv_to_lst_with_headline(file_name)
    for line in task_lst:
        releaserUrl = line['releaserUrl']
        platform = line['platform']
        crawler = get_crawler(platform=platform)
        crawler_initialization = crawler()
        if platform == 'haokan':
            try:
                crawler_initialization.releaser_page(releaserUrl=releaserUrl,
                                                     releaser_page_num_max=releaser_page_num_max,
                                                     output_to_es_raw=True,
                                                     es_index=es_index,
                                                     doc_type=doc_type,
                                                     fetchFavoriteCommnt=False)
            except:
                print(releaserUrl)
        else:
            try:
                crawler_initialization.releaser_page(releaserUrl=releaserUrl,
                                                     releaser_page_num_max=releaser_page_num_max,
                                                     output_to_es_raw=True,
                                                     es_index=es_index,
                                                     doc_type=doc_type)
            except:
                print(releaserUrl)
    end_time = int(time.time()*1e3)
    time_info = [start_time, end_time]
    return time_info

if __name__ =='__main__':
    get_time = get_target_releaser_video_info(file_name=r'/home/zhouyujiang/cuowu3.csv',
                                              releaser_page_num_max=1000,
                                              es_index='crawler-data-raw',
                                              doc_type='doc'
                                              )