search_page_single_process.py 4.52 KB
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  4 14:00:03 2018

@author: fangyucheng
"""

import argparse
import configparser
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler

PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
#                                              '/crawler_sys/framework/config'
#                                              '/search_keywords.ini'),
#                    help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
                    help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                    help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
                    help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
                    help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
                    help=('maxpage'))

ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                             http_auth=('crawler', 'XBcasfo8dgfs'))

index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'

if ARGS.platform != []:
    PLATFORM_LIST = ARGS.platform
# for platform in PLATFORM_LIST:
#     if platform not in legal_platform_name:
#         print("%s is not a legal platform name, "
#               "program will exit" % platform)
#         sys.exit(0)

# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')

OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register


def func_search_keywordlist(platform):
    search_body = {"query": {"bool": {"filter": []}}}
    search_resp = es_framework.search(index=index_target_releaser,
                                      doc_type=doc_type_target_releaser,
                                      body=search_body,
                                      size=0,
                                      request_timeout=100)
    total_hit = search_resp['hits']['total']
    releaser_dic = {}
    if total_hit > 0:
        print('Got %d releaser for platform %s.' % (total_hit, platform))
        scan_resp = scan(client=es_framework, query=search_body,
                         index=index_target_releaser,
                         doc_type=doc_type_target_releaser,
                         request_timeout=200)
        for line in scan_resp:
            try:
                title = line['_source']['title']
                page = line['_source']['page']
                releaser_dic[title] = page
            except:
                print('error in :', line)
                continue
    else:
        print('Got zero hits.')
    return releaser_dic


if OUTPUT_TO_ES_RAW is True:
    ES_INDEX = 'test2'
    DOC_TYPE = 'doc'
    print(ES_INDEX, DOC_TYPE)
pages = ARGS.maxpage

for platform in PLATFORM_LIST:
    search_pages = []
    initialize_crawler = get_crawler(platform)
    crawler = initialize_crawler()
    KEYWORD_dic = func_search_keywordlist(platform)

    for keyword in KEYWORD_dic:
        print("search keyword '%s' on platform %s" % (keyword, platform))
        search_pages = int(KEYWORD_dic[keyword])
        try:
            if platform != "腾讯新闻":
                crawler.search_page(keyword=keyword,
                                    search_pages_max=search_pages,
                                    output_to_es_raw=OUTPUT_TO_ES_RAW,
                                    output_to_es_register=OUTPUT_TO_ES_REGISTER,
                                    es_index=ES_INDEX,
                                    doc_type=DOC_TYPE)
            else:
                crawler.search_video_page(keyword, None,
                                          search_pages_max=search_pages,
                                          output_to_es_raw=OUTPUT_TO_ES_RAW,
                                          output_to_es_register=OUTPUT_TO_ES_REGISTER,
                                          es_index=ES_INDEX,
                                          doc_type=DOC_TYPE)
        except Exception as e:
            print(e)
            continue

# config file absolute path in serve
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'