# es_crawler.py 2.77 KB
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 17:20:57 2018

@author: hanye
"""

import time
from elasticsearch.helpers import scan
from crawler_sys.framework.es_ccr_index_defination import es_framework
from crawler_sys.framework.es_ccr_index_defination import index_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import doc_type_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import index_url_register
from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
from crawler_sys.framework.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from crawler_sys.framework.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from crawler_sys.framework.func_calculate_wangyi_news_id import calculate_wangyi_news_id

def scan_crawler_raw_index(search_body):
    """Run ``search_body`` against the crawler raw index.

    Thin wrapper that forwards to ``scan_index`` using the module-level
    ``index_crawler_raw`` / ``doc_type_crawler_raw`` settings.

    Args:
        search_body: query body handed on to ``scan_index`` unchanged.

    Returns:
        The ``(total_hit, scan_resp)`` tuple produced by ``scan_index``;
        ``scan_resp`` is ``None`` when nothing matched.
    """
    return scan_index(index=index_crawler_raw,
                      doc_type=doc_type_crawler_raw,
                      search_body=search_body)

def scan_crawler_url_register(search_body):
    """Run ``search_body`` against the url register index.

    Thin wrapper that forwards to ``scan_index`` using the module-level
    ``index_url_register`` / ``doc_type_url_register`` settings.

    Args:
        search_body: query body handed on to ``scan_index`` unchanged.

    Returns:
        The ``(total_hit, scan_resp)`` tuple produced by ``scan_index``;
        ``scan_resp`` is ``None`` when nothing matched.
    """
    return scan_index(index=index_url_register,
                      doc_type=doc_type_url_register,
                      search_body=search_body)


def scan_index(index, doc_type, search_body):
    """Count the hits for a query and, if there are any, start a scan.

    A ``size=0`` search is issued first through ``es_framework`` to read
    the total hit count; only when that count is positive is
    ``elasticsearch.helpers.scan`` called for the same query.

    Args:
        index: name of the index to query.
        doc_type: document type to query within ``index``.
        search_body: query body, used for both the count search and the scan.

    Returns:
        A ``(total_hit, scan_resp)`` tuple. ``total_hit`` is the hit count
        as an int. ``scan_resp`` is whatever ``scan`` returned, or ``None``
        when there were no hits (callers must check before iterating).
    """
    count_resp = es_framework.search(index=index,
                                     doc_type=doc_type,
                                     body=search_body,
                                     size=0,
                                     request_timeout=100)
    total_hit = count_resp['hits']['total']
    # Elasticsearch 7+ reports the total as {'value': N, 'relation': ...}
    # rather than a bare integer. Normalise it so the '%d' formatting and
    # the '> 0' comparison below work with either response shape.
    # NOTE(review): on 7+ the value may be a lower bound (relation 'gte')
    # unless track_total_hits is requested -- confirm if exact counts matter.
    if isinstance(total_hit, dict):
        total_hit = total_hit['value']
    print('Index: %s total hit: %d'
          % (index, total_hit))
    if total_hit > 0:
        scan_resp = scan(client=es_framework,
                         query=search_body,
                         index=index,
                         doc_type=doc_type,
                         request_timeout=300)
    else:
        print('Zero hit.')
        scan_resp = None

    return (total_hit, scan_resp)