1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 17:20:57 2018
@author: hanye
"""
import time
from elasticsearch.helpers import scan
from crawler_sys.framework.es_ccr_index_defination import es_framework
from crawler_sys.framework.es_ccr_index_defination import index_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import doc_type_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import index_url_register
from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
from crawler_sys.framework.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from crawler_sys.framework.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from crawler_sys.framework.func_calculate_wangyi_news_id import calculate_wangyi_news_id
def scan_crawler_raw_index(search_body):
    """Run ``search_body`` against the crawler raw index.

    Delegates to ``scan_index`` with ``index_crawler_raw`` and
    ``doc_type_crawler_raw`` as the target.

    Args:
        search_body: Query body forwarded unchanged to ``scan_index``.

    Returns:
        The ``(total_hit, scan_resp)`` pair produced by ``scan_index``;
        ``scan_resp`` is ``None`` when the query matched nothing.
    """
    hit_count, hit_iter = scan_index(
        index=index_crawler_raw,
        doc_type=doc_type_crawler_raw,
        search_body=search_body,
    )
    return (hit_count, hit_iter)
def scan_crawler_url_register(search_body):
    """Run ``search_body`` against the URL register index.

    Delegates to ``scan_index`` with ``index_url_register`` and
    ``doc_type_url_register`` as the target.

    Args:
        search_body: Query body forwarded unchanged to ``scan_index``.

    Returns:
        The ``(total_hit, scan_resp)`` pair produced by ``scan_index``;
        ``scan_resp`` is ``None`` when the query matched nothing.
    """
    hit_count, hit_iter = scan_index(
        index=index_url_register,
        doc_type=doc_type_url_register,
        search_body=search_body,
    )
    return (hit_count, hit_iter)
def scan_index(index, doc_type, search_body):
    """Count the hits for a query and, if any exist, open a scan over them.

    A ``size=0`` search is issued first purely to obtain the total hit
    count; the matching documents are then exposed through
    ``elasticsearch.helpers.scan`` using the same query body.

    Args:
        index: Name of the index to query.
        doc_type: Document type to query.
        search_body: Query body, used for both the count search and the scan.

    Returns:
        A tuple ``(total_hit, scan_resp)``. ``total_hit`` is the hit count
        as an int. ``scan_resp`` is whatever ``scan`` returns for the query,
        or ``None`` when ``total_hit`` is zero.
    """
    search_resp = es_framework.search(index=index,
                                      doc_type=doc_type,
                                      body=search_body,
                                      size=0,
                                      request_timeout=100)
    total_hit = search_resp['hits']['total']
    # Older servers report the total as a plain int, while Elasticsearch 7+
    # reports an object such as {'value': n, 'relation': 'eq'}. Normalize to
    # an int so the formatting and comparison below work with either shape.
    # NOTE(review): on 7+ the value may be a lower bound when relation is
    # 'gte' (see track_total_hits) -- confirm whether callers need exactness.
    if isinstance(total_hit, dict):
        total_hit = total_hit['value']
    print('Index: %s total hit: %d'
          % (index, total_hit))
    if total_hit > 0:
        scan_resp = scan(client=es_framework,
                         query=search_body,
                         index=index,
                         doc_type=doc_type,
                         request_timeout=300)
    else:
        print('Zero hit.')
        # No scan is opened for an empty result; callers must handle None.
        scan_resp = None
    return (total_hit, scan_resp)