Commit e870b0bd authored by litaolemo

update

parent 7e08e22c
......@@ -7,6 +7,9 @@ Created on Tue Dec 4 14:00:03 2018
import argparse
import configparser
import random
from concurrent.futures.process import ProcessPoolExecutor
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
......@@ -98,20 +101,28 @@ for platform in PLATFORM_LIST:
# Per-platform work: build the crawler, load the keyword -> page-count mapping,
# then fan the keyword searches out to a small pool of worker processes.
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform)

# random.shuffle() shuffles a mutable sequence in place and returns None, so
# its result must not be assigned back, and it cannot be applied to the
# mapping itself. Shuffle a list of the keywords and keep the mapping intact
# for the page-count lookup below.
keywords = list(KEYWORD_dic)
random.shuffle(keywords)

futures = []
# NOTE(review): ProcessPoolExecutor pickles the submitted callable and its
# arguments, so the crawler instance behind crawler.search_page must be
# picklable -- confirm for every registered crawler.
with ProcessPoolExecutor(max_workers=3) as executor:
    for keyword in keywords:
        print("search keyword '%s' on platform %s" % (keyword, platform))
        search_pages = int(KEYWORD_dic[keyword])
        future = executor.submit(crawler.search_page, keyword,
                                 search_pages_max=search_pages,
                                 output_to_es_raw=OUTPUT_TO_ES_RAW,
                                 output_to_es_register=OUTPUT_TO_ES_REGISTER,
                                 es_index=ES_INDEX,
                                 proxies_num=proxies_num)
        futures.append(future)

# Leaving the `with` block waits for every submitted search to finish.
# Report failures here; otherwise an exception raised in a worker stays
# hidden inside its future and is never seen.
for keyword, future in zip(keywords, futures):
    error = future.exception()
    if error is not None:
        print("search keyword '%s' on platform %s failed: %s"
              % (keyword, platform, error))
# absolute path of the config file on the server
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment