Commit e870b0bd authored by litaolemo

update

parent 7e08e22c
...@@ -7,6 +7,9 @@ Created on Tue Dec 4 14:00:03 2018 ...@@ -7,6 +7,9 @@ Created on Tue Dec 4 14:00:03 2018
import argparse import argparse
import configparser import configparser
import random
from concurrent.futures.process import ProcessPoolExecutor
from elasticsearch.helpers import scan from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
...@@ -98,20 +101,28 @@ for platform in PLATFORM_LIST: ...@@ -98,20 +101,28 @@ for platform in PLATFORM_LIST:
# Body of the `for platform in PLATFORM_LIST:` loop: build the crawler for
# this platform, then run one search task per configured keyword in a small
# process pool and report any task that failed.
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform)

# random.shuffle() shuffles a mutable sequence in place and returns None, so
# its result must not be assigned back. Shuffle a list of the keywords and
# keep KEYWORD_dic intact for the page-count lookup below.
keywords = list(KEYWORD_dic)
random.shuffle(keywords)

futures = []
# The context manager waits for all submitted tasks and releases the worker
# processes even if submitting one of the keywords raises.
with ProcessPoolExecutor(max_workers=3) as executor:
    for keyword in keywords:
        print("search keyword '%s' on platform %s" % (keyword, platform))
        search_pages = int(KEYWORD_dic[keyword])
        # NOTE(review): submitting a bound method to a process pool requires
        # the crawler instance to be picklable -- confirm for every platform.
        future = executor.submit(crawler.search_page, keyword,
                                 search_pages_max=search_pages,
                                 output_to_es_raw=OUTPUT_TO_ES_RAW,
                                 output_to_es_register=OUTPUT_TO_ES_REGISTER,
                                 es_index=ES_INDEX, proxies_num=proxies_num)
        futures.append(future)

# An exception raised inside a worker only surfaces when result() is called.
# Print it and move on, as the previous sequential try/except did, so one
# failing keyword does not abort the remaining platforms.
for future in futures:
    try:
        future.result()
    except Exception as e:
        print(e)
# config file absolute path in serve # config file absolute path in serve
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini' # '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment