Commit af50a320 authored by litaolemo's avatar litaolemo

update

parent 07c010d6
......@@ -4,6 +4,8 @@
# -*- coding: utf-8 -*-
import argparse
from concurrent.futures.process import ProcessPoolExecutor
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
......@@ -2685,7 +2687,7 @@ pages = ARGS.maxpage
def search_page_task(platform, output_to_es_raw,
output_to_es_register,
es_index):
search_pages = []
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform)
......@@ -2697,7 +2699,7 @@ def search_page_task(platform, output_to_es_raw,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index)
es_index=es_index,proxies_num=3)
except Exception as e:
print(e)
......@@ -2712,15 +2714,15 @@ kwargs_dict = {
'es_index': ES_INDEX,
}
pool = Pool(processes=3)
executor = ProcessPoolExecutor(max_workers=3)
futures = []
for platform in PLATFORM_LIST:
search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
res = pool.apply_async(func=search_page_task,
args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX))
result.append(res)
pool.close()
pool.join()
platform_str = platform + "_process"
# start_crawler(processe,name)
# print(kwargs_dict)
future = executor.submit(search_page_task, platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
futures.append(future)
print('Processe %s start' % platform)
print('=================')
for i in result:
print(i.get())
......@@ -456,8 +456,7 @@ class Crawler_toutiao():
def search_page(self, keyword, search_pages_max=30,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None, proxies_num=3):
es_index=None, proxies_num=3):
self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
......
......@@ -148,8 +148,7 @@ class Crawler_zhihu():
def search_article_page(self, keyword, search_pages_max=10,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None,proxies_num=0,**kwargs):
es_index=None,proxies_num=0,**kwargs):
res_cookies_dict = self.get_serach_page_cookies(keyword=keyword)
headers_search = {
......@@ -223,8 +222,7 @@ class Crawler_zhihu():
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
es_index=es_index,)
res_list.clear()
if res_list != []:
......@@ -232,8 +230,7 @@ class Crawler_zhihu():
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
es_index=es_index)
return res_list
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment