Commit af50a320 authored by litaolemo's avatar litaolemo

update

parent 07c010d6
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import argparse import argparse
from concurrent.futures.process import ProcessPoolExecutor
from elasticsearch.helpers import scan from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
...@@ -2685,7 +2687,7 @@ pages = ARGS.maxpage ...@@ -2685,7 +2687,7 @@ pages = ARGS.maxpage
def search_page_task(platform, output_to_es_raw, def search_page_task(platform, output_to_es_raw,
output_to_es_register, output_to_es_register,
es_index): es_index):
search_pages = []
initialize_crawler = get_crawler(platform) initialize_crawler = get_crawler(platform)
crawler = initialize_crawler() crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform) KEYWORD_dic = func_search_keywordlist(platform)
...@@ -2697,7 +2699,7 @@ def search_page_task(platform, output_to_es_raw, ...@@ -2697,7 +2699,7 @@ def search_page_task(platform, output_to_es_raw,
search_pages_max=search_pages, search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register, output_to_es_register=output_to_es_register,
es_index=es_index) es_index=es_index,proxies_num=3)
except Exception as e: except Exception as e:
print(e) print(e)
...@@ -2712,15 +2714,15 @@ kwargs_dict = { ...@@ -2712,15 +2714,15 @@ kwargs_dict = {
'es_index': ES_INDEX, 'es_index': ES_INDEX,
} }
pool = Pool(processes=3)
executor = ProcessPoolExecutor(max_workers=3)
futures = []
for platform in PLATFORM_LIST: for platform in PLATFORM_LIST:
search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX) platform_str = platform + "_process"
res = pool.apply_async(func=search_page_task, # start_crawler(processe,name)
args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)) # print(kwargs_dict)
result.append(res) future = executor.submit(search_page_task, platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
pool.close() futures.append(future)
pool.join() print('Processe %s start' % platform)
print('=================')
for i in result:
print(i.get())
...@@ -456,8 +456,7 @@ class Crawler_toutiao(): ...@@ -456,8 +456,7 @@ class Crawler_toutiao():
def search_page(self, keyword, search_pages_max=30, def search_page(self, keyword, search_pages_max=30,
output_to_es_raw=False, output_to_es_raw=False,
output_to_es_register=False, output_to_es_register=False,
es_index=None, es_index=None, proxies_num=3):
doc_type=None, proxies_num=3):
self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw, self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register, output_to_es_register=output_to_es_register,
es_index=es_index, es_index=es_index,
......
...@@ -148,8 +148,7 @@ class Crawler_zhihu(): ...@@ -148,8 +148,7 @@ class Crawler_zhihu():
def search_article_page(self, keyword, search_pages_max=10, def search_article_page(self, keyword, search_pages_max=10,
output_to_es_raw=False, output_to_es_raw=False,
output_to_es_register=False, output_to_es_register=False,
es_index=None, es_index=None,proxies_num=0,**kwargs):
doc_type=None,proxies_num=0,**kwargs):
res_cookies_dict = self.get_serach_page_cookies(keyword=keyword) res_cookies_dict = self.get_serach_page_cookies(keyword=keyword)
headers_search = { headers_search = {
...@@ -223,8 +222,7 @@ class Crawler_zhihu(): ...@@ -223,8 +222,7 @@ class Crawler_zhihu():
platform=self.platform, platform=self.platform,
output_to_es_raw=output_to_es_raw, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register, output_to_es_register=output_to_es_register,
es_index=es_index, es_index=es_index,)
doc_type=doc_type)
res_list.clear() res_list.clear()
if res_list != []: if res_list != []:
...@@ -232,8 +230,7 @@ class Crawler_zhihu(): ...@@ -232,8 +230,7 @@ class Crawler_zhihu():
platform=self.platform, platform=self.platform,
output_to_es_raw=output_to_es_raw, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register, output_to_es_register=output_to_es_register,
es_index=es_index, es_index=es_index)
doc_type=doc_type)
return res_list return res_list
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment