# -*- coding: utf-8 -*-
# @Time   : 2019/7/19 17:09
# @Author : litao
"""Video-platform search-page crawler entry point.

For each configured platform, build a {keyword: max_pages} mapping and run
that platform's crawler over the search pages of every keyword, writing the
results to Elasticsearch (raw index and/or register index).
"""
import argparse
from multiprocessing import Pool  # retained: used by the (optional) pooled driver below

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan  # retained: used by the ES-backed keyword source

from crawler.crawler_sys.framework.platform_crawler_register import get_crawler

PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# NOTE(review): with action='append' and a non-empty default, user-supplied
# -p values are APPENDED to the default list rather than replacing it
# (standard argparse gotcha). Preserved as-is — downstream may rely on it.
PARSER.add_argument('-p', '--platform', default=["zhihu", "weibo", "toutiao"],
                    action='append',
                    help='legal platform name is required')
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                    help='key_word_legal platform name is required')
PARSER.add_argument('-w', '--output_to_es_raw', default=False,
                    help='output to es raw')
PARSER.add_argument('-g', '--output_to_es_register', default=True,
                    help='output to es register')
PARSER.add_argument('-n', '--maxpage', default=20,
                    help='maxpage')
ARGS = PARSER.parse_args()

es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                             http_auth=('crawler', 'XBcasfo8dgfs'))

# Guard against an empty platform list so the driver loop below can never
# hit a NameError (the original left PLATFORM_LIST conditionally unbound).
PLATFORM_LIST = ARGS.platform if ARGS.platform else []
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register


def func_search_keywordlist(platform):
    """Return a ``{keyword: max_search_pages}`` dict for *platform*.

    Currently backed by a hard-coded keyword list, each capped at 10 pages.
    The *platform* argument is unused but kept for interface compatibility
    with the earlier ES-backed implementation of this function.
    """
    keywords = [
        "比基尼线脱毛", "嗨体泪沟", "根据脸型选发型", "圆脸适合什么发型",
        "5热玛吉", "耳软骨假体鼻综合", "肉毒素去法令纹", "吸脂瘦腹部",
        "嗨体填充泪沟", "6d小脸针", "水剥离", "嗨体去颈纹",
        "胶原蛋白填充泪沟", "吸脂瘦全身", "肉毒素去狐臭", "吸脂瘦腰部",
        "fotona4d", "嘴综合", "胸部下垂矫正", "5g天使光雕",
        "唇综合", "SVF-gel脂肪胶", "嘴角上扬术", "嗨体注射",
        "脂肪填充修复", "比基尼脱毛", "lams吸脂", "脂肪填充面部年轻化",
        "嗨体", "吸脂祛副乳", "m22", "胸部提升",
        "fotona", "O型腿矫正", "肋骨鼻", "欣颜",
        "唯颜", "垫眉骨", "咬肌切除", "背部吸脂",
        "m22王者之冠", "bbl", "胶原蛋白填充祛黑眼圈", "热玛吉",
        "热玛吉5代",
    ]
    return {keyword: 10 for keyword in keywords}


if OUTPUT_TO_ES_RAW is True:
    ES_INDEX = 'crawler-data-raw'
    DOC_TYPE = 'doc'
    print(ES_INDEX, DOC_TYPE)


def search_page_task(platform, output_to_es_raw, output_to_es_register, es_index):
    """Crawl the search pages of every keyword for *platform*.

    Instantiates the platform's crawler via ``get_crawler`` and calls its
    ``search_page`` once per keyword, capping each crawl at the keyword's
    configured page count. Results go to Elasticsearch per the flags.
    Errors on a single keyword are printed and skipped so one bad keyword
    cannot abort the whole run (best-effort by design).
    """
    crawler = get_crawler(platform)()
    keyword_dic = func_search_keywordlist(platform)
    for keyword, max_pages in keyword_dic.items():
        print("search keyword '%s' on platform %s" % (keyword, platform))
        try:
            crawler.search_page(keyword=keyword,
                                search_pages_max=int(max_pages),
                                output_to_es_raw=output_to_es_raw,
                                output_to_es_register=output_to_es_register,
                                es_index=es_index)
        except Exception as e:  # deliberate broad catch: keep crawling remaining keywords
            print(e)
            continue


if __name__ == "__main__":
    # __main__ guard added so the multiprocessing.Pool variant below can be
    # re-enabled safely (child processes re-import this module).
    ES_INDEX = "crawler-data-raw"
    result = []
    # pool = Pool(processes=4)
    for platform in PLATFORM_LIST:
        search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
        # res = pool.apply_async(func=search_page_task,
        #                        args=(platform, OUTPUT_TO_ES_RAW,
        #                              OUTPUT_TO_ES_REGISTER, ES_INDEX))
        # result.append(res)
    # pool.close()
    # pool.join()
    print('=================')
    for i in result:
        print(i.get())