Commit 42aa9e6d authored by litaolemo's avatar litaolemo

update redis 更换地址

parent c95a181f
......@@ -8,7 +8,7 @@ import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0,
......
......@@ -14,7 +14,7 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
......@@ -29,8 +29,8 @@ ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'
# index_target_releaser = 'search_keywords'
# doc_type_target_releaser = 'doc'
# index_target_releaser = 'test2'
# doc_type_target_releaser = 'keywrod'
......@@ -52,31 +52,82 @@ OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
def func_search_keywordlist(platform):
    """Return the fixed keyword table used to drive search-page crawling.

    This is the hard-coded replacement for the earlier Elasticsearch-backed
    lookup, which depended on ``index_target_releaser`` and
    ``doc_type_target_releaser``. The earlier implementation is kept, fully
    commented out, directly after this function.

    Args:
        platform: Accepted so existing callers keep working; the table does
            not vary by platform and the argument is not read.

    Returns:
        dict: every keyword below mapped to the integer 10.
        NOTE(review): the value is presumably consumed as the per-keyword
        page limit (``int(KEYWORD_dic[keyword])`` in the search task) —
        confirm against the caller.
    """
    keywords = [
        "比基尼线脱毛",
        "嗨体泪沟",
        "根据脸型选发型",
        "圆脸适合什么发型",
        "5热玛吉",
        "耳软骨假体鼻综合",
        "肉毒素去法令纹",
        "吸脂瘦腹部",
        "嗨体填充泪沟",
        "6d小脸针",
        "水剥离",
        "嗨体去颈纹",
        "胶原蛋白填充泪沟",
        "吸脂瘦全身",
        "肉毒素去狐臭",
        "吸脂瘦腰部",
        "fotona4d",
        "嘴综合",
        "胸部下垂矫正",
        "5g天使光雕",
        "唇综合",
        "SVF-gel脂肪胶",
        "嘴角上扬术",
        "嗨体注射",
        "脂肪填充修复",
        "比基尼脱毛",
        "lams吸脂",
        "脂肪填充面部年轻化",
        "嗨体",
        "吸脂祛副乳",
        "m22",
        "胸部提升",
        "fotona",
        "O型腿矫正",
        "肋骨鼻",
        "欣颜",
        "唯颜",
        "垫眉骨",
        "咬肌切除",
        "背部吸脂",
        "m22王者之冠",
        "bbl",
        "胶原蛋白填充祛黑眼圈",
    ]
    # Every keyword gets the same value; the shared int is immutable, so
    # dict.fromkeys is equivalent to assigning 10 key by key.
    return dict.fromkeys(keywords, 10)
# def func_search_keywordlist(platform):
# search_body = {"query": {"bool": {"filter": []}}}
# search_resp = es_framework.search(index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# body=search_body,
# size=0,
# request_timeout=100)
# total_hit = search_resp['hits']['total']
# releaser_dic = {}
# if total_hit > 0:
# print('Got %d releaser for platform %s.' % (total_hit, platform))
# scan_resp = scan(client=es_framework, query=search_body,
# index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# request_timeout=200)
# for line in scan_resp:
# try:
# title = line['_source']['title']
# page = line['_source']['page']
# releaser_dic[title] = page
# except:
# print('error in :', line)
# continue
# else:
# print('Got zero hits.')
# return releaser_dic
if OUTPUT_TO_ES_RAW is True:
......@@ -99,20 +150,13 @@ def search_page_task(platform, output_to_es_raw,
print("search keyword '%s' on platform %s" % (keyword, platform))
search_pages = int(KEYWORD_dic[keyword])
try:
if platform != "腾讯新闻":
crawler.search_page(keyword=keyword,
crawler.search_page(keyword=keyword,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
else:
crawler.search_video_page(keyword, None,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type,releaser=False)
except Exception as e:
print(e)
continue
......@@ -120,15 +164,16 @@ def search_page_task(platform, output_to_es_raw,
result = []
kwargs_dict = {
'output_to_es_raw': OUTPUT_TO_ES_RAW,
'output_to_es_register': OUTPUT_TO_ES_REGISTER,
'es_index': ES_INDEX,
'doc_type': DOC_TYPE,
'output_to_es_raw': OUTPUT_TO_ES_RAW,
'output_to_es_register': OUTPUT_TO_ES_REGISTER,
'es_index': ES_INDEX,
'doc_type': DOC_TYPE,
}
pool = Pool(processes=4)
for platform in PLATFORM_LIST:
res = pool.apply_async(func=search_page_task, args=(platform,OUTPUT_TO_ES_RAW,OUTPUT_TO_ES_REGISTER,ES_INDEX,DOC_TYPE))
res = pool.apply_async(func=search_page_task,
args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
result.append(res)
pool.close()
pool.join()
......
......@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
# # 连接数据库
# rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
rds_1 = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-n', '--max_page', default=30, type=int,
......
......@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
......
......@@ -23,7 +23,7 @@ import kdl, requests
# slave = sentinel.discover_slaves('ida_redis_master')
# # 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
def get_proxy_from_redis():
......
......@@ -107,6 +107,21 @@ class Crawler_zhihu():
print(requests_res.cookies.get_dict())
return requests_res.cookies.get_dict()
def parse_sigle_page(self, aid, data_dict, article_type):
    """Dispatch a single search entry by its type (work-in-progress stub).

    Only the ``"search_result"`` branch does anything so far: it reads the
    nested object type out of ``data_dict``. Every other branch is a
    placeholder, and nothing is returned yet.

    Args:
        aid: Identifier of the entry; not used by the current stub.
        data_dict: Entry payload. For ``"search_result"`` it must contain
            ``data_dict["object"]["type"]``.
        article_type: Top-level type tag of the entry.

    Returns:
        None.

    Raises:
        KeyError: if ``article_type`` is ``"search_result"`` and
            ``data_dict`` lacks ``["object"]["type"]``.
    """
    if article_type == "knowledge_ad":
        pass
    elif article_type == "zvideo":
        pass
    elif article_type == "search_result":
        # Fixed: this line used `==`, which compared and discarded the
        # result instead of narrowing the type to the nested object's type.
        article_type = data_dict["object"]["type"]
        # NOTE(review): this reads the "type" key into `url`, which looks
        # like a copy-paste slip; the intended key is not visible here, so
        # it is left as written — confirm against the API payload.
        url = data_dict["object"]["type"]
    elif article_type == "search_club":
        pass
    elif article_type == "relevant_query":
        pass
    else:
        pass
def search_article_page(self, keyword, search_pages_max=12,
output_to_es_raw=False,
output_to_es_register=False,
......
This diff is collapsed.
......@@ -4,7 +4,7 @@
import redis,time,json,datetime,sys
from maintenance.func_send_email_with_file import send_file_email
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19,decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19,decode_responses=True)
def write_email_task_to_redis(task_name=None,file_path=None, data_str=None, email_group=[],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment