Commit 42aa9e6d authored by litaolemo

update redis: change server address

parent c95a181f
@@ -8,7 +8,7 @@ import redis, json
 from crawler_sys.framework.platform_redis_register import get_redis_list_name
 from crawler_sys.framework.es_crawler import scan_crawler_url_register

-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)

 def feed_url_into_redis(dict_Lst, expire=0,
...
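The recurring change in this commit swaps the hardcoded public Redis endpoint 154.8.190.251 for the internal address 172.18.51.10 in five files (db 19 for the crawler queues and email tasks, db 18 for the proxy pool). A minimal sketch for sanity-checking the new endpoint before rollout, assuming it is reachable from the crawler hosts (the connection parameters are the ones from the diff):

    import redis

    # Same parameters as the updated connection line above.
    rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
    print(rds.ping())  # True if the endpoint answers; raises ConnectionError otherwise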
@@ -14,7 +14,7 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
 #                              '/crawler_sys/framework/config'
 #                              '/search_keywords.ini'),
 #                     help=('config file absolute path'))
-PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
+PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
                     help=('legal platform name is required'))
 PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                     help=('key_word_legal platform name is required'))
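One behavior worth noting in the argument the commit reformats above: action='append' combined with a non-empty default makes argparse append user-supplied -p values to the default list rather than replace it, so the four default platforms are always crawled. A small sketch of this standard argparse behavior (not specific to this repo):

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument('-p', '--platform', action='append',
                   default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"])
    print(p.parse_args(['-p', 'zhihu']).platform)
    # ['toutiao', '腾讯新闻', '腾讯视频', 'new_tudou', 'zhihu']  (appended, not replaced)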
@@ -29,8 +29,8 @@ ARGS = PARSER.parse_args()
 es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                              http_auth=('crawler', 'XBcasfo8dgfs'))
-index_target_releaser = 'search_keywords'
-doc_type_target_releaser = 'doc'
+# index_target_releaser = 'search_keywords'
+# doc_type_target_releaser = 'doc'
 # index_target_releaser = 'test2'
 # doc_type_target_releaser = 'keywrod'

@@ -52,31 +52,82 @@ OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
 def func_search_keywordlist(platform):
-    search_body = {"query": {"bool": {"filter": []}}}
-    search_resp = es_framework.search(index=index_target_releaser,
-                                      doc_type=doc_type_target_releaser,
-                                      body=search_body,
-                                      size=0,
-                                      request_timeout=100)
-    total_hit = search_resp['hits']['total']
-    releaser_dic = {}
-    if total_hit > 0:
-        print('Got %d releaser for platform %s.' % (total_hit, platform))
-        scan_resp = scan(client=es_framework, query=search_body,
-                         index=index_target_releaser,
-                         doc_type=doc_type_target_releaser,
-                         request_timeout=200)
-        for line in scan_resp:
-            try:
-                title = line['_source']['title']
-                page = line['_source']['page']
-                releaser_dic[title] = page
-            except:
-                print('error in :', line)
-                continue
-    else:
-        print('Got zero hits.')
-    return releaser_dic
+    res_dic = {}
+    res_list = ["比基尼线脱毛",
+                "嗨体泪沟",
+                "根据脸型选发型",
+                "圆脸适合什么发型",
+                "5热玛吉",
+                "耳软骨假体鼻综合",
+                "肉毒素去法令纹",
+                "吸脂瘦腹部",
+                "嗨体填充泪沟",
+                "6d小脸针",
+                "水剥离",
+                "嗨体去颈纹",
+                "胶原蛋白填充泪沟",
+                "吸脂瘦全身",
+                "肉毒素去狐臭",
+                "吸脂瘦腰部",
+                "fotona4d",
+                "嘴综合",
+                "胸部下垂矫正",
+                "5g天使光雕",
+                "唇综合",
+                "SVF-gel脂肪胶",
+                "嘴角上扬术",
+                "嗨体注射",
+                "脂肪填充修复",
+                "比基尼脱毛",
+                "lams吸脂",
+                "脂肪填充面部年轻化",
+                "嗨体",
+                "吸脂祛副乳",
+                "m22",
+                "胸部提升",
+                "fotona",
+                "O型腿矫正",
+                "肋骨鼻",
+                "欣颜",
+                "唯颜",
+                "垫眉骨",
+                "咬肌切除",
+                "背部吸脂",
+                "m22王者之冠",
+                "bbl",
+                "胶原蛋白填充祛黑眼圈",
+                ]
+    for l in res_list:
+        res_dic[l] = 10
+    return res_dic
+
+# def func_search_keywordlist(platform):
+#     search_body = {"query": {"bool": {"filter": []}}}
+#     search_resp = es_framework.search(index=index_target_releaser,
+#                                       doc_type=doc_type_target_releaser,
+#                                       body=search_body,
+#                                       size=0,
+#                                       request_timeout=100)
+#     total_hit = search_resp['hits']['total']
+#     releaser_dic = {}
+#     if total_hit > 0:
+#         print('Got %d releaser for platform %s.' % (total_hit, platform))
+#         scan_resp = scan(client=es_framework, query=search_body,
+#                          index=index_target_releaser,
+#                          doc_type=doc_type_target_releaser,
+#                          request_timeout=200)
+#         for line in scan_resp:
+#             try:
+#                 title = line['_source']['title']
+#                 page = line['_source']['page']
+#                 releaser_dic[title] = page
+#             except:
+#                 print('error in :', line)
+#                 continue
+#     else:
+#         print('Got zero hits.')
+#     return releaser_dic
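The rewrite above drops the Elasticsearch lookup entirely: func_search_keywordlist now ignores its platform argument and returns a dict that maps each of the 43 hardcoded medical-aesthetics keywords to a fixed page depth of 10, which search_page_task later consumes via int(KEYWORD_dic[keyword]). A sketch of the returned shape:

    KEYWORD_dic = func_search_keywordlist('toutiao')
    # {'比基尼线脱毛': 10, '嗨体泪沟': 10, ..., '胶原蛋白填充祛黑眼圈': 10}
    # i.e. every keyword is searched to a maximum of 10 result pages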
 if OUTPUT_TO_ES_RAW is True:

@@ -99,20 +150,13 @@ def search_page_task(platform, output_to_es_raw,
         print("search keyword '%s' on platform %s" % (keyword, platform))
         search_pages = int(KEYWORD_dic[keyword])
         try:
-            if platform != "腾讯新闻":
-                crawler.search_page(keyword=keyword,
-                                    search_pages_max=search_pages,
-                                    output_to_es_raw=output_to_es_raw,
-                                    output_to_es_register=output_to_es_register,
-                                    es_index=es_index,
-                                    doc_type=doc_type)
-            else:
-                crawler.search_video_page(keyword, None,
-                                          search_pages_max=search_pages,
-                                          output_to_es_raw=output_to_es_raw,
-                                          output_to_es_register=output_to_es_register,
-                                          es_index=es_index,
-                                          doc_type=doc_type,releaser=False)
+            crawler.search_page(keyword=keyword,
+                                search_pages_max=search_pages,
+                                output_to_es_raw=output_to_es_raw,
+                                output_to_es_register=output_to_es_register,
+                                es_index=es_index,
+                                doc_type=doc_type)
         except Exception as e:
             print(e)
             continue
@@ -120,15 +164,16 @@ def search_page_task(platform, output_to_es_raw,

 result = []
 kwargs_dict = {
     'output_to_es_raw': OUTPUT_TO_ES_RAW,
     'output_to_es_register': OUTPUT_TO_ES_REGISTER,
     'es_index': ES_INDEX,
     'doc_type': DOC_TYPE,
 }
 pool = Pool(processes=4)
 for platform in PLATFORM_LIST:
-    res = pool.apply_async(func=search_page_task, args=(platform,OUTPUT_TO_ES_RAW,OUTPUT_TO_ES_REGISTER,ES_INDEX,DOC_TYPE))
+    res = pool.apply_async(func=search_page_task,
+                           args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
     result.append(res)
 pool.close()
 pool.join()
...
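A note on the pool block that ends above: the AsyncResult handles collected in result are never read back, so any exception raised in search_page_task outside its per-keyword try/except dies silently in the worker. A hedged sketch of draining the results after pool.join() to surface such failures (not part of this commit):

    for res in result:
        try:
            res.get(timeout=10)  # re-raises any exception from the worker process
        except Exception as e:
            print('search_page_task failed:', e)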
@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
 # # connect to the database
 # rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
-rds_1 = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-n', '--max_page', default=30, type=int,
...
@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
 # connect to the database
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-p', '--platform', default=[], action='append',
...
@@ -23,7 +23,7 @@ import kdl, requests
 # slave = sentinel.discover_slaves('ida_redis_master')
 # # connect to the database
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)

 def get_proxy_from_redis():
...
@@ -107,6 +107,21 @@ class Crawler_zhihu():
         print(requests_res.cookies.get_dict())
         return requests_res.cookies.get_dict()
+    def parse_sigle_page(self, aid, data_dict, article_type):
+        # Stub dispatcher: only the search_result branch extracts anything yet.
+        if article_type == "knowledge_ad":
+            pass
+        elif article_type == "zvideo":
+            pass
+        elif article_type == "search_result":
+            article_type = data_dict["object"]["type"]  # assignment; the commit's `==` here is a no-op
+            url = data_dict["object"]["type"]  # reads "type" again; a URL field is presumably intended
+        elif article_type == "search_club":
+            pass
+        elif article_type == "relevant_query":
+            pass
+        else:
+            pass
     def search_article_page(self, keyword, search_pages_max=12,
                             output_to_es_raw=False,
                             output_to_es_register=False,
...
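The parse_sigle_page method added to Crawler_zhihu above is mostly a placeholder: of the five named article types, only search_result extracts anything, and the rest fall through to pass. If the branches get filled in, a dispatch table would keep the method flat; a minimal sketch under that assumption (the handler wiring is hypothetical, not in the commit):

    # Hypothetical refactor of parse_sigle_page; mirrors what the commit's
    # search_result branch reads today (both values come from object["type"]).
    def parse_sigle_page(self, aid, data_dict, article_type):
        handlers = {
            "search_result": lambda d: (d["object"]["type"], d["object"]["type"]),
            # "knowledge_ad", "zvideo", "search_club", "relevant_query": no-ops so far
        }
        handler = handlers.get(article_type)
        return handler(data_dict) if handler else None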
This diff is collapsed.
@@ -4,7 +4,7 @@
 import redis,time,json,datetime,sys
 from maintenance.func_send_email_with_file import send_file_email

-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19,decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19,decode_responses=True)

 def write_email_task_to_redis(task_name=None,file_path=None, data_str=None, email_group=[],
...