Commit 4d36fce6 authored by litaolemo

update

parent 62991f3f
...@@ -11,8 +11,7 @@ from elasticsearch.helpers import scan ...@@ -11,8 +11,7 @@ from elasticsearch.helpers import scan
#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0) #rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
es_framework = Elasticsearch(hosts='192.168.17.11', port=80, es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'target_releasers' index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc' doc_type_target_releaser = 'doc'
...@@ -38,7 +37,8 @@ def bulk_write_target_releasers(dict_Lst, ...@@ -38,7 +37,8 @@ def bulk_write_target_releasers(dict_Lst,
if write_counter%1000==0 or write_counter==len(dict_Lst): if write_counter%1000==0 or write_counter==len(dict_Lst):
print('Writing into es %d/%d' % (write_counter, len(dict_Lst))) print('Writing into es %d/%d' % (write_counter, len(dict_Lst)))
if bulk_write_body!='': if bulk_write_body!='':
es_framework.bulk(body=bulk_write_body, request_timeout=100) es_framework.bulk(index=index_target_releaser,body=bulk_write_body, request_timeout=100)
def get_releaserUrls_from_es(platform, def get_releaserUrls_from_es(platform,
releaser=None, releaser=None,
...@@ -57,7 +57,6 @@ def get_releaserUrls_from_es(platform, ...@@ -57,7 +57,6 @@ def get_releaserUrls_from_es(platform,
search_body['query']['bool']['filter'].append(frequency_dict) search_body['query']['bool']['filter'].append(frequency_dict)
# print(target_index,doc_type_target_releaser,search_body) # print(target_index,doc_type_target_releaser,search_body)
search_resp=es_framework.search(index=target_index, search_resp=es_framework.search(index=target_index,
doc_type=doc_type_target_releaser,
body=search_body, body=search_body,
size=0, size=0,
request_timeout=100) request_timeout=100)
...@@ -67,7 +66,6 @@ def get_releaserUrls_from_es(platform, ...@@ -67,7 +66,6 @@ def get_releaserUrls_from_es(platform,
print('Got %d releaserUrls for platform %s.' % (total_hit, platform)) print('Got %d releaserUrls for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body, scan_resp = scan(client=es_framework, query=search_body,
index=target_index, index=target_index,
doc_type=doc_type_target_releaser,
request_timeout=200) request_timeout=200)
for line in scan_resp: for line in scan_resp:
try: try:
......
...@@ -16,7 +16,7 @@ Data in es will be update when run this program once. ...@@ -16,7 +16,7 @@ Data in es will be update when run this program once.
""" """
from crawler.crawler_sys.site_crawler_by_redis import (crawler_toutiao, crawler_v_qq, crawler_tudou, crawler_haokan, from crawler.crawler_sys.site_crawler_by_redis import (crawler_toutiao, crawler_v_qq, crawler_tudou, crawler_haokan,
crawler_tencent_news, crawler_tencent_news,
crawler_wangyi_news, crawler_kwai, crawler_douyin,toutiao_article) crawler_wangyi_news, crawler_kwai, crawler_douyin,toutiao_article,crawler_weibo)
import sys import sys
from crawler.crawler_sys.utils.output_results import output_result from crawler.crawler_sys.utils.output_results import output_result
import argparse, copy, datetime, time import argparse, copy, datetime, time
...@@ -27,18 +27,18 @@ from concurrent.futures import ProcessPoolExecutor ...@@ -27,18 +27,18 @@ from concurrent.futures import ProcessPoolExecutor
import threading import threading
from redis.sentinel import Sentinel from redis.sentinel import Sentinel
sentinel = Sentinel([('192.168.17.65', 26379), # sentinel = Sentinel([('192.168.17.65', 26379),
('192.168.17.66', 26379), # ('192.168.17.66', 26379),
('192.168.17.67', 26379) # ('192.168.17.67', 26379)
], socket_timeout=1) # ], socket_timeout=1)
# 查看master节点 # # 查看master节点
master = sentinel.discover_master('ida_redis_master') # master = sentinel.discover_master('ida_redis_master')
# 查看slave 节点 # # 查看slave 节点
slave = sentinel.discover_slaves('ida_redis_master') # slave = sentinel.discover_slaves('ida_redis_master')
# 连接数据库 # # 连接数据库
rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True) # rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
# rds_1 = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True) rds_1 = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.') parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-n', '--max_page', default=30, type=int, parser.add_argument('-n', '--max_page', default=30, type=int,
...@@ -83,7 +83,8 @@ platform_crawler_reg = { ...@@ -83,7 +83,8 @@ platform_crawler_reg = {
# 'Mango': crawler_mango, # 'Mango': crawler_mango,
'抖音': crawler_douyin.Crawler_douyin, '抖音': crawler_douyin.Crawler_douyin,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news, "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai "kwai": crawler_kwai.Crawler_kwai,
"weibo": crawler_weibo.Crawler_weibo
} }
......
...@@ -23,17 +23,18 @@ import redis,json ...@@ -23,17 +23,18 @@ import redis,json
from redis.sentinel import Sentinel from redis.sentinel import Sentinel
sentinel = Sentinel([('192.168.17.65', 26379), # sentinel = Sentinel([('192.168.17.65', 26379),
('192.168.17.66', 26379), # ('192.168.17.66', 26379),
('192.168.17.67', 26379) # ('192.168.17.67', 26379)
],socket_timeout=0.5) # ],socket_timeout=0.5)
# 查看master节点 # 查看master节点
master = sentinel.discover_master('ida_redis_master') # master = sentinel.discover_master('ida_redis_master')
# 查看slave 节点 # 查看slave 节点
slave = sentinel.discover_slaves('ida_redis_master') # slave = sentinel.discover_slaves('ida_redis_master')
# 连接数据库 # 连接数据库
rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True) # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
# rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.') parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append', parser.add_argument('-p', '--platform', default=[], action='append',
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment