Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
4d36fce6
Commit
4d36fce6
authored
Jul 23, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
62991f3f
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
26 additions
and
26 deletions
+26
-26
es_target_releasers.py
crawler_sys/framework/es_target_releasers.py
+3
-5
update_data_in_target_releasers_multi_process_by_date_from_redis.py
...a_in_target_releasers_multi_process_by_date_from_redis.py
+14
-13
write_releasers_to_redis.py
crawler_sys/framework/write_releasers_to_redis.py
+9
-8
No files found.
crawler_sys/framework/es_target_releasers.py
View file @
4d36fce6
...
@@ -11,8 +11,7 @@ from elasticsearch.helpers import scan
...
@@ -11,8 +11,7 @@ from elasticsearch.helpers import scan
#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
es_framework
=
Elasticsearch
(
hosts
=
'192.168.17.11'
,
port
=
80
,
es_framework
=
Elasticsearch
(
hosts
=
'172.16.32.37'
,
port
=
9200
)
http_auth
=
(
'crawler'
,
'XBcasfo8dgfs'
))
index_target_releaser
=
'target_releasers'
index_target_releaser
=
'target_releasers'
doc_type_target_releaser
=
'doc'
doc_type_target_releaser
=
'doc'
...
@@ -38,7 +37,8 @@ def bulk_write_target_releasers(dict_Lst,
...
@@ -38,7 +37,8 @@ def bulk_write_target_releasers(dict_Lst,
if
write_counter
%
1000
==
0
or
write_counter
==
len
(
dict_Lst
):
if
write_counter
%
1000
==
0
or
write_counter
==
len
(
dict_Lst
):
print
(
'Writing into es
%
d/
%
d'
%
(
write_counter
,
len
(
dict_Lst
)))
print
(
'Writing into es
%
d/
%
d'
%
(
write_counter
,
len
(
dict_Lst
)))
if
bulk_write_body
!=
''
:
if
bulk_write_body
!=
''
:
es_framework
.
bulk
(
body
=
bulk_write_body
,
request_timeout
=
100
)
es_framework
.
bulk
(
index
=
index_target_releaser
,
body
=
bulk_write_body
,
request_timeout
=
100
)
def
get_releaserUrls_from_es
(
platform
,
def
get_releaserUrls_from_es
(
platform
,
releaser
=
None
,
releaser
=
None
,
...
@@ -57,7 +57,6 @@ def get_releaserUrls_from_es(platform,
...
@@ -57,7 +57,6 @@ def get_releaserUrls_from_es(platform,
search_body
[
'query'
][
'bool'
][
'filter'
]
.
append
(
frequency_dict
)
search_body
[
'query'
][
'bool'
][
'filter'
]
.
append
(
frequency_dict
)
# print(target_index,doc_type_target_releaser,search_body)
# print(target_index,doc_type_target_releaser,search_body)
search_resp
=
es_framework
.
search
(
index
=
target_index
,
search_resp
=
es_framework
.
search
(
index
=
target_index
,
doc_type
=
doc_type_target_releaser
,
body
=
search_body
,
body
=
search_body
,
size
=
0
,
size
=
0
,
request_timeout
=
100
)
request_timeout
=
100
)
...
@@ -67,7 +66,6 @@ def get_releaserUrls_from_es(platform,
...
@@ -67,7 +66,6 @@ def get_releaserUrls_from_es(platform,
print
(
'Got
%
d releaserUrls for platform
%
s.'
%
(
total_hit
,
platform
))
print
(
'Got
%
d releaserUrls for platform
%
s.'
%
(
total_hit
,
platform
))
scan_resp
=
scan
(
client
=
es_framework
,
query
=
search_body
,
scan_resp
=
scan
(
client
=
es_framework
,
query
=
search_body
,
index
=
target_index
,
index
=
target_index
,
doc_type
=
doc_type_target_releaser
,
request_timeout
=
200
)
request_timeout
=
200
)
for
line
in
scan_resp
:
for
line
in
scan_resp
:
try
:
try
:
...
...
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
View file @
4d36fce6
...
@@ -16,7 +16,7 @@ Data in es will be update when run this program once.
...
@@ -16,7 +16,7 @@ Data in es will be update when run this program once.
"""
"""
from
crawler.crawler_sys.site_crawler_by_redis
import
(
crawler_toutiao
,
crawler_v_qq
,
crawler_tudou
,
crawler_haokan
,
from
crawler.crawler_sys.site_crawler_by_redis
import
(
crawler_toutiao
,
crawler_v_qq
,
crawler_tudou
,
crawler_haokan
,
crawler_tencent_news
,
crawler_tencent_news
,
crawler_wangyi_news
,
crawler_kwai
,
crawler_douyin
,
toutiao_article
)
crawler_wangyi_news
,
crawler_kwai
,
crawler_douyin
,
toutiao_article
,
crawler_weibo
)
import
sys
import
sys
from
crawler.crawler_sys.utils.output_results
import
output_result
from
crawler.crawler_sys.utils.output_results
import
output_result
import
argparse
,
copy
,
datetime
,
time
import
argparse
,
copy
,
datetime
,
time
...
@@ -27,18 +27,18 @@ from concurrent.futures import ProcessPoolExecutor
...
@@ -27,18 +27,18 @@ from concurrent.futures import ProcessPoolExecutor
import
threading
import
threading
from
redis.sentinel
import
Sentinel
from
redis.sentinel
import
Sentinel
sentinel
=
Sentinel
([(
'192.168.17.65'
,
26379
),
#
sentinel = Sentinel([('192.168.17.65', 26379),
(
'192.168.17.66'
,
26379
),
#
('192.168.17.66', 26379),
(
'192.168.17.67'
,
26379
)
#
('192.168.17.67', 26379)
],
socket_timeout
=
1
)
#
], socket_timeout=1)
# 查看master节点
#
#
查看master节点
master
=
sentinel
.
discover_master
(
'ida_redis_master'
)
#
master = sentinel.discover_master('ida_redis_master')
# 查看slave 节点
#
#
查看slave 节点
slave
=
sentinel
.
discover_slaves
(
'ida_redis_master'
)
#
slave = sentinel.discover_slaves('ida_redis_master')
# 连接数据库
#
#
连接数据库
rds_1
=
sentinel
.
master_for
(
'ida_redis_master'
,
socket_timeout
=
1
,
db
=
1
,
decode_responses
=
True
)
#
rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
# rds_1 = redis.StrictRedis(host='192.168.17.60', port=6379, db=1
, decode_responses=True)
rds_1
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
,
decode_responses
=
True
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Specify a platform name.'
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Specify a platform name.'
)
parser
.
add_argument
(
'-n'
,
'--max_page'
,
default
=
30
,
type
=
int
,
parser
.
add_argument
(
'-n'
,
'--max_page'
,
default
=
30
,
type
=
int
,
...
@@ -83,7 +83,8 @@ platform_crawler_reg = {
...
@@ -83,7 +83,8 @@ platform_crawler_reg = {
# 'Mango': crawler_mango,
# 'Mango': crawler_mango,
'抖音'
:
crawler_douyin
.
Crawler_douyin
,
'抖音'
:
crawler_douyin
.
Crawler_douyin
,
"网易新闻"
:
crawler_wangyi_news
.
Crawler_wangyi_news
,
"网易新闻"
:
crawler_wangyi_news
.
Crawler_wangyi_news
,
"kwai"
:
crawler_kwai
.
Crawler_kwai
"kwai"
:
crawler_kwai
.
Crawler_kwai
,
"weibo"
:
crawler_weibo
.
Crawler_weibo
}
}
...
...
crawler_sys/framework/write_releasers_to_redis.py
View file @
4d36fce6
...
@@ -23,17 +23,18 @@ import redis,json
...
@@ -23,17 +23,18 @@ import redis,json
from
redis.sentinel
import
Sentinel
from
redis.sentinel
import
Sentinel
sentinel
=
Sentinel
([(
'192.168.17.65'
,
26379
),
#
sentinel = Sentinel([('192.168.17.65', 26379),
(
'192.168.17.66'
,
26379
),
#
('192.168.17.66', 26379),
(
'192.168.17.67'
,
26379
)
#
('192.168.17.67', 26379)
],
socket_timeout
=
0.5
)
#
],socket_timeout=0.5)
# 查看master节点
# 查看master节点
master
=
sentinel
.
discover_master
(
'ida_redis_master'
)
#
master = sentinel.discover_master('ida_redis_master')
# 查看slave 节点
# 查看slave 节点
slave
=
sentinel
.
discover_slaves
(
'ida_redis_master'
)
#
slave = sentinel.discover_slaves('ida_redis_master')
# 连接数据库
# 连接数据库
rds
=
sentinel
.
master_for
(
'ida_redis_master'
,
socket_timeout
=
0.5
,
db
=
1
,
decode_responses
=
True
)
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
# rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True)
rds
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
,
decode_responses
=
True
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Specify a platform name.'
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Specify a platform name.'
)
parser
.
add_argument
(
'-p'
,
'--platform'
,
default
=
[],
action
=
'append'
,
parser
.
add_argument
(
'-p'
,
'--platform'
,
default
=
[],
action
=
'append'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment