backend / crawler

Commit 42aa9e6d
Authored Aug 04, 2020 by litaolemo
update redis: change address
Parent: c95a181f
Showing 8 changed files with 107 additions and 47 deletions.
crawler_sys/framework/redis_interact.py (+1, -1)
crawler_sys/framework/search_page_multi_process.py (+87, -42)
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py (+1, -1)
crawler_sys/framework/write_releasers_to_redis.py (+1, -1)
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py (+1, -1)
crawler_sys/site_crawler/crawler_zhihu.py (+15, -0)
crawler_sys/utils/output_results.py (+0, -0)
maintenance/send_email_with_file_auto_task.py (+1, -1)
crawler_sys/framework/redis_interact.py

@@ -8,7 +8,7 @@ import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0,
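The same host swap repeats in the other files in this commit, always through the same redis.StrictRedis call. As a quick sanity check, the new endpoint can be exercised on its own; the sketch below reuses the host, port, and db from the hunk above, while the ping() round trip is an illustrative addition and not part of the committed code.

# Connectivity sketch for the new Redis endpoint (values taken from the diff above).
# The ping() check is illustrative only; it is not part of the committed code.
import redis
from redis.exceptions import ConnectionError as RedisConnectionError

rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)

try:
    print('redis reachable:', rds.ping())
except RedisConnectionError as err:
    print('redis unreachable:', err)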
crawler_sys/framework/search_page_multi_process.py
@@ -14,7 +14,7 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
#                     '/crawler_sys/framework/config'
#                     '/search_keywords.ini'),
#                     help=('config file absolute path'))
PARSER.add_argument('-p', '--platform',
                    default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"],
                    action='append',
                    help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                    help=('key_word_legal platform name is required'))
@@ -29,8 +29,8 @@ ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                             http_auth=('crawler', 'XBcasfo8dgfs'))
-index_target_releaser = 'search_keywords'
-doc_type_target_releaser = 'doc'
+# index_target_releaser = 'search_keywords'
+# doc_type_target_releaser = 'doc'
# index_target_releaser = 'test2'
# doc_type_target_releaser = 'keywrod'
@@ -52,31 +52,82 @@ OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
def func_search_keywordlist(platform):
-    search_body = {"query": {"bool": {"filter": []}}}
-    search_resp = es_framework.search(index=index_target_releaser,
-                                      doc_type=doc_type_target_releaser,
-                                      body=search_body,
-                                      size=0,
-                                      request_timeout=100)
-    total_hit = search_resp['hits']['total']
-    releaser_dic = {}
-    if total_hit > 0:
-        print('Got %d releaser for platform %s.' % (total_hit, platform))
-        scan_resp = scan(client=es_framework, query=search_body,
-                         index=index_target_releaser,
-                         doc_type=doc_type_target_releaser,
-                         request_timeout=200)
-        for line in scan_resp:
-            try:
-                title = line['_source']['title']
-                page = line['_source']['page']
-                releaser_dic[title] = page
-            except:
-                print('error in :', line)
-                continue
-    else:
-        print('Got zero hits.')
-    return releaser_dic
+    res_dic = {}
+    res_list = ["比基尼线脱毛", "嗨体泪沟", "根据脸型选发型", "圆脸适合什么发型", "5热玛吉",
+                "耳软骨假体鼻综合", "肉毒素去法令纹", "吸脂瘦腹部", "嗨体填充泪沟", "6d小脸针",
+                "水剥离", "嗨体去颈纹", "胶原蛋白填充泪沟", "吸脂瘦全身", "肉毒素去狐臭",
+                "吸脂瘦腰部", "fotona4d", "嘴综合", "胸部下垂矫正", "5g天使光雕", "唇综合",
+                "SVF-gel脂肪胶", "嘴角上扬术", "嗨体注射", "脂肪填充修复", "比基尼脱毛",
+                "lams吸脂", "脂肪填充面部年轻化", "嗨体", "吸脂祛副乳", "m22", "胸部提升",
+                "fotona", "O型腿矫正", "肋骨鼻", "欣颜", "唯颜", "垫眉骨", "咬肌切除",
+                "背部吸脂", "m22王者之冠", "bbl", "胶原蛋白填充祛黑眼圈",
+                ]
+    for l in res_list:
+        res_dic[l] = 10
+    return res_dic
+
+# def func_search_keywordlist(platform):
+#     search_body = {"query": {"bool": {"filter": []}}}
+#     search_resp = es_framework.search(index=index_target_releaser,
+#                                       doc_type=doc_type_target_releaser,
+#                                       body=search_body,
+#                                       size=0,
+#                                       request_timeout=100)
+#     total_hit = search_resp['hits']['total']
+#     releaser_dic = {}
+#     if total_hit > 0:
+#         print('Got %d releaser for platform %s.' % (total_hit, platform))
+#         scan_resp = scan(client=es_framework, query=search_body,
+#                          index=index_target_releaser,
+#                          doc_type=doc_type_target_releaser,
+#                          request_timeout=200)
+#         for line in scan_resp:
+#             try:
+#                 title = line['_source']['title']
+#                 page = line['_source']['page']
+#                 releaser_dic[title] = page
+#             except:
+#                 print('error in :', line)
+#                 continue
+#     else:
+#         print('Got zero hits.')
+#     return releaser_dic

if OUTPUT_TO_ES_RAW is True:
@@ -99,20 +150,13 @@ def search_page_task(platform, output_to_es_raw,
            print("search keyword '%s' on platform %s" % (keyword, platform))
            search_pages = int(KEYWORD_dic[keyword])
            try:
                if platform != "腾讯新闻":
                    crawler.search_page(keyword=keyword,
                                        search_pages_max=search_pages,
                                        output_to_es_raw=output_to_es_raw,
                                        output_to_es_register=output_to_es_register,
                                        es_index=es_index,
                                        doc_type=doc_type)
                else:
                    crawler.search_video_page(keyword, None,
                                              search_pages_max=search_pages,
                                              output_to_es_raw=output_to_es_raw,
                                              output_to_es_register=output_to_es_register,
                                              es_index=es_index,
                                              doc_type=doc_type,
                                              releaser=False)
            except Exception as e:
                print(e)
                continue
@@ -120,15 +164,16 @@ def search_page_task(platform, output_to_es_raw,
result = []
kwargs_dict = {'output_to_es_raw': OUTPUT_TO_ES_RAW,
               'output_to_es_register': OUTPUT_TO_ES_REGISTER,
               'es_index': ES_INDEX,
               'doc_type': DOC_TYPE,
               }
pool = Pool(processes=4)
for platform in PLATFORM_LIST:
    res = pool.apply_async(func=search_page_task,
                           args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER,
                                 ES_INDEX, DOC_TYPE))
    result.append(res)
pool.close()
pool.join()
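Taken together, the changes in this file swap the Elasticsearch-backed keyword lookup for a hard-coded list: func_search_keywordlist now returns a dict mapping each keyword to a page budget, and search_page_task reads that budget with int(KEYWORD_dic[keyword]) before dispatching to the platform crawler. A minimal single-process sketch of that flow follows; it stands in for the Pool.apply_async fan-out above, and the print call standing in for the actual crawler dispatch is illustrative only.

# Minimal sketch of the new keyword flow in this module (illustrative, not part of the commit).
# func_search_keywordlist() now skips Elasticsearch and returns {keyword: page_budget}.
KEYWORD_dic = func_search_keywordlist("toutiao")

for keyword, pages in KEYWORD_dic.items():
    search_pages = int(pages)  # every keyword currently gets a budget of 10 pages
    print("would crawl '%s' for up to %d search-result pages" % (keyword, search_pages))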
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
# # 连接数据库
# rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
-rds_1 = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-n', '--max_page', default=30, type=int,
crawler_sys/framework/write_releasers_to_redis.py
@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py
@@ -23,7 +23,7 @@ import kdl, requests
# slave = sentinel.discover_slaves('ida_redis_master')
# # 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
def get_proxy_from_redis():
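The body of get_proxy_from_redis() is not shown in this hunk, so the storage format of the proxy pool is unknown. The sketch below is assumption-laden illustration only: the key name 'proxy_pool', the srandmember call, and the "ip:port" string format are all hypothetical, not taken from the commit; only the connection parameters come from the diff above.

# Hypothetical sketch: key name, data layout, and srandmember usage are assumptions.
# It shows how a proxy string from this Redis db could feed a requests proxies dict.
import redis
import requests

rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)

def fetch_one_proxy(key='proxy_pool'):
    proxy = rds.srandmember(key)  # assumed: a set of "ip:port" strings
    if not proxy:
        return None
    return {'http': 'http://' + proxy, 'https': 'http://' + proxy}

# Example use (illustrative):
# resp = requests.get('http://example.com', proxies=fetch_one_proxy(), timeout=10)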
crawler_sys/site_crawler/crawler_zhihu.py
@@ -107,6 +107,21 @@ class Crawler_zhihu():
        print(requests_res.cookies.get_dict())
        return requests_res.cookies.get_dict()

+    def parse_sigle_page(self, aid, data_dict, article_type):
+        if article_type == "knowledge_ad":
+            pass
+        elif article_type == "zvideo":
+            pass
+        elif article_type == "search_result":
+            article_type == data_dict["object"]["type"]
+            url = data_dict["object"]["type"]
+        elif article_type == "search_club":
+            pass
+        elif article_type == "relevant_query":
+            pass
+        else:
+            pass
+
    def search_article_page(self, keyword, search_pages_max=12,
                            output_to_es_raw=False,
                            output_to_es_register=False,
crawler_sys/utils/output_results.py

(diff collapsed in the original view)
maintenance/send_email_with_file_auto_task.py
@@ -4,7 +4,7 @@
import redis, time, json, datetime, sys
from maintenance.func_send_email_with_file import send_file_email
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
def write_email_task_to_redis(task_name=None, file_path=None,
                              data_str=None, email_group=[],
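Only the signature of write_email_task_to_redis is visible in this hunk, so how the task is serialized into the connection above is not shown. A call might look like the sketch below; the argument values are invented for illustration, and it assumes the remaining parameters keep their defaults.

# Illustrative call only; argument values are made up and the Redis storage format
# is not visible in this hunk.
write_email_task_to_redis(task_name='daily_report',
                          file_path='/tmp/report.csv',
                          data_str=None,
                          email_group=['ops@example.com'])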