Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
fd0899b4
Commit
fd0899b4
authored
Jul 30, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
044201c0
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
35 additions
and
13 deletions
+35
-13
redis_interact.py
crawler_sys/framework/redis_interact.py
+4
-4
update_data_in_target_releasers_multi_process_by_date_from_redis.py
...a_in_target_releasers_multi_process_by_date_from_redis.py
+2
-1
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+27
-6
output_results.py
crawler_sys/utils/output_results.py
+1
-1
func_send_email_with_file.py
maintenance/func_send_email_with_file.py
+1
-1
No files found.
crawler_sys/framework/redis_interact.py
View file @
fd0899b4
...
@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register
...
@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
)
rds
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
)
def
feed_url_into_redis
(
dict_Lst
,
platform
,
def
feed_url_into_redis
(
dict_Lst
,
expire
=
0
,
release_time_lower_bdr
=
None
,
):
batch_str
=
None
):
"""
"""
release_time_lower_bdr must be an int value represent
release_time_lower_bdr must be an int value represent
timestamp in milliseconds if given.
timestamp in milliseconds if given.
...
@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform,
...
@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform,
doc_id
=
data_dict
[
'doc_id'
]
doc_id
=
data_dict
[
'doc_id'
]
sadd_c
=
rds
.
lpush
(
doc_id
,
json
.
dumps
(
data_dict
))
sadd_c
=
rds
.
lpush
(
doc_id
,
json
.
dumps
(
data_dict
))
res
=
rds
.
lpush
(
"doc_id"
,
doc_id
)
res
=
rds
.
lpush
(
"doc_id"
,
doc_id
)
rds
.
expire
(
doc_id
,
259200
)
if
expire
:
rds
.
expire
(
doc_id
,
expire
)
except
:
except
:
print
(
'Failed to push data into redis'
)
print
(
'Failed to push data into redis'
)
print
(
'Pushed data into redis'
)
print
(
'Pushed data into redis'
)
...
...
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
View file @
fd0899b4
...
@@ -224,7 +224,8 @@ def single_thead(processe,name):
...
@@ -224,7 +224,8 @@ def single_thead(processe,name):
filepath
=
None
,
filepath
=
None
,
output_to_es_raw
=
output_to_es_raw
,
output_to_es_raw
=
output_to_es_raw
,
es_index
=
es_index
,
es_index
=
es_index
,
output_to_es_register
=
output_to_es_register
)
output_to_es_register
=
output_to_es_register
,
expire
=
86400
)
print
(
len
(
data_list
))
print
(
len
(
data_list
))
data_list
.
clear
()
data_list
.
clear
()
...
...
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
fd0899b4
...
@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict:
...
@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict:
print
(
res
)
print
(
res
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info_json
=
img_info
.
json
()
img_info_json
=
img_info
.
json
()
qiniu_img_list
.
append
((
res
+
"-w"
,
img_info_json
))
qiniu_img_list
.
append
((
res
+
"-w"
,
img_info_json
))
except
:
except
:
return
{}
return
{}
# 替换图片
# 替换图片
if
res_data
[
"platform"
]
==
"weibo"
:
if
res_data
[
"platform"
]
==
"weibo"
:
res_data
[
"qiniu_img_list"
]
=
qiniu_img_list
res_data
[
"qiniu_img_list"
]
=
qiniu_img_list
if
"http://t.cn/"
in
res_data
[
"title"
]:
res_data
[
"title"
]
=
res_data
[
"title"
]
.
split
(
"http://t.cn/"
)[
0
]
elif
res_data
[
"platform"
]
==
"douban"
:
elif
res_data
[
"platform"
]
==
"douban"
:
content
=
res_data
.
get
(
"content"
)
content
=
res_data
.
get
(
"content"
)
if
content
:
if
content
:
...
@@ -300,9 +302,12 @@ img_type = {
...
@@ -300,9 +302,12 @@ img_type = {
# "TIFF图片类型")
# "TIFF图片类型")
}
}
def
write_data_into_mysql
(
res_data
):
def
write_data_into_mysql
(
res_data
):
now_str
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
now_str
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
# 清洗数据为可以入库的格式
data
=
push_data_to_user
(
res_data
)
data
=
push_data_to_user
(
res_data
)
tractate_id
=
0
try
:
try
:
sql_query
=
"""insert into api_tractate
sql_query
=
"""insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
...
@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data):
...
@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data):
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
height
=
height
,
image_url_source
=
image_url_source
,
height
=
height
,
image_url_source
=
image_url_source
,
image_type
=
image_type
,
image_webp
=
img_info
[
0
],
image_type
=
image_type
,
image_webp
=
img_info
[
0
],
create_time
=
now_str
,
update_time
=
now_str
)
create_time
=
now_str
,
update_time
=
now_str
)
else
:
else
:
sql_query
=
"""
sql_query
=
"""
...
@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data):
...
@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data):
except
Exception
as
e
:
except
Exception
as
e
:
print
(
"commit error
%
s"
%
e
)
print
(
"commit error
%
s"
%
e
)
conn
.
rollback
()
conn
.
rollback
()
if
tractate_id
:
return
tractate_id
else
:
return
None
def
task_main
():
def
task_main
():
# 实例化数据判断规则 注意高优先级在前 低优先级在后
# 实例化数据判断规则 注意高优先级在前 低优先级在后
push_rule_class1
=
push_rule
(
favorite_count_ni
=
0.000000001
,
time_range
=
5
,
level
=
5
)
push_rule_class1
=
push_rule
(
comment_count_ni
=
20
,
time_range
=
5
,
level
=
5
)
push_rule_class2
=
push_rule
(
comment_count_ni
=
0.0000000001
,
time_range
=
5
,
level
=
3
)
push_rule_class2
=
push_rule
(
comment_count_ni
=
5
,
time_range
=
5
,
level
=
3
)
rules_list
=
[
rules_list
=
[
push_rule_class1
,
push_rule_class1
,
push_rule_class2
push_rule_class2
]
]
# 循环处理抓取数据,返回需要添加至后端的数据
# 循环处理抓取数据,返回需要添加至后端的数据
for
res_data
in
scan_from_redis
(
rules_list
):
for
res_data
in
scan_from_redis
(
rules_list
):
write_data_into_mysql
(
res_data
)
tractate_id
=
write_data_into_mysql
(
res_data
)
if
res_data
[
"level"
]
==
5
:
title_str
=
res_data
[
"platform"
]
+
"帖子内容审核"
body_str
=
"""
问好:
有一篇新的优秀内容需要审核,帖子号为{tractate_id}
内容如下:
{content}
"""
.
format
(
tractate_id
=
tractate_id
,
content
=
res_data
[
"content"
])
send_file_email
(
""
,
""
,
email_group
=
[
"<duanyingrong@igengmei.com>"
],
cc_group
=
[
"litao@igengmei.com"
],
email_msg_body_str
=
body_str
,
title_str
=
title_str
)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
# write_data_into_mysql(test)
cur
.
close
()
cur
.
close
()
conn
.
close
()
conn
.
close
()
task_main
()
if
__name__
==
"__main__"
:
task_main
()
crawler_sys/utils/output_results.py
View file @
fd0899b4
...
@@ -93,7 +93,7 @@ def output_result(result_Lst, platform,
...
@@ -93,7 +93,7 @@ def output_result(result_Lst, platform,
# feed url into redis
# feed url into redis
if
push_to_redis
:
if
push_to_redis
:
feed_url_into_redis
(
feed_url_into_redis
(
result_Lst
,
platform
)
result_Lst
,
expire
=
kwargs
.
get
(
"expire"
)
)
# output into file according to passed in parameters
# output into file according to passed in parameters
if
output_to_file
is
True
and
filepath
is
not
None
:
if
output_to_file
is
True
and
filepath
is
not
None
:
...
...
maintenance/func_send_email_with_file.py
View file @
fd0899b4
...
@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[],
...
@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[],
outer
[
'To'
]
=
','
.
join
(
email_group
)
outer
[
'To'
]
=
','
.
join
(
email_group
)
outer
[
'Cc'
]
=
','
.
join
(
cc_group
)
outer
[
'Cc'
]
=
','
.
join
(
cc_group
)
if
not
sender
:
if
not
sender
:
outer
[
'From'
]
=
'litao@igengmei.com
.cn
'
outer
[
'From'
]
=
'litao@igengmei.com'
else
:
else
:
outer
[
'From'
]
=
sender
outer
[
'From'
]
=
sender
mail_service
=
'smtp.exmail.qq.com'
mail_service
=
'smtp.exmail.qq.com'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment