Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
fd0899b4
Commit
fd0899b4
authored
4 years ago
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
044201c0
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
35 additions
and
13 deletions
+35
-13
redis_interact.py
crawler_sys/framework/redis_interact.py
+4
-4
update_data_in_target_releasers_multi_process_by_date_from_redis.py
...a_in_target_releasers_multi_process_by_date_from_redis.py
+2
-1
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+27
-6
output_results.py
crawler_sys/utils/output_results.py
+1
-1
func_send_email_with_file.py
maintenance/func_send_email_with_file.py
+1
-1
No files found.
crawler_sys/framework/redis_interact.py
View file @
fd0899b4
...
@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register
...
@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
)
rds
=
redis
.
StrictRedis
(
host
=
'154.8.190.251'
,
port
=
6379
,
db
=
19
)
def
feed_url_into_redis
(
dict_Lst
,
platform
,
def
feed_url_into_redis
(
dict_Lst
,
expire
=
0
,
release_time_lower_bdr
=
None
,
):
batch_str
=
None
):
"""
"""
release_time_lower_bdr must be an int value represent
release_time_lower_bdr must be an int value represent
timestamp in milliseconds if given.
timestamp in milliseconds if given.
...
@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform,
...
@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform,
doc_id
=
data_dict
[
'doc_id'
]
doc_id
=
data_dict
[
'doc_id'
]
sadd_c
=
rds
.
lpush
(
doc_id
,
json
.
dumps
(
data_dict
))
sadd_c
=
rds
.
lpush
(
doc_id
,
json
.
dumps
(
data_dict
))
res
=
rds
.
lpush
(
"doc_id"
,
doc_id
)
res
=
rds
.
lpush
(
"doc_id"
,
doc_id
)
rds
.
expire
(
doc_id
,
259200
)
if
expire
:
rds
.
expire
(
doc_id
,
expire
)
except
:
except
:
print
(
'Failed to push data into redis'
)
print
(
'Failed to push data into redis'
)
print
(
'Pushed data into redis'
)
print
(
'Pushed data into redis'
)
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
View file @
fd0899b4
...
@@ -224,7 +224,8 @@ def single_thead(processe,name):
...
@@ -224,7 +224,8 @@ def single_thead(processe,name):
filepath
=
None
,
filepath
=
None
,
output_to_es_raw
=
output_to_es_raw
,
output_to_es_raw
=
output_to_es_raw
,
es_index
=
es_index
,
es_index
=
es_index
,
output_to_es_register
=
output_to_es_register
)
output_to_es_register
=
output_to_es_register
,
expire
=
86400
)
print
(
len
(
data_list
))
print
(
len
(
data_list
))
data_list
.
clear
()
data_list
.
clear
()
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
fd0899b4
...
@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict:
...
@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict:
print
(
res
)
print
(
res
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info_json
=
img_info
.
json
()
img_info_json
=
img_info
.
json
()
qiniu_img_list
.
append
((
res
+
"-w"
,
img_info_json
))
qiniu_img_list
.
append
((
res
+
"-w"
,
img_info_json
))
except
:
except
:
return
{}
return
{}
# 替换图片
# 替换图片
if
res_data
[
"platform"
]
==
"weibo"
:
if
res_data
[
"platform"
]
==
"weibo"
:
res_data
[
"qiniu_img_list"
]
=
qiniu_img_list
res_data
[
"qiniu_img_list"
]
=
qiniu_img_list
if
"http://t.cn/"
in
res_data
[
"title"
]:
res_data
[
"title"
]
=
res_data
[
"title"
]
.
split
(
"http://t.cn/"
)[
0
]
elif
res_data
[
"platform"
]
==
"douban"
:
elif
res_data
[
"platform"
]
==
"douban"
:
content
=
res_data
.
get
(
"content"
)
content
=
res_data
.
get
(
"content"
)
if
content
:
if
content
:
...
@@ -300,9 +302,12 @@ img_type = {
...
@@ -300,9 +302,12 @@ img_type = {
# "TIFF图片类型")
# "TIFF图片类型")
}
}
def
write_data_into_mysql
(
res_data
):
def
write_data_into_mysql
(
res_data
):
now_str
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
now_str
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
# 清洗数据为可以入库的格式
data
=
push_data_to_user
(
res_data
)
data
=
push_data_to_user
(
res_data
)
tractate_id
=
0
try
:
try
:
sql_query
=
"""insert into api_tractate
sql_query
=
"""insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
...
@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data):
...
@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data):
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
height
=
height
,
image_url_source
=
image_url_source
,
height
=
height
,
image_url_source
=
image_url_source
,
image_type
=
image_type
,
image_webp
=
img_info
[
0
],
image_type
=
image_type
,
image_webp
=
img_info
[
0
],
create_time
=
now_str
,
update_time
=
now_str
)
create_time
=
now_str
,
update_time
=
now_str
)
else
:
else
:
sql_query
=
"""
sql_query
=
"""
...
@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data):
...
@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data):
except
Exception
as
e
:
except
Exception
as
e
:
print
(
"commit error
%
s"
%
e
)
print
(
"commit error
%
s"
%
e
)
conn
.
rollback
()
conn
.
rollback
()
if
tractate_id
:
return
tractate_id
else
:
return
None
def
task_main
():
def
task_main
():
# 实例化数据判断规则 注意高优先级在前 低优先级在后
# 实例化数据判断规则 注意高优先级在前 低优先级在后
push_rule_class1
=
push_rule
(
favorite_count_ni
=
0.000000001
,
time_range
=
5
,
level
=
5
)
push_rule_class1
=
push_rule
(
comment_count_ni
=
20
,
time_range
=
5
,
level
=
5
)
push_rule_class2
=
push_rule
(
comment_count_ni
=
0.0000000001
,
time_range
=
5
,
level
=
3
)
push_rule_class2
=
push_rule
(
comment_count_ni
=
5
,
time_range
=
5
,
level
=
3
)
rules_list
=
[
rules_list
=
[
push_rule_class1
,
push_rule_class1
,
push_rule_class2
push_rule_class2
]
]
# 循环处理抓取数据,返回需要添加至后端的数据
# 循环处理抓取数据,返回需要添加至后端的数据
for
res_data
in
scan_from_redis
(
rules_list
):
for
res_data
in
scan_from_redis
(
rules_list
):
write_data_into_mysql
(
res_data
)
tractate_id
=
write_data_into_mysql
(
res_data
)
if
res_data
[
"level"
]
==
5
:
title_str
=
res_data
[
"platform"
]
+
"帖子内容审核"
body_str
=
"""
问好:
有一篇新的优秀内容需要审核,帖子号为{tractate_id}
内容如下:
{content}
"""
.
format
(
tractate_id
=
tractate_id
,
content
=
res_data
[
"content"
])
send_file_email
(
""
,
""
,
email_group
=
[
"<duanyingrong@igengmei.com>"
],
cc_group
=
[
"litao@igengmei.com"
],
email_msg_body_str
=
body_str
,
title_str
=
title_str
)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
# write_data_into_mysql(test)
cur
.
close
()
cur
.
close
()
conn
.
close
()
conn
.
close
()
task_main
()
if
__name__
==
"__main__"
:
task_main
()
This diff is collapsed.
Click to expand it.
crawler_sys/utils/output_results.py
View file @
fd0899b4
...
@@ -93,7 +93,7 @@ def output_result(result_Lst, platform,
...
@@ -93,7 +93,7 @@ def output_result(result_Lst, platform,
# feed url into redis
# feed url into redis
if
push_to_redis
:
if
push_to_redis
:
feed_url_into_redis
(
feed_url_into_redis
(
result_Lst
,
platform
)
result_Lst
,
expire
=
kwargs
.
get
(
"expire"
)
)
# output into file according to passed in parameters
# output into file according to passed in parameters
if
output_to_file
is
True
and
filepath
is
not
None
:
if
output_to_file
is
True
and
filepath
is
not
None
:
...
...
This diff is collapsed.
Click to expand it.
maintenance/func_send_email_with_file.py
View file @
fd0899b4
...
@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[],
...
@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[],
outer
[
'To'
]
=
','
.
join
(
email_group
)
outer
[
'To'
]
=
','
.
join
(
email_group
)
outer
[
'Cc'
]
=
','
.
join
(
cc_group
)
outer
[
'Cc'
]
=
','
.
join
(
cc_group
)
if
not
sender
:
if
not
sender
:
outer
[
'From'
]
=
'litao@igengmei.com
.cn
'
outer
[
'From'
]
=
'litao@igengmei.com'
else
:
else
:
outer
[
'From'
]
=
sender
outer
[
'From'
]
=
sender
mail_service
=
'smtp.exmail.qq.com'
mail_service
=
'smtp.exmail.qq.com'
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment