Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
044201c0
Commit
044201c0
authored
Jul 29, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update 新增热点帖子抽取存库脚本
parent
699ae037
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
53 additions
and
25 deletions
+53
-25
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+53
-25
test_read_config.py
test/test_read_config.py
+0
-0
No files found.
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
044201c0
...
...
@@ -247,14 +247,18 @@ def push_data_to_user(res_data: Dict) -> Dict:
:return:
"""
qiniu_img_list
=
[]
content
=
""
if
res_data
[
"img_list"
]:
for
img_url
in
res_data
[
"img_list"
]:
img_wb
=
retry_get_url
(
img_url
)
.
content
res
=
upload
(
img_wb
)
print
(
res
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info_json
=
img_info
.
json
()
qiniu_img_list
.
append
((
res
,
img_info_json
))
try
:
img_wb
=
retry_get_url
(
img_url
)
.
content
res
=
upload
(
img_wb
)
print
(
res
)
img_info
=
retry_get_url
(
res
+
"-imageinfo"
)
img_info_json
=
img_info
.
json
()
qiniu_img_list
.
append
((
res
+
"-w"
,
img_info_json
))
except
:
return
{}
# 替换图片
if
res_data
[
"platform"
]
==
"weibo"
:
...
...
@@ -263,8 +267,10 @@ def push_data_to_user(res_data: Dict) -> Dict:
content
=
res_data
.
get
(
"content"
)
if
content
:
for
count
,
img_url
in
enumerate
(
res_data
[
"img_list"
]):
content
.
replace
(
img_url
,
qiniu_img_list
[
count
][
0
])
# print(qiniu_img_list[count][0])
content
=
content
.
replace
(
img_url
,
qiniu_img_list
[
count
][
0
])
res_data
[
"qiniu_img_list"
]
=
qiniu_img_list
res_data
[
"content"
]
=
content
if
res_data
[
"platform"
]
==
"weibo"
:
res_data
[
"content"
]
=
gm_convert_html_tags
(
res_data
[
"title"
],
all_tags
=
True
)
res_data
[
"title"
]
=
""
...
...
@@ -297,17 +303,15 @@ img_type = {
def
write_data_into_mysql
(
res_data
):
now_str
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
data
=
push_data_to_user
(
res_data
)
if
data
.
get
(
"title"
):
sql_query
=
"""insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');"""
.
format
(
user_id
=
random
.
choice
(
user_id_list
),
content
=
data
[
"content"
],
is_online
=
1
,
status
=
2
,
platform
=
3
,
content_level
=
data
[
"level"
],
is_excellent
=
0
,
create_time
=
now_str
,
last_modified
=
now_str
,
user_del
=
0
,
low_quality
=
0
,
low_quality_deal
=
0
,
platform_id
=
data
[
"doc_id"
],
pgc_type
=
1
,
title
=
data
[
"title"
])
try
:
sql_query
=
"""insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');"""
.
format
(
user_id
=
random
.
choice
(
user_id_list
),
content
=
data
[
"content"
],
is_online
=
1
,
status
=
2
,
platform
=
3
,
content_level
=
data
[
"level"
],
is_excellent
=
0
,
create_time
=
now_str
,
last_modified
=
now_str
,
user_del
=
0
,
low_quality
=
0
,
low_quality_deal
=
0
,
platform_id
=
data
[
"doc_id"
],
pgc_type
=
1
,
title
=
data
[
"title"
])
res
=
cur
.
execute
(
sql_query
)
tractate_id
=
int
(
conn
.
insert_id
())
if
res
:
...
...
@@ -322,15 +326,38 @@ def write_data_into_mysql(res_data):
else
:
image_url_source
=
1
try
:
image_type
=
img_type
.
get
(
img_info
[
0
]
.
split
(
"."
)[
-
1
]
.
upper
())
image_type
=
img_type
.
get
(
img_info
[
1
][
"format"
]
.
upper
())
except
:
image_type
=
1
sql_query
=
"""
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,image_webp,create_time,update_time)
values ({tractate_id},{image_url},{width},{height},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
img_info
[
1
][
"width"
],
height
=
img_info
[
1
][
"heigh"
],
image_url_source
=
image_url_source
,
image_type
=
image_type
,
create_time
=
now_str
,
update_time
=
now_str
)
try
:
width
=
img_info
[
1
][
"width"
]
height
=
img_info
[
1
][
"height"
]
except
:
width
=
0
height
=
0
try
:
if
img_type
==
7
:
sql_query
=
"""
insert into api_tractate_images (tractate_id,image_url,width,image_webp,height,image_url_source,image_type,image_webp,create_time,update_time)
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
height
=
height
,
image_url_source
=
image_url_source
,
image_type
=
image_type
,
image_webp
=
img_info
[
0
],
create_time
=
now_str
,
update_time
=
now_str
)
else
:
sql_query
=
"""
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,create_time,update_time)
values ({tractate_id},'{image_url}',{width},{height},{image_url_source},{image_type},'{create_time}','{update_time}')
"""
.
format
(
tractate_id
=
tractate_id
,
image_url
=
img_info
[
0
],
width
=
width
,
height
=
height
,
image_url_source
=
image_url_source
,
image_type
=
image_type
,
create_time
=
now_str
,
update_time
=
now_str
)
res
=
cur
.
execute
(
sql_query
)
if
res
:
conn
.
commit
()
except
Exception
as
e
:
print
(
"commit error
%
s"
%
e
)
conn
.
rollback
()
def
task_main
():
...
...
@@ -344,7 +371,8 @@ def task_main():
# 循环处理抓取数据,返回需要添加至后端的数据
for
res_data
in
scan_from_redis
(
rules_list
):
write_data_into_mysql
(
res_data
)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
cur
.
close
()
conn
.
close
()
...
...
test/test_read_config.py
View file @
044201c0
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment