Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chengyang Zhong
crawler
Commits
b8560e3a
Commit
b8560e3a
authored
Nov 30, 2020
by
haowang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update upload picture
parent
8a95ff77
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
104 additions
and
18 deletions
+104
-18
upload_picture.py
tasks/zhihu/upload_picture.py
+104
-18
No files found.
tasks/zhihu/upload_picture.py
View file @
b8560e3a
...
...
@@ -9,10 +9,13 @@ import cv2
import
execjs
from
datetime
import
datetime
from
image_qiniu
import
upload_file
,
IMG_TYPE
from
bs4
import
BeautifulSoup
DATA_OS_PATH
=
'/data'
PROJECT_PATH
=
'/srv/apps/crawler'
# DATA_OS_PATH = '/data'
# PROJECT_PATH = '/srv/apps/crawler'
DATA_OS_PATH
=
'/Users/haowei/workspace/gm/crawler/image'
PROJECT_PATH
=
'/Users/haowei/workspace/gm/crawler'
class
UploadImage
(
object
):
...
...
@@ -134,11 +137,15 @@ class UploadImage(object):
print
(
'upload ..... error'
)
return
None
def
picture_download_and_cut
(
self
,
path
,
new_path
,
table
,
key_id
,
start_id
,
offset
=
0
,
count
=
10
):
def
picture_download_and_cut
(
self
,
path
,
new_path
,
table
,
key_id
,
content_id
,
content
):
'''
文章图片剪切和下载
'''
sql
=
"""select {}, url from {} where id > {} and new_url is null limit {}, {}"""
.
format
(
key_id
,
table
,
start_id
,
offset
,
count
)
urls
=
self
.
find_all_url
(
content
)
self
.
insert_picture_urls
(
table
,
urls
,
content_id
,
key_id
)
sql
=
"""select {}, url from {} where {} = {} and new_url is null"""
.
format
(
key_id
,
table
,
key_id
,
content_id
)
self
.
cur
.
execute
(
sql
)
tuple
=
self
.
cur
.
fetchall
()
self
.
conn
.
commit
()
...
...
@@ -177,27 +184,106 @@ class UploadImage(object):
self
.
cur
.
execute
(
sql
)
self
.
conn
.
commit
()
def picture_download_and_cut_process(self):
    """Placeholder entry point that currently performs no work.

    The body used to list calls to ``picture_download_and_cut`` for the
    answer, article and thought picture tables; those calls were only
    present as disabled comments, so invoking this method has no effect.

    Returns:
        None.
    """
    return None
def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
    """Run picture download/cut for one page of content rows.

    Args:
        path: passed through unchanged to ``picture_download_and_cut``.
        new_path: passed through unchanged to ``picture_download_and_cut``.
        table: content table handed to ``gets_content_dict``.
        pic_table: picture-url table handed to ``picture_download_and_cut``.
        key_id: name of the id column, used for both tables.
        offset: row offset of the page to load.
        count: number of rows in the page.

    Returns:
        None.
    """
    # One page of {content id: content} pairs from the content table.
    page = self.gets_content_dict(table, key_id, offset, count)
    for content_id in page:
        self.picture_download_and_cut(
            path, new_path, pic_table, key_id, content_id, page[content_id]
        )
def insert_picture_urls(self, table, urls, content_id, key_id, has_old=True):
    """Insert the picture URLs of one content row into ``table``.

    Args:
        table: name of the picture-url table to insert into.
        urls: iterable of image URLs found in the content; empty or
            ``None`` entries are ignored.
        content_id: value written to the ``key_id`` column of every new row.
        key_id: name of the column linking a picture row to its content
            (callers in this file pass 'answer_id', 'article_id' or
            'thought_id').
        has_old: when true, each URL is looked up first and skipped if a
            row with the same ``key_id`` value and URL already exists.

    Returns:
        None.
    """
    # NOTE(review): values are bound with ``%s`` placeholders, which assumes
    # ``self.cur`` is a DB-API cursor using the 'format' paramstyle (as
    # PyMySQL / MySQLdb do) -- confirm against the connection created in
    # ``__init__``. Table and column names cannot be bound as parameters,
    # so they are still formatted into the statement.

    def _is_new_url(instance, url_):
        # True when no row for this content/url pair exists yet.
        sql = """select id from {} where {} = %s and url = %s""".format(table, key_id)
        instance.cur.execute(sql, (content_id, url_))
        res = instance.cur.fetchall()
        instance.conn.commit()
        return not res

    params = []
    row_count = 0
    for url in urls:
        if not url:
            # An <img> without a usable src yields nothing to store.
            continue
        if has_old and not _is_new_url(self, url):
            continue
        params.extend((content_id, url))
        row_count += 1

    if row_count:
        placeholders = ','.join(['(%s, %s)'] * row_count)
        # The link column is ``key_id``; it used to be hard-coded to
        # ``answer_id``, which is wrong for the article and thought tables.
        into = """insert into {} ({}, url) values {}""".format(table, key_id, placeholders)
        print(into, params)
        self.cur.execute(into, params)
        self.conn.commit()
def find_all_url(self, content):
    """Return the distinct image URLs referenced by ``content``.

    The HTML is first normalised with ``replace_html_image_to_url`` and
    then every ``<img>`` tag is inspected.

    Args:
        content: HTML fragment to scan.

    Returns:
        A list of unique ``src`` values. ``<img>`` tags without a ``src``
        (or with an empty one) are left out, so callers never receive
        ``None`` entries.
    """
    new_content = self.replace_html_image_to_url(content)
    rich_obj = BeautifulSoup(new_content, features="html.parser")
    urls = []
    for item in rich_obj.find_all("img"):
        src = item.get('src')
        print(src)
        if src:
            urls.append(src)
    # dict.fromkeys de-duplicates like set() did, but keeps document order
    # on Python 3.7+, making the result deterministic.
    return list(dict.fromkeys(urls))
@staticmethod
def replace_html_image_to_url(content):
    """Replace each ``<figure>`` in ``content`` with a bare ``<img>`` tag.

    The replacement tag carries only the ``src`` of the first ``<img>``
    found inside the figure (an empty string when that image has no
    ``src``).

    Args:
        content: HTML fragment; ``None`` is treated as an empty document.

    Returns:
        The rewritten HTML as a string.
    """
    # ``content or ""`` keeps a missing content value from raising inside
    # BeautifulSoup.
    rich_obj = BeautifulSoup(content or "", features="html.parser")
    for item in rich_obj.find_all("figure"):
        image_obj = item.find("img")
        if image_obj is None:
            # A figure without an image has nothing to convert; leave it
            # untouched instead of failing with AttributeError.
            continue
        new_rich_obj = rich_obj.new_tag(name="img")
        new_rich_obj["src"] = image_obj.get("src", "")
        item.replace_with(new_rich_obj)
    return rich_obj.decode()
def gets_content_dict(self, table, key_id, offset=0, count=10):
    """Load one page of rows from ``table`` as an id -> content mapping.

    Args:
        table: name of the content table to read.
        key_id: name of the id column selected alongside ``content``.
        offset: number of rows to skip.
        count: maximum number of rows to return.

    Returns:
        dict mapping the first selected column of each row to the second.
    """
    # NOTE(review): the query has no ORDER BY, so which rows land in a given
    # offset/count window depends on the database -- confirm this is intended.
    query = """select {}, content from {} limit {}, {}""".format(key_id, table, offset, count)
    cursor = self.cur
    cursor.execute(query)
    rows = cursor.fetchall()
    self.conn.commit()

    mapping = {}
    for row in rows:
        mapping[row[0]] = row[1]
    return mapping
if __name__ == '__main__':
    # Usage: python file_name mark offset count
    #   mark   - selects the content type: 0 answers, 1 articles, 2 thoughts
    #   offset - row offset of the page to process (default 0)
    #   count  - number of rows to process (default 10; 0 also means 10)
    #
    # Missing arguments fall back to their defaults instead of raising
    # IndexError. The stale ``start_id`` argument and the direct
    # ``picture_download_and_cut`` calls were dropped: that method now takes
    # (path, new_path, table, key_id, content_id, content) and is driven by
    # ``picture_process``.
    cli_args = sys.argv[1:]
    mark = int(cli_args[0]) if len(cli_args) > 0 else 0
    offset = int(cli_args[1]) if len(cli_args) > 1 else 0
    count = (int(cli_args[2]) if len(cli_args) > 2 else 0) or 10

    print(datetime.now())
    a = UploadImage()
    if mark == 0:
        a.picture_process(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH,
                          'zhihu_answer', 'zhihu_answer_picture_url',
                          'answer_id', offset, count)
    elif mark == 1:
        a.picture_process(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH,
                          'zhihu_article', 'zhihu_article_picture_url',
                          'article_id', offset, count)
    elif mark == 2:
        a.picture_process(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH,
                          'zhihu_thought', 'zhihu_thought_picture_url',
                          'thought_id', offset, count)
    print(datetime.now())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment