Commit b8560e3a authored by haowang

update upload picture

parent 8a95ff77
...@@ -9,10 +9,13 @@ import cv2 ...@@ -9,10 +9,13 @@ import cv2
import execjs import execjs
from datetime import datetime from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
DATA_OS_PATH = '/data' # DATA_OS_PATH = '/data'
PROJECT_PATH = '/srv/apps/crawler' # PROJECT_PATH = '/srv/apps/crawler'
# NOTE(review): these look like paths into one developer's local checkout; the
# values '/data' and '/srv/apps/crawler' are commented out just above. Confirm
# which pair is meant to be active before running this on another machine.
DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'
class UploadImage(object): class UploadImage(object):
...@@ -134,11 +137,15 @@ class UploadImage(object): ...@@ -134,11 +137,15 @@ class UploadImage(object):
print('upload ..... error') print('upload ..... error')
return None return None
def picture_download_and_cut(self, path, new_path, table, key_id, start_id, offset=0, count=10): def picture_download_and_cut(self, path, new_path, table, key_id, content_id, content):
''' '''
文章图片剪切和下载 文章图片剪切和下载
''' '''
sql = """select {}, url from {} where id > {} and new_url is null limit {}, {}""".format(key_id, table, start_id, offset, count)
urls = self.find_all_url(content)
self.insert_picture_urls(table, urls, content_id, key_id)
sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
...@@ -177,27 +184,106 @@ class UploadImage(object): ...@@ -177,27 +184,106 @@ class UploadImage(object):
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
def picture_download_and_cut_process(self): def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
pass content_dict = self.gets_content_dict(table, key_id, offset, count)
# self.picture_download_and_cut(self.ANSWER_PICTURE_PATH, self.ANSWER_PICTURE_CUT_PATH,
# 'zhihu_answer_picture_url', 'answer_id') for content_id, content in content_dict.items():
# self.picture_download_and_cut(self.ARTICLE_PICTURE_PATH, self.ARTICLE_PICTURE_CUT_PATH, self.picture_download_and_cut(path, new_path, pic_table, key_id, content_id, content)
# 'zhihu_article_picture_url', 'article_id')
# self.picture_download_and_cut(self.THOUGHT_PICTURE_PATH, self.THOUGHT_PICTURE_CUT_PATH,
# 'zhihu_thought_picture_url', 'thought_id')
def insert_picture_urls(self, table, urls, content_id, key_id, has_old=True):
    '''
    Insert the picture URLs of one content row into ``table``.

    Args:
        table: name of the picture-URL table to insert into.
        urls: iterable of image URLs extracted from the content.
        content_id: value written to the ``key_id`` column of every new row.
        key_id: name of the column that links a picture row to its content
            (the callers in this file pass 'answer_id', 'article_id' or
            'thought_id').
        has_old: when true, URLs already stored for ``content_id`` are
            skipped; when false, every URL is inserted without checking.

    Returns:
        None. Rows are written and committed through ``self.cur`` and
        ``self.conn``.
    '''
    # NOTE(review): identifiers (table / column names) cannot be bound as query
    # parameters, and the DB driver's paramstyle is not visible from this
    # block, so the statements are still string-built. The URLs come from
    # scraped HTML; switch the values to bound parameters once the driver is
    # confirmed.
    existing = set()
    if has_old:
        # One lookup per content row instead of one query per URL. The
        # comparison below is an exact string match done in Python.
        sql = """select url from {} where {} = {}""".format(table, key_id, content_id)
        self.cur.execute(sql)
        existing = {row[0] for row in self.cur.fetchall()}
        self.conn.commit()

    values = []
    for url in urls:
        # Skip missing/empty URLs (they would be stored as 'None' or '') and
        # URLs that are already recorded for this content row.
        if not url or url in existing:
            continue
        values.append("({}, '{}')".format(content_id, url))

    if values:
        # The link column is ``key_id``; it used to be hard-coded to
        # ``answer_id``, which is wrong for the article and thought tables.
        into = """insert into {} ({}, url) values {}""".format(table, key_id, ','.join(values))
        print(into)
        self.cur.execute(into)
        self.conn.commit()
def find_all_url(self, content):
    '''
    Collect the distinct image URLs referenced by an HTML fragment.

    Every ``<figure>`` is first collapsed to a plain ``<img>`` via
    ``replace_html_image_to_url``; then the ``src`` of each ``<img>`` is
    gathered.

    Args:
        content: HTML string; empty or None content yields an empty list.

    Returns:
        List of unique, non-empty ``src`` values in first-seen order.
    '''
    if not content:
        # Nothing to parse (e.g. a NULL content column).
        return []
    new_content = self.replace_html_image_to_url(content)
    rich_obj = BeautifulSoup(new_content, features="html.parser")
    urls = []
    for item in rich_obj.find_all("img"):
        src = item.get('src')
        print(src)
        # An <img> without a src attribute yields None; do not report it.
        if src:
            urls.append(src)
    # dict.fromkeys removes duplicates while keeping a stable order,
    # unlike list(set(...)).
    return list(dict.fromkeys(urls))
@staticmethod
def replace_html_image_to_url(content):
    '''
    Replace every ``<figure>`` in ``content`` with a bare ``<img>`` tag.

    The new ``<img>`` carries only the ``src`` of the first ``<img>`` found
    inside the figure (empty string when that image has no ``src``).
    Figures that contain no ``<img>`` are left untouched.

    Args:
        content: HTML string; empty or None content yields ''.

    Returns:
        The rewritten markup as a string.
    '''
    if not content:
        return ""
    rich_obj = BeautifulSoup(content, features="html.parser")
    for item in rich_obj.find_all("figure"):
        image_obj = item.find("img")
        if image_obj is None:
            # A figure without an image would otherwise raise
            # AttributeError on image_obj.get below.
            continue
        new_rich_obj = rich_obj.new_tag(name="img")
        new_rich_obj["src"] = image_obj.get("src", "")
        item.replace_with(new_rich_obj)
    return rich_obj.decode()
def gets_content_dict(self, table, key_id, offset=0, count=10):
    '''
    Read one page of rows from ``table`` and map each row's ``key_id``
    value to its ``content`` value.

    Args:
        table: name of the table to read.
        key_id: name of the column used as the dictionary key.
        offset: first argument of the SQL ``limit`` clause.
        count: second argument of the SQL ``limit`` clause.

    Returns:
        dict of ``{key_id value: content value}`` for the fetched rows.
    '''
    # NOTE(review): the query has no ORDER BY, so which rows land in a given
    # offset/count window is up to the database - confirm this is acceptable.
    query = """select {}, content from {} limit {}, {}""".format(key_id, table, offset, count)
    self.cur.execute(query)
    rows = self.cur.fetchall()
    self.conn.commit()
    return dict((row[0], row[1]) for row in rows)
if __name__ == '__main__': if __name__ == '__main__':
''' 执行命令 python file_name mark offset count '''
mark = int(sys.argv[1]) or 0 mark = int(sys.argv[1]) or 0
start_id = int(sys.argv[2]) or 0 offset = int(sys.argv[2]) or 0
offset = int(sys.argv[3]) or 0 count = int(sys.argv[3]) or 10
count = int(sys.argv[4]) or 10
print(datetime.now()) print(datetime.now())
a = UploadImage() a = UploadImage()
if mark == 0: if mark == 0:
a.picture_download_and_cut(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH, 'zhihu_answer_picture_url', 'answer_id', start_id, offset, count) a.picture_process(
a.ANSWER_PICTURE_PATH,
a.ANSWER_PICTURE_CUT_PATH,
'zhihu_answer',
'zhihu_answer_picture_url',
'answer_id',
offset,
count
)
if mark == 1: if mark == 1:
a.picture_download_and_cut(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH, 'zhihu_article_picture_url', 'article_id', start_id, offset, count) a.picture_process(
a.ARTICLE_PICTURE_PATH,
a.ARTICLE_PICTURE_CUT_PATH,
'zhihu_article',
'zhihu_article_picture_url',
'article_id',
offset,
count
)
if mark == 2: if mark == 2:
a.picture_download_and_cut(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH, 'zhihu_thought_picture_url', 'thought_id', start_id, offset, count) a.picture_process(
a.THOUGHT_PICTURE_PATH,
a.THOUGHT_PICTURE_CUT_PATH,
'zhihu_thought',
'zhihu_thought_picture_url',
'thought_id',
offset,
count)
print(datetime.now()) print(datetime.now())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment