Commit b8560e3a authored by haowang

update upload picture

parent 8a95ff77
......@@ -9,10 +9,13 @@ import cv2
import execjs
from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
# Root directories used by the crawler. Both names are assigned twice below;
# the later assignment is the one in effect once the module has loaded.
DATA_OS_PATH = '/data'
PROJECT_PATH = '/srv/apps/crawler'
# DATA_OS_PATH = '/data'
# PROJECT_PATH = '/srv/apps/crawler'
# NOTE(review): the two values below are paths under a user's home directory
# and override the '/data' and '/srv/apps/crawler' values above — confirm this
# is intended before deploying.
DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'
class UploadImage(object):
......@@ -134,11 +137,15 @@ class UploadImage(object):
print('upload ..... error')
return None
def picture_download_and_cut(self, path, new_path, table, key_id, start_id, offset=0, count=10):
def picture_download_and_cut(self, path, new_path, table, key_id, content_id, content):
'''
文章图片剪切和下载
'''
sql = """select {}, url from {} where id > {} and new_url is null limit {}, {}""".format(key_id, table, start_id, offset, count)
urls = self.find_all_url(content)
self.insert_picture_urls(table, urls, content_id, key_id)
sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
......@@ -177,27 +184,106 @@ class UploadImage(object):
self.cur.execute(sql)
self.conn.commit()
def picture_download_and_cut_process(self):
    """Do nothing; kept as a placeholder entry point.

    The per-table ``picture_download_and_cut`` calls that used to live here
    (answer, article and thought picture tables) are disabled, so calling
    this method has no effect and it returns ``None``.
    """
    return None
def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
    """Process the pictures of one page of content rows.

    Loads ``count`` rows starting at ``offset`` from ``table`` through
    ``gets_content_dict`` and hands every ``(id, content)`` pair to
    ``picture_download_and_cut`` together with the picture table name.

    :param path: forwarded unchanged to ``picture_download_and_cut``.
    :param new_path: forwarded unchanged to ``picture_download_and_cut``.
    :param table: content table that is read.
    :param pic_table: picture-URL table passed on for each row.
    :param key_id: name of the id column used in both tables.
    :param offset: index of the first content row to read.
    :param count: number of content rows to read.
    """
    page = self.gets_content_dict(table, key_id, offset, count)
    for row_id, body in page.items():
        self.picture_download_and_cut(
            path, new_path, pic_table, key_id, row_id, body
        )
def insert_picture_urls(self, table, urls, content_id, key_id, has_old=True):
    """Insert the picture URLs of one content row into ``table``.

    :param table: picture-URL table to write to.
    :param urls: iterable of picture URLs found in the content.
    :param content_id: id of the content row the pictures belong to.
    :param key_id: name of the column in ``table`` that stores ``content_id``.
    :param has_old: when true, a URL already stored for ``content_id`` is
        skipped; when false, every URL is inserted without a lookup.

    All new URLs are written with a single multi-row INSERT; nothing is
    executed when there is nothing to insert.
    """
    # NOTE(review): the statements are built by string formatting, so a URL
    # containing a quote breaks (or alters) the SQL. Switch to the driver's
    # parameter binding once the DB-API paramstyle of self.cur is confirmed.

    def _is_new_url(candidate):
        # True when no row exists yet for this content id and URL.
        lookup = """select id from {} where {} = {} and url = '{}'""".format(
            table, key_id, content_id, candidate)
        self.cur.execute(lookup)
        found = self.cur.fetchall()
        self.conn.commit()
        return not found

    values = [
        "({}, '{}')".format(content_id, url)
        for url in urls
        if not has_old or _is_new_url(url)
    ]
    if not values:
        return

    # The id column is key_id (answer_id / article_id / thought_id ...), the
    # same column the lookup above filters on, not a fixed 'answer_id'.
    into = """insert into {} ({}, url) values {}""".format(table, key_id, ','.join(values))
    print(into)
    self.cur.execute(into)
    self.conn.commit()
def find_all_url(self, content):
    """Return the distinct image URLs referenced by an HTML fragment.

    ``content`` is first passed through ``replace_html_image_to_url`` and
    the result is parsed again; the ``src`` of every ``<img>`` is printed
    and collected.

    :param content: HTML markup of one content row.
    :return: list of unique, non-empty ``src`` values in order of first
        appearance.
    """
    flattened = self.replace_html_image_to_url(content)
    soup = BeautifulSoup(flattened, features="html.parser")
    urls = []
    for img in soup.find_all("img"):
        src = img.get('src')
        print(src)
        # An <img> without a usable src (None, or the "" placeholder written
        # by replace_html_image_to_url) is not a URL and must not be stored.
        if src:
            urls.append(src)
    # dict.fromkeys de-duplicates while keeping a stable order.
    return list(dict.fromkeys(urls))
@staticmethod
def replace_html_image_to_url(content):
    """Replace every ``<figure>`` in ``content`` with a bare ``<img>`` tag.

    The new ``<img>`` carries only the ``src`` of the first ``<img>`` found
    inside the figure (``""`` when that image has no ``src``). A figure that
    contains no ``<img>`` at all is left unchanged.

    :param content: HTML markup to rewrite.
    :return: the rewritten markup as a string.
    """
    soup = BeautifulSoup(content, features="html.parser")
    for figure in soup.find_all("figure"):
        image = figure.find("img")
        if image is None:
            # find() returns None when there is no match; there is nothing
            # to extract from this figure, so keep it as it is.
            continue
        bare_img = soup.new_tag(name="img")
        bare_img["src"] = image.get("src", "")
        figure.replace_with(bare_img)
    return soup.decode()
def gets_content_dict(self, table, key_id, offset=0, count=10):
    """Read one page of content rows from ``table``.

    :param table: content table to read.
    :param key_id: name of the id column selected alongside ``content``.
    :param offset: number of rows to skip.
    :param count: maximum number of rows to return.
    :return: dict mapping each row's ``key_id`` value to its ``content``.
    """
    # Rows are ordered by the id column so that successive (offset, count)
    # pages are stable; LIMIT without ORDER BY gives no ordering guarantee.
    sql = """select {}, content from {} order by {} limit {}, {}""".format(
        key_id, table, key_id, offset, count)
    self.cur.execute(sql)
    rows = self.cur.fetchall()
    self.conn.commit()
    return {row[0]: row[1] for row in rows}
if __name__ == '__main__':
    # Usage: python file_name mark [offset] [count]
    #   mark   - 0: answers, 1: articles, 2: thoughts (other values do nothing)
    #   offset - number of content rows to skip (default 0)
    #   count  - number of content rows to process (default 10; 0 means 10)

    def _cli_int(position, default):
        """Return sys.argv[position] as an int, or default if absent or zero."""
        if len(sys.argv) <= position:
            return default
        return int(sys.argv[position]) or default

    mark = _cli_int(1, 0)
    offset = _cli_int(2, 0)
    count = _cli_int(3, 10)

    print(datetime.now())
    a = UploadImage()
    # picture_process is the only entry point used here: it loads the content
    # rows and supplies the content id and body that picture_download_and_cut
    # requires for each of them.
    if mark == 0:
        a.picture_process(
            a.ANSWER_PICTURE_PATH,
            a.ANSWER_PICTURE_CUT_PATH,
            'zhihu_answer',
            'zhihu_answer_picture_url',
            'answer_id',
            offset,
            count
        )
    elif mark == 1:
        a.picture_process(
            a.ARTICLE_PICTURE_PATH,
            a.ARTICLE_PICTURE_CUT_PATH,
            'zhihu_article',
            'zhihu_article_picture_url',
            'article_id',
            offset,
            count
        )
    elif mark == 2:
        a.picture_process(
            a.THOUGHT_PICTURE_PATH,
            a.THOUGHT_PICTURE_CUT_PATH,
            'zhihu_thought',
            'zhihu_thought_picture_url',
            'thought_id',
            offset,
            count
        )
    print(datetime.now())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment