update upload picture

b8560e3a · haowang · 8a95ff77 · b8560e3a
Commit b8560e3a authored Nov 30, 2020 by haowang
Hide whitespace changes
Inline Side-by-side

Showing with 104 additions and 18 deletions

upload_picture.py tasks/zhihu/upload_picture.py +104 -18

No files found.
--- a/tasks/zhihu/upload_picture.py
+++ b/tasks/zhihu/upload_picture.py
@@ -9,10 +9,13 @@ import cv2
 import execjs
 from datetime import datetime
 from image_qiniu import upload_file, IMG_TYPE
+from bs4 import BeautifulSoup
-DATA_OS_PATH = '/data'
+# DATA_OS_PATH = '/data'
-PROJECT_PATH = '/srv/apps/crawler'
+# PROJECT_PATH = '/srv/apps/crawler'
+# NOTE(review): the two paths below point into a single user's home
+# directory, while the commented-out values above were the previous
+# settings -- confirm this switch is intended before merging.
+DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
+PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'
 class UploadImage(object):
@@ -134,11 +137,15 @@ class UploadImage(object):
            print('upload .....  error')
            return None
-    def picture_download_and_cut(self, path, new_path, table, key_id, start_id, offset=0, count=10):
+    def picture_download_and_cut(self, path, new_path, table, key_id, content_id, content):
        '''
            文章图片剪切和下载
        '''
-        sql = """select {}, url from {} where id > {} and new_url is null limit {}, {}""".format(key_id, table, start_id, offset, count)
+        urls = self.find_all_url(content)
+        self.insert_picture_urls(table, urls, content_id, key_id)
+        sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        self.conn.commit()
@@ -177,27 +184,106 @@ class UploadImage(object):
                self.cur.execute(sql)
                self.conn.commit()
-    def picture_download_and_cut_process(self):
+    def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
+        '''
+            Process the pictures of one page of content rows.
+
+            Reads up to ``count`` rows starting at ``offset`` from ``table``
+            through ``gets_content_dict`` and calls
+            ``picture_download_and_cut`` once per row with that row's id and
+            content.
+
+            Args:
+                path: Forwarded unchanged to ``picture_download_and_cut``.
+                new_path: Forwarded unchanged to ``picture_download_and_cut``.
+                table: Table whose ``content`` column is read.
+                pic_table: Picture-URL table passed to
+                    ``picture_download_and_cut``.
+                key_id: Name of the id column; used with both tables.
+                offset: Number of rows to skip in ``table``.
+                count: Maximum number of rows to read.
+        '''
-        pass
+        content_dict = self.gets_content_dict(table, key_id, offset, count)
-        # self.picture_download_and_cut(self.ANSWER_PICTURE_PATH, self.ANSWER_PICTURE_CUT_PATH,
-        #                               'zhihu_answer_picture_url', 'answer_id')
+        for content_id, content in content_dict.items():
-        # self.picture_download_and_cut(self.ARTICLE_PICTURE_PATH, self.ARTICLE_PICTURE_CUT_PATH,
+            self.picture_download_and_cut(path, new_path, pic_table, key_id, content_id, content)
-        #                               'zhihu_article_picture_url', 'article_id')
-        # self.picture_download_and_cut(self.THOUGHT_PICTURE_PATH, self.THOUGHT_PICTURE_CUT_PATH,
+    def insert_picture_urls(self, table, urls, content_id, key_id, has_old=True):
-        #                               'zhihu_thought_picture_url', 'thought_id')
+        '''
+            Insert picture URLs for one content row into ``table``.
+
+            Args:
+                table: Name of the picture-URL table.
+                urls: Iterable of URL strings to store.
+                content_id: Id of the owning content row.
+                key_id: Name of the column in ``table`` that holds
+                    ``content_id`` (e.g. 'answer_id', 'article_id').
+                has_old: When true, URLs already stored for this
+                    ``content_id`` are skipped instead of inserted again.
+
+            Returns:
+                None. Commits on ``self.conn`` after each statement.
+        '''
+        # Values are bound as query parameters because URLs come from scraped
+        # HTML and may contain quotes. Identifiers (table / column names)
+        # cannot be bound and are still formatted in.
+        # NOTE(review): assumes a DB-API driver using the '%s' paramstyle, as
+        # the MySQL drivers do -- confirm against the connection setup.
+        def _url_exists(url_):
+            sql = """select id from {} where {} = %s and url = %s""".format(table, key_id)
+            self.cur.execute(sql, (content_id, url_))
+            found = self.cur.fetchall()
+            self.conn.commit()
+            return bool(found)
+        rows = [
+            (content_id, url)
+            for url in urls
+            if not (has_old and _url_exists(url))
+        ]
+        if not rows:
+            return
+        # The id column comes from ``key_id``; it used to be hard-coded to
+        # 'answer_id', which is wrong for the article and thought tables.
+        into = """insert into {} ({}, url) values (%s, %s)""".format(table, key_id)
+        print(into, rows)
+        self.cur.executemany(into, rows)
+        self.conn.commit()
+    def find_all_url(self, content):
+        '''
+            Collect the distinct image URLs referenced by an HTML fragment.
+
+            ``content`` is first passed through ``replace_html_image_to_url``
+            and the result is parsed again; the ``src`` of every ``<img>`` is
+            collected.
+
+            Args:
+                content: HTML markup to scan.
+
+            Returns:
+                List of unique, non-empty ``src`` values in first-seen order.
+        '''
+        new_content = self.replace_html_image_to_url(content)
+        rich_obj = BeautifulSoup(new_content, features="html.parser")
+        urls = []
+        for item in rich_obj.find_all("img"):
+            src = item.get('src')
+            if not src:
+                # An <img> with a missing or empty src has nothing to
+                # download; skipping it keeps None / '' out of the URL list.
+                continue
+            print(src)
+            urls.append(src)
+        # dict.fromkeys removes duplicates while keeping a stable order,
+        # unlike set(), whose iteration order is arbitrary.
+        return list(dict.fromkeys(urls))
+    @staticmethod
+    def replace_html_image_to_url(content):
+        '''
+            Replace every ``<figure>`` with a bare ``<img>`` carrying its src.
+
+            For each ``<figure>`` that contains an ``<img>``, the whole figure
+            is replaced by a new ``<img>`` tag whose only attribute is the
+            ``src`` of the first image found inside it ('' when that image has
+            no ``src``). Figures without any ``<img>`` are left untouched.
+
+            Args:
+                content: HTML markup; ``None`` is treated as empty markup.
+
+            Returns:
+                The rewritten markup as a string.
+        '''
+        rich_obj = BeautifulSoup(content or "", features="html.parser")
+        for item in rich_obj.find_all("figure"):
+            image_obj = item.find("img")
+            if image_obj is None:
+                # find() returns None when the figure holds no <img>;
+                # calling .get on it would raise AttributeError.
+                continue
+            new_rich_obj = rich_obj.new_tag(name="img")
+            new_rich_obj["src"] = image_obj.get("src", "")
+            item.replace_with(new_rich_obj)
+        return rich_obj.decode()
+    def gets_content_dict(self, table, key_id, offset=0, count=10):
+        '''
+            Fetch one page of rows from ``table`` as an id -> content mapping.
+
+            Args:
+                table: Table to read from; must have a ``content`` column.
+                key_id: Name of the id column selected alongside ``content``.
+                offset: Number of rows to skip.
+                count: Maximum number of rows to return.
+
+            Returns:
+                Dict mapping each row's ``key_id`` value to its ``content``.
+        '''
+        query = """select {}, content from {} limit {}, {}""".format(key_id, table, offset, count)
+        cursor = self.cur
+        cursor.execute(query)
+        rows = cursor.fetchall()
+        self.conn.commit()
+        mapping = {}
+        for row in rows:
+            mapping[row[0]] = row[1]
+        return mapping
 if __name__ == '__main__':
+    ''' 执行命令 python file_name mark offset count '''
+    # Usage: python <file> mark offset count
+    #   mark   - 0: answers, 1: articles, 2: thoughts (selects the tables
+    #            and picture paths passed to picture_process below)
+    #   offset - number of content rows to skip
+    #   count  - number of content rows to process
+    # NOTE(review): all three arguments are effectively required. A missing
+    # one raises IndexError on sys.argv before the `or` default is evaluated,
+    # and a non-numeric one raises ValueError in int(). The `or` fallback
+    # only applies when the parsed value is 0, so `count` 0 becomes 10.
     mark = int(sys.argv[1]) or 0
-    start_id = int(sys.argv[2]) or 0
+    offset = int(sys.argv[2]) or 0
-    offset = int(sys.argv[3]) or 0
+    count = int(sys.argv[3]) or 10
-    count = int(sys.argv[4]) or 10
     # The two timestamps printed around the run show when it started and ended.
     print(datetime.now())
     a = UploadImage()
     if mark == 0:
-        a.picture_download_and_cut(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH, 'zhihu_answer_picture_url', 'answer_id', start_id, offset, count)
+        a.picture_process(
+            a.ANSWER_PICTURE_PATH,
+            a.ANSWER_PICTURE_CUT_PATH,
+            'zhihu_answer',
+            'zhihu_answer_picture_url',
+            'answer_id',
+            offset,
+            count
+        )
     if mark == 1:
-        a.picture_download_and_cut(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH, 'zhihu_article_picture_url', 'article_id', start_id, offset, count)
+        a.picture_process(
+            a.ARTICLE_PICTURE_PATH,
+            a.ARTICLE_PICTURE_CUT_PATH,
+            'zhihu_article',
+            'zhihu_article_picture_url',
+            'article_id',
+            offset,
+            count
+        )
     if mark == 2:
-        a.picture_download_and_cut(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH, 'zhihu_thought_picture_url', 'thought_id', start_id, offset, count)
+        a.picture_process(
+            a.THOUGHT_PICTURE_PATH,
+            a.THOUGHT_PICTURE_CUT_PATH,
+            'zhihu_thought',
+            'zhihu_thought_picture_url',
+            'thought_id',
+            offset,
+            count)
     print(datetime.now())