Commit 3bd3a181 authored by zhangmeng

Organize the Zhihu crawler code

parent dc9ebe26
@@ -20,4 +20,10 @@ python /srv/apps/crawler/crawler_sys/framework/search_page_single_process.py
# Xiaohongshu crawl process
1. Pull the code from the litao branch on GitHub onto the server spider-prod-001.
2. Set up the environment: activate it -> python -> import sys -> sys.path -> cd into the site-packages directory it lists -> vim mypath.pth -> set it to your own project path, so the crawler package is found at runtime (see the sketch after this list).
3. To export the Xiaohongshu data as a txt file, use the script in the temfile directory of maintenance, replacing the email and password with your own.
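
A minimal sketch of the mypath.pth step (the project path /srv/apps/crawler is an assumption; use your own checkout location):

```python
# Run inside the activated environment to locate its site-packages directory:
import sys
print([p for p in sys.path if p.endswith("site-packages")])

# Then create mypath.pth in that directory with a single line holding your
# project root, e.g. (assumed path):
#     /srv/apps/crawler
# Python appends every path listed in *.pth files to sys.path at startup,
# so imports of the crawler package resolve without PYTHONPATH tweaks.
```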
## Zhihu crawl
1. Deploy the project on spider-001-prod.
2. Run the spider script to crawl a specific profile: python tasks/zhihu/spider.py 0 119 58 'https://www.zhihu.com/people/zhaotianqiang'
3. Run the script that cleans the images and stores them in the DB: python tasks/zhihu/upload_picture.py 0 0 84297 0 (the positional arguments are sketched after this list).
4. In the mimas project, run python django_manage.py qa_insert_by_spider level=0 offset=0 count=322784 to import the data into your own database.
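
The positional arguments above are not documented in the README; judging from the refresh script included in this commit, they likely follow this pattern (a sketch, not confirmed for spider.py or upload_picture.py):

```python
import sys

# Hypothetical mapping, mirroring the __main__ block of the refresh script
# in this commit:
mark = int(sys.argv[1])    # content type: 0 = answers, 1 = articles, 2 = thoughts
offset = int(sys.argv[2])  # row offset to start from
count = int(sys.argv[3])   # number of rows to process
is_online = int(sys.argv[4]) if len(sys.argv) > 4 else 0
```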
# coding=utf-8
import os
import sys
from datetime import datetime

import execjs
import pymysql
from bs4 import BeautifulSoup

try:
    from pymysql import escape_string  # pymysql < 1.0
except ImportError:
    from pymysql.converters import escape_string  # pymysql >= 1.0

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'


class RefreshContent(object):
def __init__(self, is_online=0):
        '''
        Initialize the DB connection and compile the JS helper.
        '''
self.update_error_content_id = []
self.update_error_url_content_id = {}
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
db=DB, charset='utf8')
self.cur = self.conn.cursor()
os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (IOError, OSError):
            # Fall back to the absolute path used on the server.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.is_online = is_online

    @staticmethod
def replace_html_image_to_url(content):
rich_obj = BeautifulSoup(content, features="html.parser")
for item in rich_obj.find_all("figure"):
image_obj = item.find("img")
new_rich_obj = rich_obj.new_tag(name="img")
new_rich_obj["src"] = image_obj.get("src", "")
item.replace_with(new_rich_obj)
return rich_obj.decode()
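    # Illustrative example of what replace_html_image_to_url does
    # (assumed input shape):
    #   <figure><img src="https://pic1.zhimg.com/abc.jpg"></figure>
    # becomes a bare image tag:
    #   <img src="https://pic1.zhimg.com/abc.jpg">
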
def create_new_content(self, content_id, content, pic_dict):
content = self.replace_html_image_to_url(content)
rich_obj = BeautifulSoup(content, features="html.parser")
update_error = False
        for item in rich_obj.find_all("img"):
            # [23:] strips the scheme+host prefix, e.g. 'https://pic1.zhimg.com/'
            # (23 chars), so the key matches pic_dict built in refresh_content.
            url = item.get("src", "")[23:]
new_url = pic_dict.get(url)
if not new_url:
if content_id not in self.update_error_content_id:
self.update_error_content_id.append(content_id)
self.update_error_url_content_id[url] = content_id
print({content_id: url})
update_error = True
continue
item['src'] = new_url + '-w'
        new_content = rich_obj.decode()
return escape_string(new_content), update_error

    def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
        if offset == 0:
            # No offset: every distinct id that has picture rows.
            sql = """select distinct {} from {}""".format(key_id, pic_table)
        else:
            # Otherwise page through the content table itself.
            sql = """select {} from {} limit {}, {}""".format(key_id, table, offset, count)
        print(sql)
        self.cur.execute(sql)
        res = self.cur.fetchall()
        self.conn.commit()
        if res:
            return [item[0] for item in res]
        return None
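    # Example of the SQL get_all_content_ids builds (tables as wired up in
    # the __main__ block below):
    #   offset == 0 -> select distinct answer_id from zhihu_answer_picture_url
    #   offset > 0  -> select answer_id from zhihu_answer limit 100, 10
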
def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
        '''
        Replace image URLs and write the updated content back.
        '''
        content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count)
        for content_id in content_ids or []:  # may be None when nothing matches
print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
sql = """select content from {} where {} = {} and is_new = 0 and is_online = {}""".format(table, key_id, content_id, self.is_online)
print(sql)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
if not res:
continue
content = res[0][0]
sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(
pic_table, key_id, content_id)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
            # Key by url minus its 23-char scheme+host prefix, matching the
            # slicing in create_new_content.
            pic_dict = {item[0][23:]: item[1] for item in res}
new_content, update_error = self.create_new_content(content_id, content, pic_dict)
update_code = 1 if not update_error else 0
sql = """update {} set new_content = '{}', is_new = {} WHERE {} = '{}' """.format(
table, new_content, update_code, key_id, content_id)
self.cur.execute(sql)
self.conn.commit()
print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())


if __name__ == '__main__':
    """ python script_file mark offset count [is_online] """
    print('Number of arguments:', len(sys.argv))
    print('Argument list:', sys.argv)
    mark = int(sys.argv[1])
    offset = int(sys.argv[2])
    count = int(sys.argv[3])
    is_online = int(sys.argv[4]) if len(sys.argv) > 4 else 0
print(datetime.now())
refresh = RefreshContent(is_online=is_online)
if mark == 0:
refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
elif mark == 1:
refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
elif mark == 2:
refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
print('update_error_content_ids : ', refresh.update_error_content_id)
print(datetime.now())
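
# Example invocation, mirroring the README in this commit
# (args: mark offset count is_online; the file name of this script is not
# shown in the diff):
#   python tasks/zhihu/<this_script>.py 0 0 84297 0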
import os
import time
from datetime import datetime


if __name__ == '__main__':
    # Each plan is one complete shell command crawling a single Zhihu profile.
    plans = [
#"python tasks/zhihu/spider_self.py 0 169 0 'https://www.zhihu.com/people/bai-fu-mei-yan-jiu-zhong-xin' 1",
"python tasks/zhihu/spider_self.py 0 169 66 'https://www.zhihu.com/people/geng-mei-suo-chang' 1",
]
for plan in plans:
print('start plan ', plan, ' at ', datetime.now())
os.system(plan)
print('end plan ', plan, ' at ', datetime.now())
        time.sleep(10)  # pause between plans
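
# A minimal alternative sketch (assuming the same plan strings): subprocess.run
# exposes each command's exit code, which os.system buries in a platform-
# dependent return value.
#
#   import subprocess
#   for plan in plans:
#       result = subprocess.run(plan, shell=True)
#       if result.returncode != 0:
#           print('plan failed:', plan, 'exit code', result.returncode)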