Commit b1359e35 authored by 向万

add notice

parent d5e464a2
......@@ -24,6 +24,7 @@ from pymysql import escape_string
class Crawler_zhihu():
#初始化数据库,调整js规则
def __init__(self):
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1',
......@@ -122,6 +123,7 @@ class Crawler_zhihu():
# print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
#cookies更新
def get_serach_page_cookies(self):
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = {
......@@ -141,6 +143,7 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict()
#插入主要内容数据和图片的url,寻找评论
def parse_sigle_page(self, data_dict, mark):
if mark == 0:
......@@ -172,6 +175,7 @@ class Crawler_zhihu():
return
#函数主入口
def search_page(self, answer_page_max, article_page_max, thought_page_max):
offset = 0
......@@ -192,12 +196,13 @@ class Crawler_zhihu():
self.answer_picture_doneload_and_cut()
self.answer_refresh_content()
#self.article_picture_doneload_and_cut()
#self.article_refresh_content()
self.article_picture_doneload_and_cut()
self.article_refresh_content()
self.answer_data_complex()
self.conn.close()
return
#实现文章和回答的数据包请求
def search_answer_article_page(self, offset, mark, proxies_num=0):
offset = str(offset)
......@@ -231,6 +236,7 @@ class Crawler_zhihu():
return
#实现父评论的数据包请求
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
......@@ -264,6 +270,7 @@ class Crawler_zhihu():
return next
#插入父评论相关信息并关联子评论
def root_comment_data(self, data_dict, answerid, mark):
if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
......@@ -282,6 +289,7 @@ class Crawler_zhihu():
return
#实现子评论的数据包请求
def search_child_comment(self, root_comment_id, offset, proxies_num=0):
root_comment_id = str(root_comment_id)
offsets = offset
......@@ -313,6 +321,7 @@ class Crawler_zhihu():
next = 1
return next
#子评论数据插入
def child_comment_data(self, data_dict, root_comment_id):
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
......@@ -322,6 +331,7 @@ class Crawler_zhihu():
return
#url请求中的头部伪装
def headers_handle(self, url):
res_cookies_dict = self.get_serach_page_cookies()
headers_search = {
......@@ -354,6 +364,7 @@ class Crawler_zhihu():
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
return headers_search, cookies_dict
#回答图片剪切和下载
def answer_picture_doneload_and_cut(self):
sql = """select answer_id, url from zhihu_answer_picture_url"""
self.cur.execute(sql)
......@@ -416,6 +427,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#图片上传并得到新url
def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
'''非站内图片处理'''
try:
......@@ -431,6 +443,7 @@ class Crawler_zhihu():
print('upload ..... error')
return None
#替换url,更新回答内容
def answer_refresh_content(self):
sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
self.cur.execute(sql)
......@@ -455,6 +468,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
#文章图片剪切和下载
def article_picture_doneload_and_cut(self):
sql = """select article_id, url from zhihu_article_picture_url"""
self.cur.execute(sql)
......@@ -517,6 +531,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#替换url,更新文章内容
def article_refresh_content(self):
sql = """select article_id, url, new_url from zhihu_article_picture_url"""
self.cur.execute(sql)
......@@ -541,6 +556,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
#想法数据包请求
def search_thought_page(self, offset, proxies_num=0):
offset = str(offset)
......@@ -568,6 +584,7 @@ class Crawler_zhihu():
return
#想法内容插入
def parse_thought_sigle_page(self, data_dict):
for one_dict in data_dict["content"]:
......@@ -590,6 +607,7 @@ class Crawler_zhihu():
return
#想法评论数据包请求
def search_thought_comment(self, answerid, offset, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
......@@ -620,6 +638,7 @@ class Crawler_zhihu():
return next
#想法评论数据插入
def thought_comment_data(self, data_dict, answerid):
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
......@@ -628,6 +647,7 @@ class Crawler_zhihu():
return
#想法图片剪切和下载
def thought_picture_doneload_and_cut(self):
sql = """select thought_id, url from zhihu_thought_picture_url"""
self.cur.execute(sql)
......@@ -690,6 +710,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#封装回答最终数据结果格式
def answer_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
......@@ -705,6 +726,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
......@@ -732,6 +754,7 @@ class Crawler_zhihu():
)
return topics
#封装文章最终数据结果格式
def article_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
......@@ -747,6 +770,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
......@@ -774,6 +798,7 @@ class Crawler_zhihu():
)
return topics
#封装想法最终数据结果格式
def thought_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment