Chengyang Zhong / crawler · Commits

Commit 8a17245b authored Nov 26, 2020 by haowang

    modify

Parent: aff367ce

Showing 1 changed file with 172 additions and 119 deletions:

crawler_sys/site_crawler/crawler_zhihu_test.py (+172 / -119)
...
@@ -24,94 +24,96 @@ from pymysql import escape_string
class Crawler_zhihu():
    # Initialize the database and adjust the JS rules
    def __init__(self):
        '''
        Initialize the database and adjust the JS rules
        '''
        self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
                                    passwd='Gengmei1', db='mimas_dev', charset='utf8')
        self.cur = self.conn.cursor()

        self.cur.execute("drop table if exists zhihu_answer")
        sql = """create table zhihu_answer(title char(40),
            content text(59999),
            id int,
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article")
        sql = """create table zhihu_article(title char(40),
            content text(59999),
            id int,
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_answer_root_comment")
        sql = """create table zhihu_answer_root_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            answerid int,
            child_comment_count int,
            featured char(5),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_child_comment")
        sql = """create table zhihu_child_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            reply_name char(40),
            child_comment_id int,
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article_root_comment")
        sql = """create table zhihu_article_root_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            answerid int,
            child_comment_count int,
            featured char(5),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_answer_picture_url")
        sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article_picture_url")
        sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought")
        sql = """create table zhihu_thought(id char(50),
            content text(59999),
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought_comment")
        sql = """create table zhihu_thought_comment(thought_comment_id int,
            author_name char(40),
            content text(59999),
            answerid char(50),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought_picture_url")
        sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_answer")
        # sql = """create table zhihu_answer(title char(40),
        #     content text(59999),
        #     id int,
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_article")
        # sql = """create table zhihu_article(title char(40),
        #     content text(59999),
        #     id int,
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_answer_root_comment")
        # sql = """create table zhihu_answer_root_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid int,
        #     child_comment_count int,
        #     featured char(5),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_child_comment")
        # sql = """create table zhihu_child_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     reply_name char(40),
        #     child_comment_id int,
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_article_root_comment")
        # sql = """create table zhihu_article_root_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid int,
        #     child_comment_count int,
        #     featured char(5),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_answer_picture_url")
        # sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_article_picture_url")
        # sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought")
        # sql = """create table zhihu_thought(id char(50),
        #     content text(59999),
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought_comment")
        # sql = """create table zhihu_thought_comment(thought_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid char(50),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought_picture_url")
        # sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            ...
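The __init__ above repeats an identical drop/create/commit sequence for every table. A minimal sketch of how that pattern could be factored into a helper; the _recreate_table name and the explicit cur/conn arguments are assumptions for illustration, not part of this commit:

    import pymysql

    def _recreate_table(cur, conn, name, columns_sql):
        # Drop the table if it already exists, recreate it, and commit the DDL.
        cur.execute("drop table if exists {}".format(name))
        cur.execute("create table {}({})".format(name, columns_sql))
        conn.commit()

    # Hypothetical usage with the same schema as zhihu_answer above:
    # conn = pymysql.connect(host='...', port=3306, user='...', passwd='...', db='...', charset='utf8')
    # cur = conn.cursor()
    # _recreate_table(cur, conn, "zhihu_answer",
    #                 "title char(40), content text(59999), id int, created_time int, comment_count int")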
@@ -123,8 +125,11 @@ class Crawler_zhihu():
            # print(js)
            self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')

    # Refresh the cookies
    def get_serach_page_cookies(self):
        '''
        Refresh the cookies
        '''
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            ...
@@ -143,8 +148,10 @@ class Crawler_zhihu():
        requests_res = retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()

    # Insert the main content data and picture urls, then look for comments
    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content data and picture urls, then look for comments
        '''
        if mark == 0:
            into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
            ...
@@ -175,8 +182,10 @@ class Crawler_zhihu():
        return

    # Main entry point
    def search_page(self, answer_page_max, article_page_max, thought_page_max):
        '''
        Main entry point
        '''
        offset = 0
        for i in range(answer_page_max):
            ...
@@ -193,18 +202,18 @@ class Crawler_zhihu():
            self.search_thought_page(offset)
            offset = offset + 20
        self.answer_picture_doneload_and_cut()
        self.answer_refresh_content()
        self.article_picture_doneload_and_cut()
        self.article_refresh_content()
        self.answer_data_complex()
        # self.answer_picture_doneload_and_cut()
        # self.answer_refresh_content()
        # self.article_picture_doneload_and_cut()
        # self.article_refresh_content()
        # self.answer_data_complex()
        self.conn.close()
        return

    # Request the answer and article data packages
    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Request the answer and article data packages
        '''
        offset = str(offset)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
            ...
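The include parameter in the url above is stored already percent-encoded, which makes it hard to read or edit. A small sketch of assembling an equivalent request url from readable values with the standard library; the field list shown is only a subset and this approach is an assumption, not what the committed code does:

    from urllib.parse import urlencode

    # Subset of the fields requested from the answers endpoint, in readable form.
    include = ("data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,"
               "comment_count,can_comment,content,voteup_count,created_time,updated_time")
    params = {"include": include, "offset": 0, "limit": 20, "sort_by": "created"}
    url = ("https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?"
           + urlencode(params))
    print(url)  # brackets and commas come out percent-encoded (%5B, %5D, %2C)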
@@ -236,8 +245,10 @@ class Crawler_zhihu():
        return

    # Request the root comment data packages
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        '''
        Request the root comment data packages
        '''
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            ...
@@ -270,8 +281,10 @@ class Crawler_zhihu():
        return next

    # Insert the root comment data and link its child comments
    def root_comment_data(self, data_dict, answerid, mark):
        '''
        Insert the root comment data and link its child comments
        '''
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            ...
@@ -289,8 +302,10 @@ class Crawler_zhihu():
        return

    # Request the child comment data packages
    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        '''
        Request the child comment data packages
        '''
        root_comment_id = str(root_comment_id)
        offsets = offset
        offset = str(offset)
        ...
@@ -321,8 +336,10 @@ class Crawler_zhihu():
            next = 1
        return next

    # Insert the child comment data
    def child_comment_data(self, data_dict, root_comment_id):
        '''
        Insert the child comment data
        '''
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"],
                  data_dict["reply_to_author"]["member"]["name"], data_dict["id"],
                  data_dict["created_time"], data_dict["author"]["member"]["name"])
        ...
@@ -331,8 +348,10 @@ class Crawler_zhihu():
        return

    # Disguise the request headers
    def headers_handle(self, url):
        '''
        Disguise the request headers
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            ...
@@ -364,8 +383,10 @@ class Crawler_zhihu():
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    # Download and crop the answer pictures
    def answer_picture_doneload_and_cut(self):
        '''
        Download and crop the answer pictures
        '''
        sql = """select answer_id, url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
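headers_handle above signs each request by calling a compiled JS function ("b") through execjs and prefixing the result with "1.0_" for the x-zse-86 header. A self-contained sketch of the same compile-and-call flow with a stand-in script; the real signing JavaScript is not reproduced here and the toy function below is only an assumption:

    import execjs  # PyExecJS; needs a JS runtime such as Node on the PATH

    # Stand-in for the site's signing script; the real "b" implements Zhihu's hashing.
    js_source = 'function b(fmd5) { return "signed-" + fmd5; }'

    ctx = execjs.compile(js_source)
    fmd5 = "d41d8cd98f00b204e9800998ecf8427e"   # md5 of the string being signed
    x_zse_86 = "1.0_" + ctx.call("b", fmd5)     # same header format the crawler builds
    print(x_zse_86)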
@@ -407,7 +428,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -427,8 +447,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Upload the picture and get its new url
    def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
        '''
        Upload the picture and get its new url
        '''
        '''Handle pictures hosted off-site'''
        try:
            # with open(path, 'rb') as f:
            ...
@@ -443,8 +465,10 @@ class Crawler_zhihu():
            print('upload ..... error')
        return None

    # Replace the urls and refresh the answer content
    def answer_refresh_content(self):
        '''
        Replace the urls and refresh the answer content
        '''
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -464,12 +488,14 @@ class Crawler_zhihu():
                temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()

    # Download and crop the article pictures
    def article_picture_doneload_and_cut(self):
        '''
        Download and crop the article pictures
        '''
        sql = """select article_id, url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
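The update statements in the hunk above build SQL with str.format after passing the content through escape_string. For comparison, a sketch of the same update written with pymysql's parameter binding, which handles the escaping itself; the helper name and arguments are assumptions for illustration:

    def update_answer_content(conn, answer_id, new_content):
        # Bind the values as parameters; pymysql escapes them, so escape_string()
        # and manual string formatting are not needed.
        with conn.cursor() as cur:
            cur.execute("update zhihu_answer set content = %s where id = %s",
                        (new_content, answer_id))
        conn.commit()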
@@ -511,7 +537,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -531,8 +556,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Replace the urls and refresh the article content
    def article_refresh_content(self):
        '''
        Replace the urls and refresh the article content
        '''
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -552,13 +579,14 @@ class Crawler_zhihu():
                temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()

    # Request the thought data packages
    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request the thought data packages
        '''
        offset = str(offset)
        url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        ...
@@ -584,9 +612,10 @@ class Crawler_zhihu():
        return

    # Insert the thought content
    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert the thought content
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
                ...
@@ -607,8 +636,10 @@ class Crawler_zhihu():
        return

    # Request the thought comment data packages
    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request the thought comment data packages
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        ...
@@ -638,8 +669,10 @@ class Crawler_zhihu():
        return next

    # Insert the thought comment data
    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert the thought comment data
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"],
                  answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        ...
@@ -647,8 +680,10 @@ class Crawler_zhihu():
        return

    # Download and crop the thought pictures
    def thought_picture_doneload_and_cut(self):
        '''
        Download and crop the thought pictures
        '''
        sql = """select thought_id, url from zhihu_thought_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -690,7 +725,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -710,8 +744,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Assemble the final answer data format
    def answer_data_complex(self):
        '''
        Assemble the final answer data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_answer"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -754,8 +790,10 @@ class Crawler_zhihu():
            )
        return topics

    # Assemble the final article data format
    def article_data_complex(self):
        '''
        Assemble the final article data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_article"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -798,8 +836,10 @@ class Crawler_zhihu():
            )
        return topics

    # Assemble the final thought data format
    def thought_data_complex(self):
        '''
        Assemble the final thought data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_thought"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -834,6 +874,19 @@ class Crawler_zhihu():
                }
            )
        return topics

    def clean_data(self):
        self.answer_refresh_content()
        self.article_picture_doneload_and_cut()
        self.article_refresh_content()
        self.conn.close()
        return

    def complex_data(self):
        self.answer_data_complex()
        self.article_data_complex()
        self.thought_data_complex()
        self.conn.close()

if __name__ == '__main__':
    ...