Commit f1a51df3 authored by haowang's avatar haowang

modify

parent 778c9506
...@@ -22,15 +22,38 @@ import cv2 ...@@ -22,15 +22,38 @@ import cv2
from pymysql import escape_string from pymysql import escape_string
# --- MySQL connection settings ---
# NOTE(review): credentials are hard-coded here; consider moving them to
# environment variables or a config file outside version control.
HOST = '172.18.51.14'
PORT = 3306  # must be an int: pymysql.connect(port=...) expects an integer, not '3306'
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'

# Path to the JS file used to compute Zhihu request signatures (loaded via execjs).
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'

# --- Zhihu API endpoint templates ---
# Placeholders {0}/{1} are filled with str.format: {0} is the paging offset
# (or the answer/article/pin/comment id where two placeholders exist, in
# which case {1} is the offset).
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'

# --- Local directories for downloaded / cropped images ---
# *_PICTURE_PATH: raw downloads; *_PICTURE_CUT_PATH: images cropped to 9/10 height.
ANSWER_PICTURE_PATH = '/data/answer_picture/'
ARTICLE_PICTURE_PATH = '/data/article_picture/'
THOUGHT_PICTURE_PATH = '/data/thought_picture/'
ANSWER_PICTURE_CUT_PATH = '/data/answer_picture_cut/'
ARTICLE_PICTURE_CUT_PATH = '/data/article_picture_cut/'
THOUGHT_PICTURE_CUT_PATH = '/data/thought_picture_cut/'
class Crawler_zhihu(): class Crawler_zhihu():
def __init__(self): def __init__(self):
''' '''
初始化数据库,调整js规则 初始化数据库,调整js规则
''' '''
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work', self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd='Gengmei1', passwd=PASSWD,
db='mimas_dev', charset='utf8') db=DB, charset='utf8')
self.cur = self.conn.cursor() self.cur = self.conn.cursor()
# self.cur.execute("drop table if exists zhihu_answer") # self.cur.execute("drop table if exists zhihu_answer")
# sql = """create table zhihu_answer(title char(40), # sql = """create table zhihu_answer(title char(40),
...@@ -120,7 +143,7 @@ class Crawler_zhihu(): ...@@ -120,7 +143,7 @@ class Crawler_zhihu():
with open('./zhihu.js', 'r', encoding='utf-8') as f: with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
except: except:
with open('/Users/xuwei/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f: with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
# print(js) # print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules') self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
...@@ -216,9 +239,9 @@ class Crawler_zhihu(): ...@@ -216,9 +239,9 @@ class Crawler_zhihu():
''' '''
offset = str(offset) offset = str(offset)
if mark == 0: if mark == 0:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset) url = ANSWER_URL.format(offset)
elif mark == 1: elif mark == 1:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created".format(offset) url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num) get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
...@@ -241,8 +264,6 @@ class Crawler_zhihu(): ...@@ -241,8 +264,6 @@ class Crawler_zhihu():
else: else:
print("article_data_error") print("article_data_error")
return return
def search_root_comment(self, answerid, offset, mark, proxies_num=0): def search_root_comment(self, answerid, offset, mark, proxies_num=0):
...@@ -252,9 +273,9 @@ class Crawler_zhihu(): ...@@ -252,9 +273,9 @@ class Crawler_zhihu():
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
if mark == 0: if mark == 0:
url = "https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
elif mark == 1: elif mark == 1:
url = "https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
...@@ -310,9 +331,9 @@ class Crawler_zhihu(): ...@@ -310,9 +331,9 @@ class Crawler_zhihu():
offsets = offset offsets = offset
offset = str(offset) offset = str(offset)
if offsets == 0: if offsets == 0:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments".format(root_comment_id) url = CHILD_COMMENT_START_URL.format(root_comment_id)
else: else:
url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}".format(root_comment_id, offset) url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num) get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
...@@ -391,7 +412,7 @@ class Crawler_zhihu(): ...@@ -391,7 +412,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/answer_picture/" path = ANSWER_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -419,7 +440,7 @@ class Crawler_zhihu(): ...@@ -419,7 +440,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/answer_picture_cut/num" + str(i) + ".jpg" pathes = ANSWER_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
...@@ -502,7 +523,7 @@ class Crawler_zhihu(): ...@@ -502,7 +523,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/article_picture/" path = ARTICLE_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -530,7 +551,7 @@ class Crawler_zhihu(): ...@@ -530,7 +551,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/article_picture_cut/num" + str(i) + ".jpg" pathes = ARTICLE_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
...@@ -590,7 +611,7 @@ class Crawler_zhihu(): ...@@ -590,7 +611,7 @@ class Crawler_zhihu():
想法数据包请求 想法数据包请求
''' '''
offset = str(offset) offset = str(offset)
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset) url = THOUGHT_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num) get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
...@@ -644,7 +665,7 @@ class Crawler_zhihu(): ...@@ -644,7 +665,7 @@ class Crawler_zhihu():
''' '''
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = THOUGHT_COMMENT_URL.format(answerid, offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
...@@ -690,7 +711,7 @@ class Crawler_zhihu(): ...@@ -690,7 +711,7 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
path = "/Users/xuwei/Desktop/thought_picture/" path = THOUGHT_PICTURE_PATH
gif_patt = r'gif' gif_patt = r'gif'
for i in range(len(tuple)): for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1]) mark = re.search(gif_patt, tuple[i][1])
...@@ -718,7 +739,7 @@ class Crawler_zhihu(): ...@@ -718,7 +739,7 @@ class Crawler_zhihu():
img = cv2.imread(pathes) img = cv2.imread(pathes)
high, width = img.shape[:2] high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width] cropped = img[0:int(high / 10 * 9), 0:width]
pathes = "/Users/xuwei/Desktop/thought_picture_cut/num" + str(i) + ".jpg" pathes = THOUGHT_PICTURE_CUT_PATH + "num" + str(i) + ".jpg"
cv2.imwrite(pathes, cropped) cv2.imwrite(pathes, cropped)
new_url = self.upload_image_with_path(pathes) new_url = self.upload_image_with_path(pathes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment