Commit d5e464a2 authored by 向万

zhihu spider and data structure

parent 772fd8b7
import os
import re
import hashlib

import cv2
import execjs
import pymysql
import requests
from datetime import datetime
from pymysql import escape_string

from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
from gm_upload.gm_upload.upload import upload_file
from gm_upload.gm_upload.consts import IMG_TYPE
class Crawler_zhihu():
    def __init__(self):
        self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
                                    passwd='Gengmei1',
                                    db='mimas_dev', charset='utf8')
        self.cur = self.conn.cursor()
        # Recreate the staging tables from scratch on every run.
        self.cur.execute("drop table if exists zhihu_answer")
        sql = """create table zhihu_answer(title char(40),
                                           content text(59999),
                                           id int,
                                           created_time int,
                                           comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article")
        sql = """create table zhihu_article(title char(40),
                                            content text(59999),
                                            id int,
                                            created_time int,
                                            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_answer_root_comment")
        sql = """create table zhihu_answer_root_comment(root_comment_id int,
                                                        author_name char(40),
                                                        content text(59999),
                                                        answerid int,
                                                        child_comment_count int,
                                                        featured char(5),
                                                        created_time int,
                                                        author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_child_comment")
        sql = """create table zhihu_child_comment(root_comment_id int,
                                                  author_name char(40),
                                                  content text(59999),
                                                  reply_name char(40),
                                                  child_comment_id int,
                                                  created_time int,
                                                  author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article_root_comment")
        sql = """create table zhihu_article_root_comment(root_comment_id int,
                                                         author_name char(40),
                                                         content text(59999),
                                                         answerid int,
                                                         child_comment_count int,
                                                         featured char(5),
                                                         created_time int,
                                                         author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_answer_picture_url")
        sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_article_picture_url")
        sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought")
        sql = """create table zhihu_thought(id char(50),
                                            content text(59999),
                                            created_time int,
                                            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought_comment")
        sql = """create table zhihu_thought_comment(thought_comment_id int,
                                                    author_name char(40),
                                                    content text(59999),
                                                    answerid char(50),
                                                    created_time int,
                                                    author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()
        self.cur.execute("drop table if exists zhihu_thought_picture_url")
        sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        # Compile the signing script; Node must be available as the execjs runtime.
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except IOError:
            with open('/Users/xuwei/crawler/crawler_sys/site_crawler/zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
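        # Assumption: zhihu.js (not included in this commit) exposes a function
        # b(md5_hex) that returns the body of the x-zse-86 anti-crawler
        # signature consumed in headers_handle() below.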
    def get_search_page_cookies(self):
        """Fetch the profile page once to pick up fresh session cookies."""
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()
    def parse_single_page(self, data_dict, mark):
        # mark == 0: answer, mark == 1: article
        if mark == 0:
            into = "insert into zhihu_answer(title, content, id, created_time, comment_count) values(%s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"])
        elif mark == 1:
            into = "insert into zhihu_article(title, content, id, created_time, comment_count) values(%s, %s, %s, %s, %s)"
            values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
        self.cur.execute(into, values)
        self.conn.commit()
        # Page through root comments, 20 per request, until a short page comes back.
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_root_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        # Collect in-body image URLs from the rendered HTML content.
        patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
        pattern = re.compile(patt)
        result = pattern.findall(data_dict["content"])
        for results in result:
            if mark == 0:
                into = "insert into zhihu_answer_picture_url(answer_id, url) values(%s, %s)"
            elif mark == 1:
                into = "insert into zhihu_article_picture_url(article_id, url) values(%s, %s)"
            values = (data_dict["id"], results)
            self.cur.execute(into, values)
            self.conn.commit()
        return
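    # The image regex above targets fragments like the following (hypothetical
    # sample): <noscript><img src="https://pic1.zhimg.com/v2-abc123_r.jpg" data-caption...
    # and captures only the URL between 'src="' and '" data-caption'.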
    def search_page(self, answer_page_max, article_page_max, thought_page_max):
        offset = 0
        for i in range(answer_page_max):
            self.search_answer_article_page(offset, 0)
            offset = offset + 20
        offset = 0
        for i in range(article_page_max):
            self.search_answer_article_page(offset, 1)
            offset = offset + 20
        offset = 0
        for i in range(thought_page_max):
            self.search_thought_page(offset)
            offset = offset + 20
        self.answer_picture_download_and_cut()
        self.answer_refresh_content()
        # self.article_picture_download_and_cut()
        # self.article_refresh_content()
        self.answer_data_complex()
        self.conn.close()
        return
    def search_answer_article_page(self, offset, mark, proxies_num=0):
        offset = str(offset)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
        elif mark == 1:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] is not None:
                        self.parse_single_page(one_line, mark)
                        print("finished_article " + offset)
                except KeyError:
                    # It is fine to drop trailing entries; the search API may
                    # return items that are only loosely related to the query.
                    continue
        else:
            print("article_data_error")
        return
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        elif mark == 1:
            url = "https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.root_comment_data(one_line, answerid, mark)
                    print("finished_root " + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        # A full page (20 items) signals that another page may follow.
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def root_comment_data(self, data_dict, answerid, mark):
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        # Page through child comments when the root comment has replies.
        offset = 0
        if data_dict["child_comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_child_comment(data_dict["id"], offset)
                offset = offset + 20
        return
    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        root_comment_id = str(root_comment_id)
        # The first page of child comments uses a parameterless endpoint.
        if offset == 0:
            url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments".format(root_comment_id)
        else:
            url = "https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}".format(root_comment_id, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("child_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.child_comment_data(one_line, root_comment_id)
                except KeyError:
                    continue
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def child_comment_data(self, data_dict, root_comment_id):
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) values(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def headers_handle(self, url):
        res_cookies_dict = self.get_search_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        # Sign the request: md5 over "3_2.0+<api path>+<referer>+<d_c0>", then feed
        # the digest to the obfuscated b() function compiled from zhihu.js.
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
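    # Signature recipe, for illustration only (all values hypothetical):
    #   path   = "/api/v4/members/geng-mei-suo-chang/answers?offset=0&limit=20"
    #   signed = "3_2.0+" + path + "+" + referer + "+" + d_c0
    #   header = "1.0_" + b(md5(signed))
    # where b() is the obfuscated function compiled from zhihu.js.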
    def answer_picture_download_and_cut(self):
        sql = """select answer_id, url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/answer_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_answer_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/answer_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_answer_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
        '''Upload an off-site image and return its new hosted URL.'''
        try:
            url = upload_file(file_path=path, img_type=img_type)
            print('upload ..... ', url)
            return url
        except Exception:
            print('upload ..... error')
            return None
    def answer_refresh_content(self):
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(rows)):
            find_id = rows[i][0]
            temp = str(rows[i][1])
            # "?" is a regex metacharacter; swap it for "#" in both the pattern
            # and the content so the original URL matches literally.
            temp1 = temp.replace("?", "#")
            sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(rows[i][2]) + "\"></noscript></figure>",
                                 temp_tuples)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, rows[i][0])
            self.cur.execute(sql)
            self.conn.commit()
    def article_picture_download_and_cut(self):
        sql = """select article_id, url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/article_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_article_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/article_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_article_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def article_refresh_content(self):
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(rows)):
            find_id = rows[i][0]
            temp = str(rows[i][1])
            # "?" is a regex metacharacter; swap it for "#" in both the pattern
            # and the content so the original URL matches literally.
            temp1 = temp.replace("?", "#")
            sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(rows[i][2]) + "\"></noscript></figure>",
                                 temp_tuples)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, rows[i][0])
            self.cur.execute(sql)
            self.conn.commit()
    def search_thought_page(self, offset, proxies_num=0):
        offset = str(offset)
        url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_single_page(one_line)
                    print("finished_thought " + offset)
                except KeyError:
                    # It is fine to drop trailing entries; the API may return
                    # items that are only loosely related to the request.
                    continue
        else:
            print("thought_data_error")
        return
    def parse_thought_single_page(self, data_dict):
        # A "pin" (thought) body is a list of typed segments: text or images.
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, id, created_time, comment_count) values(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
                self.cur.execute(into, values)
                self.conn.commit()
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) values(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
                self.cur.execute(into, values)
                self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            has_next = 1
            while has_next == 1:
                has_next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return
    def search_thought_comment(self, answerid, offset, proxies_num=0):
        offset = str(offset)
        answerid = str(answerid)
        url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once without the signed headers
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("thought_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finished_thought_comment " + offset)
                except KeyError:
                    continue
        else:
            print("thought_comment_data_error")
        # A full page (20 items) signals that another page may follow.
        has_next = 0
        if len(page_dict.get('data', [])) == 20:
            has_next = 1
        return has_next
    def thought_comment_data(self, data_dict, answerid):
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) values(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return
    def thought_picture_download_and_cut(self):
        sql = """select thought_id, url from zhihu_thought_picture_url"""
        self.cur.execute(sql)
        rows = self.cur.fetchall()
        self.conn.commit()
        path = "/Users/xuwei/Desktop/thought_picture/"
        gif_patt = r'gif'
        for i in range(len(rows)):
            mark = re.search(gif_patt, rows[i][1])
            url = rows[i][1]
            [headers_search, cookies_dict] = self.headers_handle(url)
            r = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
            print(r.status_code)
            if mark:
                # GIFs are saved and re-uploaded as-is; cropping would break the animation.
                pathes = path + 'num' + str(i) + '.gif'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_thought_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                pathes = path + 'num' + str(i) + '.jpg'
                with open(pathes, 'wb') as f:  # write the response body to disk as binary
                    f.write(r.content)
                # Keep the top 90% of the image, dropping the bottom strip, then re-upload.
                img = cv2.imread(pathes)
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = "/Users/xuwei/Desktop/thought_picture_cut/num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)
                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE zhihu_thought_picture_url SET new_url = "{0}" WHERE url = "{1}" """.format(str(new_url), str(rows[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
    def answer_data_complex(self):
        """Assemble answers, their images, and nested comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_answer"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_answer_picture_url as a where a.answer_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select root_comment_id, child_comment_count, content, created_time, author_id from zhihu_answer_root_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                reply = []
                if root_comment[j][1] != 0:
                    sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
                    self.cur.execute(sql)
                    child_comments = self.cur.fetchall()
                    reply = [{'id': item[0], 'comment': item[1], 'create_time': item[2], 'user': {'id': item[3]}} for item in child_comments]
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][2],
                            'create_time': root_comment[j][3],
                            'user': {'id': root_comment[j][4]},
                            'reply': reply,
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
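    # Shape of the list returned above (values hypothetical):
    # [{'images': (('https://pic1.zhimg.com/v2-abc123_r.jpg',),),
    #   'content': '<p>answer body html</p>',
    #   'id': 123456789,
    #   'create_time': 1605507877,
    #   'comments': [[{'id': 1, 'comment': 'text', 'create_time': 1605507900,
    #                  'user': {'id': 'abc'}, 'reply': []}]]}]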
    def article_data_complex(self):
        """Assemble articles, their images, and nested comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_article"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_article_picture_url as a where a.article_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select root_comment_id, child_comment_count, content, created_time, author_id from zhihu_article_root_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                reply = []
                if root_comment[j][1] != 0:
                    sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
                    self.cur.execute(sql)
                    child_comments = self.cur.fetchall()
                    reply = [{'id': item[0], 'comment': item[1], 'create_time': item[2], 'user': {'id': item[3]}} for item in child_comments]
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][2],
                            'create_time': root_comment[j][3],
                            'user': {'id': root_comment[j][4]},
                            'reply': reply,
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
    def thought_data_complex(self):
        """Assemble thoughts (pins), their images, and comments into one structure."""
        sql = """select id, content, created_time, comment_count from zhihu_thought"""
        self.cur.execute(sql)
        topics = []
        rows = self.cur.fetchall()
        for i in range(len(rows)):
            sql = """select url from zhihu_thought_picture_url as a where a.thought_id = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            images_url = self.cur.fetchall()
            sql = """select thought_comment_id, content, created_time, author_id from zhihu_thought_comment as a where a.answerid = '{}' """.format(rows[i][0])
            self.cur.execute(sql)
            root_comment = self.cur.fetchall()
            comment = []
            for j in range(len(root_comment)):
                comment.append(
                    [
                        {
                            'id': root_comment[j][0],
                            'comment': root_comment[j][1],
                            'create_time': root_comment[j][2],
                            'user': {'id': root_comment[j][3]},
                        }
                    ])
            topics.append(
                {
                    'images': images_url,
                    'content': rows[i][1],
                    'id': rows[i][0],
                    'create_time': rows[i][2],
                    'comments': comment,
                }
            )
        return topics
if __name__ == '__main__':
    print(datetime.now())
    zhihu = Crawler_zhihu()
    zhihu.search_page(1, 0, 0)
    print(datetime.now())
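# Note: search_page(1, 0, 0) crawls a single page (up to 20 items) of answers
# and skips the article and thought passes for this run.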
@@ -13,3 +13,4 @@ numpy==1.19.1
pymysql==0.10.0
qiniu==7.1.4
redis==3.5.3
opencv-python==4.4.0.46