Chengyang Zhong / crawler · Commits

Commit 929f9869, authored Nov 27, 2020 by haowang
separate zhihu spider
Parent: 709a4df1

Showing 5 changed files with 577 additions and 6 deletions (+577, -6)
Changed files:

  tasks/zhihu/__init__.py         +0    -0    (new file)
  tasks/zhihu/content_refresh.py  +138  -0    (new file)
  tasks/zhihu/image_qiniu.py      +0    -0    (moved from tasks/image_qiniu.py)
  tasks/zhihu/spider.py           +424  -0    (new file)
  tasks/zhihu/upload_picture.py   +15   -6    (moved from tasks/upload_picture.py)
tasks/zhihu/__init__.py · new file (mode 0 → 100644), empty
tasks/zhihu/content_refresh.py · new file (mode 0 → 100644)
import pymysql
import execjs
import os
import re
from datetime import datetime
from pymysql import escape_string
import sys

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'


class RefreshContent(object):

    def __init__(self):
        '''
        Initialize the database connection and load the JS rules.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD, db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)

    def refresh_content(self, table, pic_table, key_id):
        '''
        Replace picture urls and update the stored content (generic version
        driven by table / pic_table / key_id).
        '''
        import re
        # rows: (key_id, old url, new url)
        sql = """select {}, url, new_url from {}""".format(key_id, pic_table)
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(tuple)):
            if tuple[i][2] == None:
                continue
            find_id = tuple[i][0]
            temp = str(tuple[i][1])
            temp1 = temp.replace("?", "#")
            sql = """select new_content from {} where {} = '{}' """.format(table, key_id, find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            # tuples = str(tuples)
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(tuple[i][2]) + "\"></noscript></figure>", temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update {} set new_content = '{}' WHERE {} = '{}' """.format(table, new_content, key_id, find_id)
            self.cur.execute(sql)
            self.conn.commit()

    def answer_refresh_content(self):
        '''
        Replace picture urls and update answer content.
        '''
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(tuple)):
            if tuple[i][2] == None:
                continue
            find_id = tuple[i][0]
            temp = str(tuple[i][1])
            temp1 = temp.replace("?", "#")
            sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            # tuples = str(tuples)
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(tuple[i][2]) + "\"></noscript></figure>", temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()

    def article_refresh_content(self):
        '''
        Replace picture urls and update article content.
        '''
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        self.conn.commit()
        for i in range(len(tuple)):
            find_id = tuple[i][0]
            temp = str(tuple[i][1])
            temp1 = temp.replace("?", "#")
            sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
            self.cur.execute(sql)
            tuples = self.cur.fetchall()
            # tuples = str(tuples)
            content = tuples[0][0]
            pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + temp1, "</figure>")
            temp_tuples = content.replace("?", "#")
            new_content = re.sub(pattern, "<noscript><img src=\"" + str(tuple[i][2]) + "\"></noscript></figure>", temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()


if __name__ == '__main__':
    # print('Number of arguments:', len(sys.argv))
    # print('Argument list:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1])
    mark = int(sys.argv[1])
    print(datetime.now())
    refresh = RefreshContent()
    if mark == 0:
        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id')
    elif mark == 1:
        refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
    elif mark == 2:
        refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
    print(datetime.now())
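The rewrite in refresh_content hinges on one regex trick: the old picture URL is embedded directly into a pattern, so its "?" (both a regex metacharacter and the query-string separator) is first masked as "#" in the URL and in the content. Below is a minimal, database-free sketch of that single step; the URLs are invented for illustration only.

import re

old_url = "https://pic1.zhimg.com/v2-abc.jpg?source=123"
new_url = "https://cdn.example.com/v2-abc.jpg"  # hypothetical replacement target
content = '<figure><noscript><img src="%s" data-caption=""></noscript></figure>' % old_url

# Mask "?" so the old URL can sit inside a regex pattern without escaping issues.
masked_url = old_url.replace("?", "#")
masked_content = content.replace("?", "#")

# Match from the old <img src="..."> up to the closing </figure> and swap in the new URL.
pattern = r'%s(.+?)%s' % ("<noscript><img src=\"" + masked_url, "</figure>")
rewritten = re.sub(pattern, "<noscript><img src=\"" + new_url + "\"></noscript></figure>", masked_content)
print(rewritten)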
tasks/image_qiniu.py → tasks/zhihu/image_qiniu.py · file moved, no content changes
tasks/zhihu/spider.py · new file (mode 0 → 100644)
# import rsa
# import os, sys
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("/Users/xuwei")
# sys.path.append("/Users/xuwei/crawler")
# sys.path.append("/Users/xuwei/crawler/crawler_sys")
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
from datetime import datetime

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'

SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'


class Spider(object):

    def __init__(self):
        '''
        Initialize the database connection and load the JS rules.
        '''
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD, db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)

    def get_serach_page_cookies(self):
        '''
        Refresh cookies from the profile page.
        '''
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()

    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content data and picture urls, then crawl the comments.
        '''
        if mark == 0:
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
        elif mark == 1:
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
        self.cur.execute(into, values)
        self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_root_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        # extract the picture urls embedded in the content HTML
        patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
        pattern = re.compile(patt)
        result = pattern.findall(data_dict["content"])
        for results in result:
            if mark == 0:
                into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
            elif mark == 1:
                into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
            values = (data_dict["id"], results)
            self.cur.execute(into, values)
            self.conn.commit()
        return

    def search_page(self, mark, page_max, start_page=0):
        '''
        Main entry point.
        params:
            mark 0 answer, 1 article, 2 thought
        '''
        offset = start_page
        for i in range(page_max):
            if mark == 0:
                self.search_answer_article_page(offset, 0)
            elif mark == 1:
                self.search_answer_article_page(offset, 1)
            elif mark == 2:
                self.search_thought_page(offset)
            offset = offset + 20
        self.conn.close()
        return

    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Request the answer / article data pages.
        '''
        offset = str(offset)
        if mark == 0:
            url = ANSWER_URL.format(offset)
        elif mark == 1:
            url = ARTICLE_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    if one_line["content"] != None:
                        self.parse_sigle_page(one_line, mark)
                        print("finshed_article" + offset)
                except KeyError:
                    # It's totally ok to drop the last return data value.
                    # The search api just return something seems related to search
                    continue
        else:
            print("article_data_error")
        return

    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        '''
        Request the root (parent) comment data pages.
        '''
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            url = ANSWER_ROOT_COMMENT_URL.format(answerid, offset)
        elif mark == 1:
            url = ARTICLE_ROOT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.root_comment_data(one_line, answerid, mark)
                    print("finshed_root" + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        next = 0
        if len(page_dict['data']) == 20:
            next = 1
        return next

    def root_comment_data(self, data_dict, answerid, mark):
        '''
        Insert root comment data and walk its child comments.
        '''
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            into = "insert into zhihu_article_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["child_comment_count"], data_dict["featured"], data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        offset = 0
        if data_dict["child_comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_child_comment(data_dict["id"], offset, mark)
                offset = offset + 20
        return

    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        '''
        Request the child comment data pages.
        '''
        root_comment_id = str(root_comment_id)
        offsets = offset
        offset = str(offset)
        if offsets == 0:
            url = CHILD_COMMENT_START_URL.format(root_comment_id)
        else:
            url = CHILD_COMMENT_OFFSET_URL.format(root_comment_id, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url_no_proxies(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("child_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.child_comment_data(one_line, root_comment_id)
                except KeyError:
                    continue
        else:
            pass
        next = 0
        if len(page_dict['data']) == 20:
            next = 1
        return next

    def child_comment_data(self, data_dict, root_comment_id):
        '''
        Insert child comment data.
        '''
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["name"])
        self.cur.execute(into, values)
        self.conn.commit()
        return

    def headers_handle(self, url):
        '''
        Build disguised request headers for the url (including the x-zse-86 signature).
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
        }
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        cookies_dict.update(res_cookies_dict)
        # signature source: version + path + referer + d_c0, md5-hashed, then signed by zhihu.js
        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request the thought (pin) data pages.
        '''
        offset = str(offset)
        url = THOUGHT_URL.format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("article_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.parse_thought_sigle_page(one_line)
                    print("finshed_article" + offset)
                except KeyError:
                    # It's totally ok to drop the last return data value.
                    # The search api just return something seems related to search
                    continue
        else:
            print("article_data_error")
        return

    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert thought content.
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
                values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
                self.cur.execute(into, values)
                self.conn.commit()
            else:
                into = "insert into zhihu_thought_picture_url(thought_id, url) value(%s, %s)"
                values = (data_dict["id"], one_dict["url"])
                self.cur.execute(into, values)
                self.conn.commit()
        offset = 0
        if data_dict["comment_count"] != 0:
            next = 1
            while next == 1:
                next = self.search_thought_comment(data_dict["id"], offset)
                offset = offset + 20
        return

    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request the thought comment data pages.
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = THOUGHT_COMMENT_URL.format(answerid, offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)
        if get_page.status_code != 200:
            # retry once
            get_page = requests.get(url)
            if get_page.status_code != 200:
                print("root_comment_error")
        page_dict = get_page.json()
        if page_dict.get("data"):
            for one_line in page_dict['data']:
                try:
                    self.thought_comment_data(one_line, answerid)
                    print("finshed_root" + offset)
                except KeyError:
                    continue
        else:
            print("root_data_error")
        next = 0
        if len(page_dict['data']) == 20:
            next = 1
        return next

    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert thought comment data.
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        self.conn.commit()
        return


if __name__ == '__main__':
    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    print(datetime.now())
    spider = Spider()
    if mark == 0:
        spider.search_page(mark, max_page, start_page)
    elif mark == 1:
        spider.search_page(mark, max_page, start_page)
    elif mark == 2:
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
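For reference, a minimal sketch of how the two new modules might be driven from Python rather than through sys.argv. It assumes the repository root is on PYTHONPATH so that tasks.zhihu is importable, that the MySQL host in HOST and the zhihu.js file are reachable, and that the picture upload step (tasks/zhihu/upload_picture.py) has filled the new_url columns before the refresh call.

from tasks.zhihu.spider import Spider
from tasks.zhihu.content_refresh import RefreshContent

# Crawl the first five pages (20 items per page) of answers;
# mark: 0 = answers, 1 = articles, 2 = thoughts (pins).
spider = Spider()
spider.search_page(mark=0, page_max=5, start_page=0)

# After the upload step has written new_url values into zhihu_answer_picture_url,
# rewrite the stored answer HTML to point at the new image URLs.
refresh = RefreshContent()
refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id')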
tasks/upload_picture.py → tasks/zhihu/upload_picture.py · file moved and modified
 import os
+import sys
 import re
 import time
 import pymysql
...
@@ -10,8 +11,8 @@ from datetime import datetime
 from image_qiniu import upload_file, IMG_TYPE

-DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
+DATA_OS_PATH = '/image'
-PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'
+PROJECT_PATH = '/'

 class UploadImage(object):
...
@@ -43,7 +44,7 @@ class UploadImage(object):
         with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f:
             js = f.read()
             # print(js)
-        self.exec_js = execjs.compile(js)
+        self.exec_js = execjs.compile(js, )

     def get_serach_page_cookies(self):
         '''
...
@@ -133,11 +134,11 @@ class UploadImage(object):
             print('upload ..... error')
             return None

-    def picture_download_and_cut(self, path, new_path, table, key_id):
+    def picture_download_and_cut(self, path, new_path, table, key_id, offset=0, count=10):
         '''
         Download and crop article pictures.
         '''
-        sql = """select {}, url from {} """.format(key_id, table)
+        sql = """select {}, url from {} where new_url == '' limit {}, {}""".format(key_id, table, offset, count)
         self.cur.execute(sql)
         tuple = self.cur.fetchall()
         self.conn.commit()
...
@@ -186,7 +187,15 @@ class UploadImage(object):
 if __name__ == '__main__':
+    mark = int(sys.argv[1]) or 0
+    offset = int(sys.argv[2]) or 0
+    count = int(sys.argv[3]) or 10
     print(datetime.now())
     a = UploadImage()
-    a.picture_download_and_cut_process()
+    if mark == 0:
+        a.picture_download_and_cut(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH, 'zhihu_answer_picture_url', 'answer_id', offset, count)
+    if mark == 1:
+        a.picture_download_and_cut(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH, 'zhihu_article_picture_url', 'article_id', offset, count)
+    if mark == 2:
+        a.picture_download_and_cut(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH, 'zhihu_thought_picture_url', 'thought_id', offset, count)
     print(datetime.now())
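With this change the upload script takes three positional arguments: mark (0 answers, 1 articles, 2 thoughts) plus offset and count for the "limit {}, {}" window, so an invocation along the lines of "python tasks/zhihu/upload_picture.py 0 0 10" would process the first ten pending answer images. The exact working directory and PYTHONPATH setup shown here are assumptions, not part of the commit.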