import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl

# MySQL connection settings; Spider.__init__ passes them to pymysql.connect().
# NOTE(review): credentials are hard-coded in source control — consider
# loading them from the environment or a secrets store instead.
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
# Fallback path of the JS script; Spider.__init__ reads it when './zhihu.js'
# cannot be opened from the current working directory.
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# Alternative path kept for reference (looks like a local development checkout):
# JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
# API key handed to kdl.Auth() in get_proxy().
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'


def login():
    """Log in to Zhihu with the hard-coded e-mail account.

    Creates a fresh ``requests`` session, stores it in the module-level
    global ``session`` and POSTs the credentials to the e-mail login URL.
    The HTTP status code of the response is printed.

    Returns:
        The response object returned by ``session.post``.
    """
    loginUrl = 'https://www.zhihu.com/login/email'

    headers = {
        # "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
        'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
        "Referer": "http://www.zhihu.com/",
        'Host': 'www.zhihu.com',
        'rememberme': "true"
    }

    # NOTE(review): account credentials are hard-coded in source.
    data = {
        'email': 'yousangdandan@yeah.net',
        'password': '5358569'
    }

    # The session is published as a module global so that other code can
    # reuse the cookies obtained by the login.
    global session
    session = requests.session()
    login_req = session.post(loginUrl, data=data, headers=headers)
    # The original printed the undefined name `loginReq`, which raised
    # NameError on every call.
    print('loginReq:{}'.format(login_req.status_code))
    return login_req
    


def get_proxy():
    """Build the proxy settings used for outgoing requests.

    A proxy is requested from the kdl service and printed together with the
    client's proxy authorization, but the returned mapping currently points
    at a fixed address rather than at the fetched one.

    Returns:
        A ``(proxies, proxy_authorization)`` tuple: ``proxies`` is a dict
        with ``"http"`` and ``"https"`` entries, and ``proxy_authorization``
        is whatever ``client.get_proxy_authorization()`` returns (presumably
        a dict of headers — TODO confirm against the kdl SDK).
    """
    kdl_client = kdl.Client(kdl.Auth("990866563045611", APIKEY))
    ips = kdl_client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
    print("dps proxy: ", ips, kdl_client.get_proxy_authorization())

    # The fetched address (ips[0]) is intentionally not used here; the
    # proxy endpoint is pinned to a fixed host and port.
    fixed_endpoint = '171.35.213.172:9999'
    proxies = {
        "http": "http://{}".format(fixed_endpoint),
        "https": "https://{}".format(fixed_endpoint),
    }
    return proxies, kdl_client.get_proxy_authorization()


def retry_get_url(url, retrys=5, headers=None, timeout=10, **kwargs):
    """GET ``url`` with ``requests``, retrying when the request raises.

    Args:
        url: Address to fetch.
        retrys: Maximum number of attempts.
        headers: Optional dict of request headers. Defaults to an empty
            dict created per call (the previous ``{}`` default was a single
            dict shared by every call).
        timeout: Per-request timeout forwarded to ``requests.get``.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``
            (for example ``cookies``).

    Returns:
        The response of the first attempt that does not raise, whatever its
        status code, or ``None`` when every attempt raised.
    """
    if headers is None:
        headers = {}

    for _ in range(retrys):
        # Pause before every attempt (including the first one).
        time.sleep(3)
        try:
            return requests.get(url, headers=headers, timeout=timeout, **kwargs)
        except Exception as e:
            # Best-effort fetch: report the failure and try again.
            print(e)

    print('Failed to get page %s after %d retries, %s'
          % (url, retrys, datetime.now()))
    return None


class Spider(object):
    """Crawl the answers/articles of one Zhihu user into MySQL.

    The instance owns a MySQL connection (``conn``/``cur``) and a compiled
    JS context (``exec_js``) whose function ``b`` produces the ``x-zse-86``
    request signature.
    """

    def __init__(self, spider_url):
        """Open the database connection and load the JS signing script.

        Args:
            spider_url: Profile URL of the user to crawl, e.g.
                ``https://www.zhihu.com/people/<name>``.
        """
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()

        # 1000 doubles as "not known yet": search_answer_article_page
        # replaces it with the real page count after the first good response.
        self.page_count = 1000
        # Set here and on failed requests; not read anywhere in this class.
        self.use_proxy = True

        self.spider_url = spider_url
        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
        # URL template for the answers API; "{}" receives the offset.
        self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url

        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            # No readable script in the working directory: use the
            # deployed copy instead.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        # self.exec_js = execjs.compile(js)
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')

    def headers_handle(self, url):
        """Build the disguised headers and cookies for an API request.

        The ``x-zse-86`` header is ``"1.0_"`` followed by the result of the
        JS function ``b`` applied to the MD5 hex digest of
        ``"3_2.0+<url path>+<referer>+<d_c0 cookie>"``.

        Args:
            url: Full API URL that is about to be requested.

        Returns:
            A ``(headers, cookies)`` tuple of two dicts.
        """
        res_cookies_dict = self.get_serach_page_cookies()
        referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
        headers_search = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": referer + "/answers?page=1",
        }

        # Start from known cookie values so "d_c0" (needed for the
        # signature below) is always present, then let the cookies just
        # returned by the profile page override them.
        cookies_dict = {
            "d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
            "KLBRSID": '5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497',
        }
        cookies_dict.update(res_cookies_dict)

        f = "+".join(
            ["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.md5(f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    def get_serach_page_cookies(self):
        """Request the profile page and return the cookies it sets.

        Returns:
            A dict of cookie names to values; empty when the page could not
            be fetched at all.
        """
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609484506; KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497',
            "referer": self.spider_url,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        }
        requests_res = retry_get_url(self.spider_url, headers=headers)
        if requests_res is None:
            # retry_get_url gave up; there are no fresh cookies to report.
            return {}
        return requests_res.cookies.get_dict()

    def update_page_count(self, answer_count):
        """Set ``page_count`` to the number of 20-item pages needed.

        Args:
            answer_count: Total number of items reported by the API.
        """
        count, remainder = divmod(answer_count, 20)
        if remainder > 0:
            count += 1
        self.page_count = int(count)

    def check_data_exist(self, data_dict, mark):
        """Check whether an item is already stored before inserting it.

        Args:
            data_dict: One item from the API; only ``data_dict["id"]`` is read.
            mark: 0 looks in ``zhihu_answer``, 1 looks in ``zhihu_article``.

        Returns:
            True when a row with that id exists; False otherwise, including
            for any other ``mark`` (no query is run in that case).
        """
        if mark == 0:
            select_sql = "select id from zhihu_answer where answer_id = %s"
        elif mark == 1:
            # Articles are stored under article_id (see the insert in
            # parse_sigle_page), not answer_id.
            select_sql = "select id from zhihu_article where article_id = %s"
        else:
            return False

        # The id is bound as a parameter instead of being formatted into
        # the SQL text.
        self.cur.execute(select_sql, (data_dict["id"],))
        return bool(self.cur.fetchone())

    def parse_sigle_page(self, data_dict, mark):
        """Insert one answer or article unless it is already stored.

        Args:
            data_dict: One item from the API response.
            mark: 0 for an answer, 1 for an article.

        Raises:
            KeyError: when ``data_dict`` lacks a field that is stored.
            ValueError: when ``mark`` is neither 0 nor 1.
        """
        if self.check_data_exist(data_dict, mark):
            return

        if mark == 0:
            title = data_dict["question"]["title"]
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (title, data_dict["content"], data_dict["id"], data_dict["created_time"],
                      data_dict["comment_count"], data_dict["content"])
        elif mark == 1:
            title = data_dict["title"]
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (title, data_dict["content"], data_dict["id"], data_dict["created"],
                      data_dict["comment_count"], data_dict["content"])
        else:
            raise ValueError("unsupported mark: {!r}".format(mark))

        # Print the title that is actually stored; articles have no
        # "question" entry, so reading it there aborted the insert.
        print(title)
        self.cur.execute(into, values)
        self.conn.commit()

    def search_page(self, mark, page_max, start_page=0, need_commend=False):
        """Main entry point: crawl up to ``page_max`` pages, then close the DB.

        Args:
            mark: 0 answer, 1 article, 2 thought.
            page_max: Maximum number of pages to request.
            start_page: Used directly as the initial API offset.
                NOTE(review): the command-line help calls this a page
                number; confirm whether it should be multiplied by 20.
            need_commend: Unused; kept for interface compatibility.
        """
        offset = start_page

        try:
            for i in range(page_max):
                # page_count may shrink once the first response reports
                # the real total.
                if i > self.page_count - 1:
                    break
                if mark in (0, 1):
                    self.search_answer_article_page(offset, mark, 0)
                elif mark == 2:
                    # NOTE(review): search_thought_page is not defined in
                    # this class as visible here — confirm it exists.
                    self.search_thought_page(offset)

                offset = offset + 20
                time.sleep(10)
        finally:
            # Release the connection even when a page raises.
            self.conn.close()

    def get_page_data(self, url, headers_search, cookies_dict, max_attempts=3):
        """Fetch ``url`` and return its decoded JSON body.

        Args:
            url: API URL to request.
            headers_search: Request headers (including the signature).
            cookies_dict: Cookies to send.
            max_attempts: How many times to try before giving up.

        Returns:
            The decoded JSON, or ``None`` when no attempt produced a
            200 response with a JSON body.
        """
        for _ in range(max_attempts):
            get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict)
            if get_page is None:
                print("article_error, url : ", url, " no response")
                continue
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
                continue
            try:
                page_dict = get_page.json()
            except ValueError:
                # Body was not valid JSON; try again.
                print('retry get page data : {}'.format(url))
                continue
            print('get page json data success! {}'.format(url))
            return page_dict
        return None

    def search_answer_article_page(self, offset, mark, proxies_num=0, max_retries=3):
        """Request one page of answers or articles and store its items.

        Args:
            offset: API offset of the page.
            mark: 0 for answers, 1 for articles.
            proxies_num: Unused; kept for interface compatibility.
            max_retries: Extra attempts made when the response carries no
                data, so an empty page cannot cause endless retries.

        Raises:
            ValueError: when ``mark`` is neither 0 nor 1.
        """
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            # NOTE(review): ARTICLE_URL is not defined in the visible part
            # of this file — confirm where it comes from.
            url = ARTICLE_URL.format(offset)
        else:
            raise ValueError("unsupported mark: {!r}".format(mark))

        for _ in range(max_retries + 1):
            # Headers are rebuilt on every attempt, which also refreshes
            # the cookies.
            headers_search, cookies_dict = self.headers_handle(url)
            page_dict = self.get_page_data(url, headers_search, cookies_dict)
            if isinstance(page_dict, dict) and page_dict.get("data"):
                break
            print("article_data_error, offset: ", offset, " url: ", url)
            self.use_proxy = True
        else:
            # Every attempt came back without data; give up on this page.
            return

        print(self.page_count)
        if self.page_count == 1000:
            paging = page_dict.get("paging") or {}
            self.update_page_count(paging.get("totals", 0))
        for one_line in page_dict['data']:
            try:
                if one_line["content"] is not None:
                    self.parse_sigle_page(one_line, mark)
                    print("finshed_crawler " + offset)
            except KeyError:
                # Items missing an expected field are skipped rather than
                # aborting the whole page.
                print('page data error')
                continue

        return
    

if __name__ == '__main__':
    # Usage:
    #   python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
    #   python script_file_path mark max_page start_page spider_url
    #
    #   mark        what to crawl: 0 = answers, 1 = articles, 2 = thoughts
    #   max_page    maximum number of pages to fetch
    #   start_page  starting page (counted from 0)
    #   spider_url  Zhihu profile URL of the user to crawl
    argv = sys.argv
    mark = int(argv[1])
    max_page = int(argv[2])
    start_page = int(argv[3])
    spider_url = argv[4]

    # spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
    print(datetime.now())
    spider = Spider(spider_url=spider_url)
    # All three supported marks go through the same entry point; any other
    # value performs no crawl.
    if mark in (0, 1, 2):
        spider.search_page(mark, max_page, start_page)
    print(datetime.now())
