import base64
import hashlib
import json
import os
import re
import sys
import time
from datetime import datetime

import execjs
import pymysql
import requests
from PIL import Image
from requests_html import HTMLSession

from captcha.zhihu_captcha import ZhihuCaptcha

# MySQL connection settings used by Spider.__init__. Every value can be
# overridden through the environment so that credentials do not have to be
# edited in source; the defaults are the values this script has always used.
HOST = os.environ.get('ZHIHU_SPIDER_DB_HOST', '172.18.51.14')
PORT = int(os.environ.get('ZHIHU_SPIDER_DB_PORT', '3306'))
USER = os.environ.get('ZHIHU_SPIDER_DB_USER', 'spider')
PASSWD = os.environ.get('ZHIHU_SPIDER_DB_PASSWD', 'Gengmei123')
DB = os.environ.get('ZHIHU_SPIDER_DB_NAME', 'spider')
# Fallback location of zhihu.js, read when ./zhihu.js cannot be opened.
# Set ZHIHU_SPIDER_JS_PATH to point at a local checkout during development.
JS_FILE_PATH = os.environ.get(
    'ZHIHU_SPIDER_JS_PATH',
    '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js',
)


class Spider(object):
    """Crawl one Zhihu member's answers or articles into MySQL.

    The member's profile URL is turned into paginated ``api/v4/members``
    URLs. Each page is fetched with a logged-in ``requests`` session and
    every item that is not stored yet is inserted into ``zhihu_answer`` or
    ``zhihu_article``.

    The ``mark`` argument used throughout selects the content type:
    0 = answer, 1 = article, 2 = thought.
    """

    # Placeholder page count used until the first API response reports the
    # real total (see search_answer_article_page).
    DEFAULT_PAGE_COUNT = 1000
    # Items per API page; matches the ``limit=20`` baked into the URLs.
    PAGE_SIZE = 20
    # mark -> (table name, column holding the Zhihu id). Table and column
    # names cannot be bound as SQL parameters, so they come only from here.
    _STORAGE = {
        0: ('zhihu_answer', 'answer_id'),
        1: ('zhihu_article', 'article_id'),
    }
    _DATA_URI_PREFIX = 'data:image/png;base64,'
    _CAPTCHA_APPEAL_URL = 'https://www.zhihu.com/api/v4/anticrawl/captcha_appeal'

    def __init__(self, spider_url):
        """Connect to MySQL, build the API URLs, compile zhihu.js and log in.

        Args:
            spider_url: Profile URL of the member to crawl, e.g.
                ``https://www.zhihu.com/people/<url-token>``.
        """
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()

        self.page_count = self.DEFAULT_PAGE_COUNT
        self.use_proxy = True

        self.spider_url = spider_url
        detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
        # NOTE(review): ARTICLE_URL was referenced by the original code but
        # never defined. This query is built by analogy with the answers
        # endpoint -- confirm the ``include`` field list against the live API.
        article_url = '/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'
        # A trailing slash on the profile URL would otherwise yield "//answers".
        member_api = self.spider_url.rstrip('/').replace(
            "https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
        self.ANSWER_URL = member_api + detail_url
        self.ARTICLE_URL = member_api + article_url

        os.environ["EXECJS_RUNTIME"] = 'Node'
        # Prefer a zhihu.js in the working directory, else the deployed copy.
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')

        self.session = requests.session()
        self.HTMLSession = HTMLSession()

        self.login_req = self._login()

    def _login(self):
        """Post the login form and share the resulting cookies.

        Also stores the request headers on ``self.headers`` for later GETs.

        Returns:
            The response object of the login POST.
        """
        loginUrl = 'https://www.zhihu.com/login/email'

        headers = {
            'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
            "Referer": "http://www.zhihu.com/",
            'Host': 'www.zhihu.com',
            'rememberme': "true"
        }
        self.headers = headers

        # NOTE(review): account credentials are hard-coded; consider moving
        # them to configuration.
        data = {
            'email': 'yousangdandan@yeah.net',
            'password': '5358569'
        }

        # A timeout keeps construction from hanging forever on a dead socket.
        login_req = self.session.post(loginUrl, data=data, headers=headers,
                                      timeout=10)
        print('loginReq:{}'.format(login_req.status_code))
        self.set_cookies(self.session.cookies)
        return login_req

    def set_cookies(self, cookies):
        """Install the same cookie jar on both HTTP sessions."""
        self.HTMLSession.cookies = cookies
        self.session.cookies = cookies

    @staticmethod
    def _decode_captcha_image(data_uri):
        """Return the raw image bytes encoded in a base64 string or data URI.

        The ``data:image/png;base64,`` prefix is removed only when present.
        ``str.strip`` must not be used for this: it strips a *set of
        characters* from both ends and eats the start of the payload.
        """
        payload = data_uri.replace('\n', '').strip()
        prefix = Spider._DATA_URI_PREFIX
        if payload.startswith(prefix):
            payload = payload[len(prefix):]
        return base64.b64decode(payload)

    def appeal(self, url):
        """Solve the anti-crawl captcha, then fetch ``url`` again.

        Downloads the captcha image, recognises it with ``ZhihuCaptcha``,
        posts the answer to the appeal endpoint and re-requests ``url``.

        Args:
            url: The URL whose request was rejected.

        Returns:
            The response of the follow-up GET (redirects are not followed).
        """
        self.captcha_model = ZhihuCaptcha()
        r = self.HTMLSession.get(self._CAPTCHA_APPEAL_URL)
        image_bytes = self._decode_captcha_image(r.json()['img_base64'])
        # The image is kept on disk as before; create the folder if missing.
        os.makedirs('cache', exist_ok=True)
        with open('cache/captcha2.png', 'wb') as f:
            f.write(image_bytes)
        im = Image.open('cache/captcha2.png')
        captcha = self.captcha_model.recgImg(im)
        print(captcha)

        self.HTMLSession.post(
            self._CAPTCHA_APPEAL_URL,
            data=json.dumps({"captcha": captcha}),
            headers={"User-Agent": self.headers['User-Agent'],
                     "referer": 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E7%9B%91%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E7%BD%91%E7%BB%9C%E7%8E%AF%E5%A2%83%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%EF%BC%8C%E4%B8%BA%E4%BF%9D%E8%AF%81%E6%82%A8%E7%9A%84%E6%AD%A3%E5%B8%B8%E8%AE%BF%E9%97%AE%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81%E8%BF%9B%E8%A1%8C%E9%AA%8C%E8%AF%81%E3%80%82&need_login=false',
                     'Content-Type': 'application/json',
                     'x-xsrftoken': self.HTMLSession.cookies._cookies['.zhihu.com']['/']['_xsrf'].value})
        return self.HTMLSession.get(url, allow_redirects=False)

    def retry_get_url(self, url, retrys=5, timeout=10, **kwargs):
        """GET ``url``, retrying when the request raises.

        Sleeps 5 seconds before every attempt. A 403 response triggers the
        captcha appeal flow, whose follow-up response is returned instead.

        Args:
            url: URL to fetch.
            retrys: Maximum number of failed attempts before giving up.
            timeout: Per-request timeout in seconds.
            **kwargs: Passed through to ``session.get``.

        Returns:
            The response object, or None when every attempt raised.
        """
        retry_c = 0
        while retry_c < retrys:
            time.sleep(5)
            try:
                get_resp = self.session.get(url, headers=self.headers, timeout=timeout, **kwargs)
                if get_resp.status_code == 403:
                    get_resp = self.appeal(url)
                return get_resp
            except Exception as e:
                # Retry boundary: network and captcha failures alike are
                # logged and retried.
                retry_c += 1
                print(e)
        print('Failed to get page %s after %d retries, %s'
            % (url, retrys, datetime.now()))
        return None

    def update_page_count(self, answer_count):
        """Set ``page_count`` to the number of pages holding ``answer_count`` items."""
        # Integer ceiling division; avoids the float round-trip of int(a / b).
        self.page_count = (answer_count + self.PAGE_SIZE - 1) // self.PAGE_SIZE

    def check_data_exist(self, data_dict, mark):
        """Tell whether the item is already stored.

        Args:
            data_dict: One API item; only ``data_dict["id"]`` is read.
            mark: 0 looks in ``zhihu_answer``, 1 in ``zhihu_article``.

        Returns:
            True when a row with that id exists, False otherwise (including
            for any other ``mark``).
        """
        target = self._STORAGE.get(mark)
        if target is None:
            return False
        table, id_column = target
        # The id is bound as a parameter; table/column come from _STORAGE.
        select_sql = "select id from {table} where {column} = %s".format(
            table=table, column=id_column)
        self.cur.execute(select_sql, (data_dict["id"],))
        return self.cur.fetchone() is not None

    def parse_sigle_page(self, data_dict, mark):
        """Insert one answer or article unless it is already stored.

        Args:
            data_dict: One item from the API ``data`` list.
            mark: 0 for an answer, 1 for an article.

        Raises:
            ValueError: If ``mark`` is neither 0 nor 1.
            KeyError: If ``data_dict`` lacks a field that is stored.
        """
        if mark not in self._STORAGE:
            raise ValueError('unsupported mark: {!r}'.format(mark))
        if self.check_data_exist(data_dict, mark):
            return

        if mark == 0:
            title = data_dict["question"]["title"]
            into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (title, data_dict["content"], data_dict["id"], data_dict["created_time"],
                      data_dict["comment_count"], data_dict["content"])
        else:
            # Articles carry their own title and have no "question" object.
            title = data_dict["title"]
            into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
            values = (title, data_dict["content"], data_dict["id"], data_dict["created"],
                      data_dict["comment_count"], data_dict["content"])
        print(title)
        self.cur.execute(into, values)
        self.conn.commit()

    def search_page(self, mark, page_max, start_page=0, need_commend=False):
        """Main entry point: crawl up to ``page_max`` pages, then close the DB.

        Args:
            mark: 0 answer, 1 article, 2 thought.
            page_max: Maximum number of pages to request.
            start_page: Used directly as the first API ``offset``.
                NOTE(review): the command-line help calls this a page
                number -- confirm which is intended.
            need_commend: Unused.

        Raises:
            NotImplementedError: For ``mark == 2`` when no
                ``search_thought_page`` method is available.
        """
        offset = start_page
        try:
            for page_index in range(page_max):
                # page_count shrinks to the real total after the first page.
                if page_index > self.page_count - 1:
                    break
                if mark in (0, 1):
                    self.search_answer_article_page(offset, mark, 0)
                elif mark == 2:
                    # This class defines no thought crawler; use one only if
                    # a subclass provides it.
                    crawl_thoughts = getattr(self, 'search_thought_page', None)
                    if crawl_thoughts is None:
                        raise NotImplementedError(
                            'thought crawling (mark=2) is not implemented')
                    crawl_thoughts(offset)

                offset = offset + self.PAGE_SIZE
                time.sleep(10)
        finally:
            # Release the connection even when a page fails.
            self.conn.close()

    def get_page_data(self, url, max_attempts=2):
        """Fetch ``url`` and return its decoded JSON body.

        Args:
            url: API URL to fetch.
            max_attempts: How many times to try before giving up.

        Returns:
            The parsed JSON object, or None when no attempt produced a
            200 response with a JSON body.
        """
        for _ in range(max_attempts):
            get_page = self.retry_get_url(url)
            if get_page is None:
                # retry_get_url already logged the failure.
                continue
            if get_page.status_code != 200:
                print("article_error, url : ", url, " status_code: ", get_page.status_code)
                continue
            try:
                page_dict = get_page.json()
            except ValueError:
                # Raised by Response.json() when the body is not valid JSON.
                print('retry get page data : {}'.format(url))
                continue
            print('get page json data success! {}'.format(url))
            return page_dict
        return None

    def search_answer_article_page(self, offset, mark, proxies_num=0, max_retries=3):
        """Fetch one page of answers or articles and store its items.

        Args:
            offset: API offset of the page.
            mark: 0 for answers, 1 for articles.
            proxies_num: Unused.
            max_retries: Extra fetches allowed when a page has no data.

        Raises:
            ValueError: If ``mark`` is neither 0 nor 1.
        """
        offset = str(offset)
        if mark == 0:
            url = self.ANSWER_URL.format(offset)
        elif mark == 1:
            url = self.ARTICLE_URL.format(offset)
        else:
            raise ValueError('unsupported mark: {!r}'.format(mark))

        # Bounded retry loop; recursing on failure would never terminate for
        # a page that is genuinely empty.
        for _ in range(max_retries + 1):
            page_dict = self.get_page_data(url) or {}
            if page_dict.get("data"):
                break
            print("article_data_error, offset: ", offset, " url: ", url)
            self.use_proxy = True
        else:
            return

        print(self.page_count)
        if self.page_count == self.DEFAULT_PAGE_COUNT:
            # First successful page: replace the placeholder with the real total.
            self.update_page_count(page_dict.get("paging", {}).get("totals", 0))
        for one_line in page_dict['data']:
            try:
                if one_line["content"] is not None:
                    self.parse_sigle_page(one_line, mark)
                    print("finshed_crawler " + offset)
            except KeyError:
                print('page data error')
                continue

        return
    

if __name__ == '__main__':
    # Usage:
    #   python script_file_path mark max_page start_page spider_url
    # Example:
    #   python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
    #
    #   mark        what to crawl: 0 = answers, 1 = articles, 2 = thoughts
    #   max_page    maximum number of pages to fetch
    #   start_page  starting position, counted from 0
    #   spider_url  profile URL of the Zhihu member to crawl
    if len(sys.argv) < 5:
        sys.exit('usage: python {} mark max_page start_page spider_url'.format(sys.argv[0]))

    mark = int(sys.argv[1])
    max_page = int(sys.argv[2])
    start_page = int(sys.argv[3])
    spider_url = sys.argv[4]

    # Reject an unknown mark before logging in and opening the database.
    if mark not in (0, 1, 2):
        sys.exit('mark must be 0 (answers), 1 (articles) or 2 (thoughts)')

    print(datetime.now())
    spider = Spider(spider_url=spider_url)
    spider.search_page(mark, max_page, start_page)
    print(datetime.now())
