import os
import sys
import re
import time
import pymysql
import requests
import hashlib
import cv2
import execjs
from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
# pip3 install "requests[security]" -i https://pypi.tuna.tsinghua.edu.cn/simple


# Alternative path settings kept for reference (presumably the server
# deployment values -- TODO confirm):
# DATA_OS_PATH = '/data'
# PROJECT_PATH = '/srv/apps/crawler'
# Root directory under which UploadImage places its per-type picture folders.
DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image'
# Project root; UploadImage uses it to locate the fallback copy of zhihu.js.
PROJECT_PATH = '/Users/haowei/workspace/gm/crawler'


class UploadImage(object):
    
    def __init__(self):
        """Connect to MySQL, set up the picture directories and load zhihu.js.

        Side effects: opens a pymysql connection and cursor (stored as
        ``self.conn`` / ``self.cur``), sets the ``EXECJS_RUNTIME``
        environment variable to ``Node`` and compiles the signing script
        into ``self.exec_js``.

        Raises:
            OSError, UnicodeDecodeError: if the script can be read neither
                from ``./zhihu.js`` nor from ``self.JS_FILE_PATH``.
        """
        # NOTE(review): the connection settings, including the password, are
        # hard-coded; consider moving them to configuration.
        HOST = '172.18.51.14'
        PORT = 3306
        USER = 'spider'
        PASSWD = 'Gengmei123'
        DB = 'spider'
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()

        # Download directories (original images) and their "cut" counterparts
        # (cropped images), one pair per content type.
        self.ANSWER_PICTURE_PATH = DATA_OS_PATH + '/answer_picture/'
        self.ARTICLE_PICTURE_PATH = DATA_OS_PATH + '/article_picture/'
        self.THOUGHT_PICTURE_PATH = DATA_OS_PATH + '/thought_picture/'
        self.ANSWER_PICTURE_CUT_PATH = DATA_OS_PATH + '/answer_picture_cut/'
        self.ARTICLE_PICTURE_CUT_PATH = DATA_OS_PATH + '/article_picture_cut/'
        self.THOUGHT_PICTURE_CUT_PATH = DATA_OS_PATH + '/thought_picture_cut/'
        self.JS_FILE_PATH = PROJECT_PATH + '/crawler_sys/site_crawler/zhihu.js'

        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (OSError, UnicodeDecodeError):
            # Only a failed read of the local copy triggers the fallback to
            # the project copy; a bare ``except`` here would also swallow
            # KeyboardInterrupt / SystemExit.
            with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        # self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.exec_js = execjs.compile(js)
    
    def get_serach_page_cookies(self):
        '''
            cookies更新
        '''
        
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
            "referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
        }
        requests_res = self.retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()
    
    def headers_handle(self, url):
        '''
            url请求中的头部伪装
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            
            "accept": "*/*",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
            "x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
            "x-api-version": "3.0.91",
            "x-app-za": "OS=Web",
            "x-requested-with": "fetch",
            "x-zse-83": "3_2.0",
            "x-zse-86": None,
            "referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
            
        }
        cookies_dict = {
            "d_c0": '"AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"',
            "KLBRSID": None
        }
        
        cookies_dict.update(res_cookies_dict)
        
        f = "+".join(
            ["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict
    
    @staticmethod
    def retry_get_url(url, retrys=5, proxies=None, timeout=10, **kwargs):
        retry_c = 0
        while retry_c < retrys:
            try:
                requests.packages.urllib3.disable_warnings()
                get_resp = requests.get(url, verify=False, timeout=timeout, **kwargs)
                return get_resp
            except Exception as e:
                retry_c += 1
                time.sleep(2)
                print(e)
        print('Failed to get page %s after %d retries, %s'
              % (url, retrys, datetime.now()))
        return None
    
    @staticmethod
    def upload_image_with_path(path, img_type=IMG_TYPE.TOPICIMAGE):
        """Upload the image file at ``path`` and return its new URL.

        Intended for images that do not originate from the own site.

        Args:
            path: local path of the file to upload.
            img_type: image type passed through to ``upload_file``.

        Returns:
            The URL returned by ``upload_file``, or ``None`` if the upload
            raised an exception (the error is printed).
        """
        try:
            url = upload_file(file_path=path, img_type=img_type)
        except Exception as e:
            # Best-effort upload: report the cause instead of hiding it, but
            # keep returning None so callers continue with the next image.
            print('upload .....  error', e)
            return None
        print('upload .....  ', url)
        return url
    
    def picture_download_and_cut(self, path, new_path, table, key_id, content_id, content):
        '''
            文章图片剪切和下载
        '''
        
        def _deal_image_by_path(res, file_path, old_url, i):
            img = cv2.imread(file_path)
            if img is not None:
                high, width = img.shape[:2]
                cropped = img[0:int(high / 10 * 9), 0:width]
                pathes = new_path + "num" + str(i) + ".jpg"
                cv2.imwrite(pathes, cropped)

                new_url = self.upload_image_with_path(pathes)
                sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
                    table, str(new_url), str(res[i][1]))
                self.cur.execute(sql)
                self.conn.commit()
            else:
                print('image open error : ', file_path)
                _upload_picture(file_path, old_url)

        def _upload_picture(file_path, old_url):
            new_url = self.upload_image_with_path(file_path)

            sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
                table, str(new_url), str(old_url))
            self.cur.execute(sql)
            self.conn.commit()

        def _download_picture():
            sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
            self.cur.execute(sql)
            res = self.cur.fetchall()
            self.conn.commit()
            gif_patt = r'gif'
            for i in range(len(res)):
                mark = re.search(gif_patt, res[i][1])
                url = res[i][1]
                [headers_search, cookies_dict] = self.headers_handle(url)
                r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
                if not r:
                    continue
                # try:
                if mark:
                    pathes = path + str('num') + str(i) + '.gif'
                    with open(pathes, 'wb') as f:  # 打开写入到path路径里-二进制文件，返回的句柄名为f
                        f.write(r.content)  # 往f里写入r对象的二进制文件
                    f.close()
                    new_url = self.upload_image_with_path(pathes)
                    
                    sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
                        table, str(new_url), str(url))
                    self.cur.execute(sql)
                    self.conn.commit()
                else:
                    pathes = path + str('num') + str(i) + '.jpg'
                    with open(pathes, 'wb') as f:  # 打开写入到path路径里-二进制文件，返回的句柄名为f
                        f.write(r.content)  # 往f里写入r对象的二进制文件
                    f.close()
                    _deal_image_by_path(res, pathes, url, i)
                # except Exception as e:
                #     print(e)

        urls = self.find_all_url(content)
        self.insert_picture_urls(table, urls, content_id, key_id)
        _download_picture()
        _download_picture()
    
    def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
        content_dict = self.gets_content_dict(table, key_id, offset, count)
        
        for content_id, content in content_dict.items():
            self.picture_download_and_cut(path, new_path, pic_table, key_id, content_id, content)
    
    def insert_picture_urls(self, table, urls, content_id, key_id):
        
        def _delete_repeat_url(instance, columns):
            print(columns)
            sql = """delete from {} where id in ({})""".format(table, ','.join([str(item) for item in columns]))
            
            instance.cur.execute(sql)
            instance.conn.commit()
        
        def _url_exist(instance, url_):
            sql = """select id from {} where {} = {} and url = '{}'""".format(table, key_id, content_id, url_)
            instance.cur.execute(sql)
            res = instance.cur.fetchall()
            instance.conn.commit()
            if res:
                res = [item[0] for item in res]
                # if len(res) > 1:
                #     _delete_repeat_url(instance, res[1:])
                return False
            return True
        
        values = []
        for url in urls:
            if not _url_exist(self, url):
                continue
            values.append("({}, '{}')".format(content_id, url))
        
        if values:
            into = """insert into {} (answer_id, url) values {}""".format(table, ','.join(values))
            print(into)
            self.cur.execute(into)
            self.conn.commit()
    
    def find_all_url(self, content):
        """Return the distinct image URLs referenced in ``content``.

        ``<figure>`` elements are first flattened to plain ``<img>`` tags
        via ``replace_html_image_to_url``; the ``src`` of every ``<img>`` is
        then collected.

        Returns:
            list: unique ``src`` values in document order. Tags without a
            ``src`` are skipped so that no ``None`` entry reaches the
            database.
        """
        new_content = self.replace_html_image_to_url(content)
        rich_obj = BeautifulSoup(new_content, features="html.parser")
        urls = []
        for item in rich_obj.find_all("img"):
            src = item.get('src')
            print(src)
            if src:
                urls.append(src)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(urls))

    @staticmethod
    def replace_html_image_to_url(content):
        """Replace every ``<figure>`` in ``content`` by a bare ``<img>`` tag.

        The new tag carries only the ``src`` of the first ``<img>`` found
        inside the figure (empty string when that image has no ``src``).
        Figures that contain no ``<img>`` are left unchanged.

        Returns:
            str: the rewritten HTML.
        """
        rich_obj = BeautifulSoup(content, features="html.parser")
        for item in rich_obj.find_all("figure"):
            image_obj = item.find("img")
            if image_obj is None:
                # Nothing to extract; avoid AttributeError on None.
                continue
            new_rich_obj = rich_obj.new_tag(name="img")
            new_rich_obj["src"] = image_obj.get("src", "")
            item.replace_with(new_rich_obj)
        return rich_obj.decode()
    
    def gets_content_dict(self, table, key_id, offset=0, count=10):
        sql = """select {}, content from {} limit {}, {}""".format(key_id, table, offset, count)
        self.cur.execute(sql)
        res = self.cur.fetchall()
        self.conn.commit()
        return {item[0]: item[1] for item in res}
    
    
if __name__ == '__main__':
    # Usage: python file_name [mark] [offset] [count]
    #   mark   -- content type: 0 = answers, 1 = articles, 2 = thoughts
    #   offset -- first row to process
    #   count  -- number of rows to process
    # Missing arguments fall back to 0 / 0 / 10 instead of raising
    # IndexError. The trailing ``or`` is kept from the original, so an
    # explicit count of 0 still means "use the default of 10".
    cli_args = sys.argv[1:]
    mark = (int(cli_args[0]) if len(cli_args) > 0 else 0) or 0
    offset = (int(cli_args[1]) if len(cli_args) > 1 else 0) or 0
    count = (int(cli_args[2]) if len(cli_args) > 2 else 10) or 10
    print(datetime.now())
    a = UploadImage()
    # mark -> (download dir, cropped dir, content table, picture table, id column)
    jobs = {
        0: (a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH,
            'zhihu_answer', 'zhihu_answer_picture_url', 'answer_id'),
        1: (a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH,
            'zhihu_article', 'zhihu_article_picture_url', 'article_id'),
        2: (a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH,
            'zhihu_thought', 'zhihu_thought_picture_url', 'thought_id'),
    }
    if mark in jobs:
        a.picture_process(*jobs[mark], offset, count)
    else:
        # Previously an unknown mark silently did nothing.
        print('unknown mark %s, expected 0, 1 or 2' % mark)
    print(datetime.now())
