# coding=utf-8
import pymysql
import execjs
import os
import re
from datetime import datetime
from pymysql import escape_string
from bs4 import BeautifulSoup
import sys

# MySQL connection settings. Every value can be overridden through the
# environment variable named on its line; the literal is only the fallback
# used when that variable is not set.
# NOTE(review): credentials are committed to source control as defaults --
# consider rotating them and supplying them exclusively via the environment.
HOST = os.environ.get('SPIDER_DB_HOST', '172.18.51.14')
PORT = int(os.environ.get('SPIDER_DB_PORT', 3306))
USER = os.environ.get('SPIDER_DB_USER', 'spider')
PASSWD = os.environ.get('SPIDER_DB_PASSWD', 'Gengmei123')
DB = os.environ.get('SPIDER_DB_NAME', 'spider')
# Fallback location of zhihu.js, read only when ./zhihu.js cannot be opened.
# An alternative location noted by the original author is
# /srv/apps/crawler/crawler_sys/site_crawler/zhihu.js
JS_FILE_PATH = os.environ.get(
    'ZHIHU_JS_FILE_PATH',
    '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js')


class RefreshContent(object):
    """Rewrite image URLs inside stored content rows.

    For every content id, the stored HTML is loaded, each ``<img>`` source is
    looked up in the matching picture table and replaced by its ``new_url``,
    and the result is written back to the ``new_content`` column.

    Attributes:
        update_error_content_id: list of content ids for which at least one
            image had no replacement URL.
        update_error_url_content_id: dict mapping each unresolved image key
            to the content id it was found in.
        conn: the open pymysql connection.
        cur: a cursor on ``conn``.
        exec_js: the compiled zhihu.js context (not used by the methods of
            this class itself).
    """

    # Number of leading characters stripped from an image URL before it is
    # used as a lookup key, applied to both the picture-table URLs and the
    # <img src> values.
    # NOTE(review): 23 equals len("https://pic1.zhimg.com/") -- presumably
    # this drops the scheme and host so mirrors compare equal; confirm.
    _SRC_PREFIX_LEN = 23

    def __init__(self):
        """Load the JS rules, then open the database connection.

        Raises:
            OSError: if neither ``./zhihu.js`` nor ``JS_FILE_PATH`` can be read.
        """
        self.update_error_content_id = []
        self.update_error_url_content_id = {}

        # The JS file is loaded before connecting so that a failure here does
        # not leave an open database connection behind.
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except OSError:
            # No readable local copy: fall back to the configured path.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        # Alternative kept from the original author:
        # execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.exec_js = execjs.compile(js)

        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()

    def close(self):
        """Close the cursor and the database connection."""
        self.cur.close()
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _query(self, sql, args=None):
        """Execute a read query and return every row.

        ``args`` are passed to the driver as query parameters, so values are
        quoted by the driver instead of being formatted into the SQL text.
        """
        self.cur.execute(sql, args)
        rows = self.cur.fetchall()
        # Kept from the original flow, which committed after every read --
        # presumably so that the next read starts a fresh transaction.
        self.conn.commit()
        return rows

    @staticmethod
    def replace_html_image_to_url(content):
        """Replace every ``<figure>`` by a bare ``<img>`` carrying its source.

        Args:
            content: HTML text.

        Returns:
            The HTML text with each ``<figure>`` that contains an ``<img>``
            replaced by a new ``<img src=...>``. Figures without any image
            are left untouched.
        """
        soup = BeautifulSoup(content, features="html.parser")
        for figure in soup.find_all("figure"):
            image = figure.find("img")
            if image is None:
                # Nothing to hoist out of this figure.
                continue
            replacement = soup.new_tag(name="img")
            replacement["src"] = image.get("src", "")
            figure.replace_with(replacement)
        return soup.decode()

    def _rewrite_image_urls(self, content_id, content, pic_dict):
        """Return ``(html, update_error)`` with image sources replaced.

        Images whose key is missing from ``pic_dict`` keep their original
        source and are recorded in the two error collections.
        """
        content = self.replace_html_image_to_url(content)
        soup = BeautifulSoup(content, features="html.parser")
        update_error = False
        for image in soup.find_all("img"):
            # An <img> without src yields an empty key instead of raising.
            url = (image.get("src") or "")[self._SRC_PREFIX_LEN:]
            new_url = pic_dict.get(url)

            if not new_url:
                if content_id not in self.update_error_content_id:
                    self.update_error_content_id.append(content_id)
                self.update_error_url_content_id[url] = content_id
                print({content_id: url})
                update_error = True
                continue
            # NOTE(review): the '-w' suffix is appended to every replacement
            # URL; its meaning is not visible here -- confirm with the
            # image host's conventions.
            image['src'] = new_url + '-w'

        return soup.decode(), update_error

    def create_new_content(self, content_id, content, pic_dict):
        """Build the replacement HTML for one content row.

        Args:
            content_id: id of the row, used only for error bookkeeping.
            content: original HTML text.
            pic_dict: mapping from stripped original image URL to new URL.

        Returns:
            ``(escaped_html, update_error)`` where ``escaped_html`` has been
            passed through ``escape_string`` and ``update_error`` is True when
            at least one image could not be resolved.
        """
        new_content, update_error = self._rewrite_image_urls(content_id, content, pic_dict)
        return escape_string(new_content), update_error

    def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
        """Return the content ids to process, or None when there are none.

        With ``offset == 0`` every distinct ``key_id`` of ``pic_table`` is
        returned and ``count`` is ignored. Otherwise ``count`` ids are read
        from ``table`` starting at ``offset``.
        """
        if offset == 0:
            sql = """select distinct {} from {}""".format(key_id, pic_table)
        else:
            # NOTE: there is no ORDER BY, so MySQL does not guarantee a stable
            # row order across pages.
            sql = """select {} from {} limit {}, {}""".format(
                key_id, table, int(offset), int(count))
        print(sql)
        rows = self._query(sql)
        if rows:
            return [row[0] for row in rows]
        return None

    def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
        """Replace image URLs and store the updated content.

        For each selected id whose row still has ``is_new = 0``, the new HTML
        is written to ``new_content``; ``is_new`` becomes 1 when every image
        was resolved and 0 otherwise.

        Args:
            table: content table name.
            pic_table: picture-URL table name.
            key_id: name of the id column shared by both tables.
            offset: see ``get_all_content_ids``.
            count: see ``get_all_content_ids``.
        """
        # get_all_content_ids returns None when nothing matches.
        content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count) or []

        # Table and column names cannot be bound as parameters, so only they
        # are formatted into the statements; all values are bound.
        select_content = """select content from {} where {} = %s and is_new = 0""".format(
            table, key_id)
        select_pics = """select url, new_url from {} where {} = %s and new_url is not null""".format(
            pic_table, key_id)
        update_content = """update {} set new_content = %s, is_new = %s WHERE {} = %s """.format(
            table, key_id)

        for content_id in content_ids:
            print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())

            print(select_content)
            rows = self._query(select_content, (content_id,))
            if not rows:
                continue

            content = rows[0][0]
            if content is None:
                # A NULL content column cannot be parsed; leave the row as is.
                continue

            pic_rows = self._query(select_pics, (content_id,))
            pic_dict = {
                url[self._SRC_PREFIX_LEN:]: new_url
                for url, new_url in pic_rows if url}

            new_content, update_error = self._rewrite_image_urls(content_id, content, pic_dict)
            update_code = 1 if not update_error else 0

            # The driver escapes the bound HTML, so the unescaped text is
            # passed here rather than the output of create_new_content.
            self.cur.execute(update_content, (new_content, update_code, content_id))
            self.conn.commit()
            print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())


def main(argv=None):
    """Run one refresh pass selected by the command-line arguments.

    Usage: ``python script_file mark offset count``

    ``mark`` selects the tables: 0 = answers, 1 = articles, 2 = thoughts.
    ``offset`` and ``count`` are forwarded only for answers; articles and
    thoughts are always processed with the default arguments.

    Args:
        argv: argument list including the script name; defaults to sys.argv.

    Returns:
        0 on success, 1 when the arguments are missing or invalid.
    """
    argv = sys.argv if argv is None else argv
    usage = 'usage: python script_file <mark: 0|1|2> <offset> <count>'

    print('参数个数为:', len(argv), '个参数。')
    # Validate before touching argv[1..3] or opening a database connection.
    if len(argv) < 4:
        print(usage)
        return 1
    print('参数列表:', type(argv[0]), argv[0], type(argv[1]), argv[1], type(argv[2]), argv[2], type(argv[3]), argv[3])
    try:
        mark = int(argv[1])
        offset = int(argv[2])
        count = int(argv[3])
    except ValueError:
        print(usage)
        return 1

    targets = {
        0: ('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id'),
        1: ('zhihu_article', 'zhihu_article_picture_url', 'article_id'),
        2: ('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id'),
    }
    if mark not in targets:
        print(usage)
        return 1

    print(datetime.now())
    refresh = RefreshContent()
    try:
        if mark == 0:
            refresh.refresh_content(*targets[mark], offset, count)
        else:
            refresh.refresh_content(*targets[mark])

        print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
        print('update_error_content_ids : ', refresh.update_error_content_id)
    finally:
        # Release the cursor and connection even when a row fails.
        refresh.cur.close()
        refresh.conn.close()
    print(datetime.now())
    return 0


if __name__ == '__main__':
    sys.exit(main())
