Commit ea702309 authored by haowang

modify content refresh

parent c606ec23
# coding=utf-8
import pymysql import pymysql
import execjs import execjs
import os import os
import re import re
from datetime import datetime from datetime import datetime
from pymysql import escape_string from pymysql import escape_string
from bs4 import BeautifulSoup
import sys import sys
HOST = '172.18.51.14' HOST = '172.18.51.14'
...@@ -11,7 +13,8 @@ PORT = 3306 ...@@ -11,7 +13,8 @@ PORT = 3306
USER = 'spider' USER = 'spider'
PASSWD = 'Gengmei123' PASSWD = 'Gengmei123'
DB = 'spider' DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js' JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
class RefreshContent(object): class RefreshContent(object):
...@@ -33,113 +36,81 @@ class RefreshContent(object): ...@@ -33,113 +36,81 @@ class RefreshContent(object):
with open(JS_FILE_PATH, 'r', encoding='utf-8') as f: with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
self.exec_js = execjs.compile(js) self.exec_js = execjs.compile(js)
# self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
@staticmethod
def replace_html_image_to_url(content):
    """Collapse every ``<figure>`` in *content* into a bare ``<img>`` tag.

    Each ``<figure>`` element that contains an ``<img>`` is replaced by a
    new ``<img>`` carrying only the ``src`` of the first image found inside
    the figure (an empty string when that image has no ``src``).

    Figures that contain no ``<img>`` at all are left untouched instead of
    raising ``AttributeError``.

    Args:
        content: HTML fragment as a string.

    Returns:
        The rewritten HTML serialized back to a string.
    """
    rich_obj = BeautifulSoup(content, features="html.parser")
    for figure in rich_obj.find_all("figure"):
        image_obj = figure.find("img")
        if image_obj is None:
            # Nothing to hoist out of this figure; keep it as it is.
            continue
        new_img = rich_obj.new_tag(name="img")
        new_img["src"] = image_obj.get("src", "")
        figure.replace_with(new_img)
    return rich_obj.decode()
def create_new_content(self, content, pic_dict):
    """Build the refreshed HTML for one content row.

    The figures in *content* are first flattened to plain ``<img>`` tags
    via ``replace_html_image_to_url``; then every ``<img>`` whose ``src``
    is a key of *pic_dict* gets its ``src`` replaced by the mapped URL
    with the ``-w`` suffix appended.

    Images with no (or an empty) mapping keep their original ``src``;
    previously such an image raised ``TypeError`` on ``None + '-w'``.

    Args:
        content: HTML fragment as a string.
        pic_dict: Mapping of original image URL to replacement URL.

    Returns:
        The rewritten HTML serialized back to a string.
    """
    content = self.replace_html_image_to_url(content)
    rich_obj = BeautifulSoup(content, features="html.parser")
    for img in rich_obj.find_all("img"):
        new_url = pic_dict.get(img.get("src"))
        if not new_url:
            # No replacement known for this picture; leave it alone.
            continue
        img['src'] = new_url + '-w'
    return rich_obj.decode()
@staticmethod
def replace_url_to_new_url(content, url_dict):
    """Swap the ``src`` of every ``<img>`` in *content* using *url_dict*.

    Images whose ``src`` has no (or an empty) entry in *url_dict* keep
    their original ``src``; previously the attribute was overwritten with
    ``None``, which discarded the original URL.

    Args:
        content: HTML fragment as a string.
        url_dict: Mapping of current image URL to replacement URL.

    Returns:
        The rewritten HTML serialized back to a string.
    """
    rich_obj = BeautifulSoup(content, features="html.parser")
    for img in rich_obj.find_all("img"):
        new_url = url_dict.get(img.get("src"))
        if not new_url:
            # Unknown picture: keep whatever URL it already has.
            continue
        img['src'] = new_url
    return rich_obj.decode()
def get_all_content_ids(self, table, key_id):
    """Return the distinct values of column *key_id* in *table*.

    Args:
        table: Name of the table to query (interpolated into the SQL as an
            identifier, so it must come from trusted code, not user input).
        key_id: Name of the id column to select.

    Returns:
        A list with the first column of every fetched row. An empty list
        is returned when the query yields no rows, so callers can iterate
        the result without a ``None`` check.
    """
    sql = """select distinct {} from {}""".format(key_id, table)
    self.cur.execute(sql)
    rows = self.cur.fetchall()
    self.conn.commit()
    return [row[0] for row in rows] if rows else []
def refresh_content(self, table, pic_table, key_id):
    """Replace picture URLs and store the refreshed content.

    For every distinct *key_id* value present in *pic_table*:

    1. load the ``url -> new_url`` pairs whose ``new_url`` is not null,
    2. read the ``content`` column of the matching row in *table*,
    3. build the new HTML with ``create_new_content``,
    4. write it to the ``new_content`` column of that row.

    Values are passed to the driver as query parameters, so HTML that
    contains quotes cannot break the UPDATE statement. Table and column
    names cannot be bound as parameters and are still interpolated; they
    must come from trusted code.

    Args:
        table: Name of the content table to read from and update.
        pic_table: Name of the table holding the picture URL pairs.
        key_id: Name of the id column shared by both tables.
    """
    # Tolerate a None result so an empty picture table is a no-op.
    content_ids = self.get_all_content_ids(pic_table, key_id) or []

    select_pics = (
        """select url, new_url from {} where {} = %s and new_url is not null"""
    ).format(pic_table, key_id)
    select_content = """select content from {} where {} = %s""".format(table, key_id)
    update_content = """update {} set new_content = %s WHERE {} = %s """.format(table, key_id)

    for content_id in content_ids:
        print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())

        self.cur.execute(select_pics, (content_id,))
        res = self.cur.fetchall()
        self.conn.commit()
        pic_dict = {item[0]: item[1] for item in res}

        self.cur.execute(select_content, (content_id,))
        res = self.cur.fetchall()
        self.conn.commit()
        if not res or res[0][0] is None:
            # Pictures exist for an id that has no stored content; skip it
            # instead of failing on res[0][0].
            print('skip table: {}, content_id: {}, no content found'.format(table, content_id))
            continue

        new_content = self.create_new_content(res[0][0], pic_dict)

        self.cur.execute(update_content, (new_content, content_id))
        self.conn.commit()
        print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
if __name__ == '__main__': if __name__ == '__main__':
# print('参数个数为:', len(sys.argv), '个参数。') """ python script_file mark """
# print('参数列表:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1]) print('参数个数为:', len(sys.argv), '个参数。')
print('参数列表:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1])
mark = int(sys.argv[1]) mark = int(sys.argv[1])
print(datetime.now()) print(datetime.now())
refresh = RefreshContent() refresh = RefreshContent()
......
...@@ -98,17 +98,17 @@ class Spider(object): ...@@ -98,17 +98,17 @@ class Spider(object):
next = self.search_root_comment(data_dict["id"], offset, mark) next = self.search_root_comment(data_dict["id"], offset, mark)
offset = offset + 20 offset = offset + 20
patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption") # patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
pattern = re.compile(patt) # pattern = re.compile(patt)
result = pattern.findall(data_dict["content"]) # result = pattern.findall(data_dict["content"])
for results in result: # for results in result:
if mark == 0: # if mark == 0:
into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)" # into = "insert into zhihu_answer_picture_url(answer_id, url) value(%s, %s)"
elif mark == 1: # elif mark == 1:
into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)" # into = "insert into zhihu_article_picture_url(article_id, url) value(%s, %s)"
values = (data_dict["id"], results) # values = (data_dict["id"], results)
self.cur.execute(into, values) # self.cur.execute(into, values)
self.conn.commit() # self.conn.commit()
return return
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment