Commit 929f9869 authored by haowang

separate zhihu spider

parent 709a4df1
import pymysql
import execjs
import os
import re
from datetime import datetime
from pymysql import escape_string
import sys
# Connection settings for the spider database and the location of the Zhihu
# JS helper. Each value can be overridden through the environment so that
# credentials do not have to live in source control; the literals below are
# kept only as backward-compatible defaults.
# NOTE(review): once deployments supply these variables, drop the default
# password from the source.
HOST = os.environ.get('SPIDER_DB_HOST', '172.18.51.14')
PORT = int(os.environ.get('SPIDER_DB_PORT', 3306))
USER = os.environ.get('SPIDER_DB_USER', 'spider')
PASSWD = os.environ.get('SPIDER_DB_PASSWD', 'Gengmei123')
DB = os.environ.get('SPIDER_DB_NAME', 'spider')
JS_FILE_PATH = os.environ.get(
    'ZHIHU_JS_FILE_PATH',
    '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js',
)
class RefreshContent(object):
    """Rewrite image URLs embedded in stored ``new_content`` HTML.

    For every row of a picture table (``<key>, url, new_url``) the matching
    content row is loaded, the ``<noscript><img src="<url>" ...</figure>``
    fragment is replaced by one pointing at ``new_url``, and the content is
    written back.
    """

    # Table and column names are interpolated into the SQL text (placeholders
    # cannot stand in for identifiers), so only plain, optionally
    # schema-qualified identifiers are accepted.
    _IDENTIFIER_RE = re.compile(
        r'^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$')

    def __init__(self):
        """Open the database connection and compile the Zhihu JS helper.

        The JS source is read from ``./zhihu.js`` first and from
        ``JS_FILE_PATH`` when the local file cannot be read.
        """
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (OSError, UnicodeError):
            # Local copy missing or unreadable: fall back to the deployed one.
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js)

    @staticmethod
    def _replace_image(content, url, new_url):
        """Return ``content`` with the image fragment for ``url`` repointed.

        Everything from ``<noscript><img src="<url>`` up to the next
        ``</figure>`` (at least one character in between) is replaced by
        ``<noscript><img src="<new_url>"></noscript></figure>``.
        """
        # The URL is matched literally. A '?' in the URL is also allowed to
        # appear as '#' because earlier versions of this job rewrote every
        # '?' in the stored content to '#' before saving it.
        url_pattern = '[?#]'.join(re.escape(part) for part in url.split('?'))
        pattern = '<noscript><img src="' + url_pattern + '(.+?)</figure>'
        replacement = '<noscript><img src="' + new_url + '"></noscript></figure>'
        # A callable replacement keeps backslashes in new_url literal.
        return re.sub(pattern, lambda _match: replacement, content)

    def refresh_content(self, table, pic_table, key_id):
        """Replace image URLs in ``table.new_content`` using ``pic_table``.

        Args:
            table: name of the content table holding ``new_content``.
            pic_table: name of the picture table holding ``url``/``new_url``.
            key_id: name of the key column shared by both tables.

        Raises:
            ValueError: if any of the three names is not a plain identifier.

        Picture rows whose ``new_url`` is NULL, and rows whose content row is
        missing or NULL, are skipped. Each changed content row is committed
        individually.
        """
        for name in (table, pic_table, key_id):
            if not self._IDENTIFIER_RE.match(str(name)):
                raise ValueError('invalid SQL identifier: {!r}'.format(name))

        self.cur.execute(
            "select {}, url, new_url from {}".format(key_id, pic_table))
        picture_rows = self.cur.fetchall()
        self.conn.commit()

        select_sql = "select new_content from {} where {} = %s".format(
            table, key_id)
        update_sql = "update {} set new_content = %s where {} = %s".format(
            table, key_id)

        for find_id, url, new_url in picture_rows:
            if new_url is None:
                # No replacement URL recorded for this picture yet.
                continue
            # Re-read the content every time: one content row can contain
            # several pictures, and earlier iterations may have updated it.
            self.cur.execute(select_sql, (find_id,))
            found = self.cur.fetchall()
            if not found or found[0][0] is None:
                continue
            content = found[0][0]
            new_content = self._replace_image(content, str(url), str(new_url))
            if new_content == content:
                continue
            self.cur.execute(update_sql, (new_content, find_id))
            self.conn.commit()

    def answer_refresh_content(self):
        """Replace image URLs in the content of ``zhihu_answer`` rows."""
        self.refresh_content('zhihu_answer', 'zhihu_answer_picture_url',
                             'answer_id')

    def article_refresh_content(self):
        """Replace image URLs in the content of ``zhihu_article`` rows."""
        self.refresh_content('zhihu_article', 'zhihu_article_picture_url',
                             'article_id')
if __name__ == '__main__':
    # The first command-line argument selects what to refresh:
    # 0 -> answers, 1 -> articles, 2 -> thoughts. Any other value refreshes
    # nothing but still opens the connection and prints both timestamps.
    targets = {
        0: ('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id'),
        1: ('zhihu_article', 'zhihu_article_picture_url', 'article_id'),
        2: ('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id'),
    }
    mark = int(sys.argv[1])
    print(datetime.now())
    refresh = RefreshContent()
    selected = targets.get(mark)
    if selected is not None:
        refresh.refresh_content(*selected)
    print(datetime.now())
This diff is collapsed.
import os import os
import sys
import re import re
import time import time
import pymysql import pymysql
...@@ -10,8 +11,8 @@ from datetime import datetime ...@@ -10,8 +11,8 @@ from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE from image_qiniu import upload_file, IMG_TYPE
DATA_OS_PATH = '/Users/haowei/workspace/gm/crawler/image' DATA_OS_PATH = '/image'
PROJECT_PATH = '/Users/haowei/workspace/gm/crawler' PROJECT_PATH = '/'
class UploadImage(object): class UploadImage(object):
...@@ -43,7 +44,7 @@ class UploadImage(object): ...@@ -43,7 +44,7 @@ class UploadImage(object):
with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f: with open(self.JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read() js = f.read()
# print(js) # print(js)
self.exec_js = execjs.compile(js) self.exec_js = execjs.compile(js, )
def get_serach_page_cookies(self): def get_serach_page_cookies(self):
''' '''
...@@ -133,11 +134,11 @@ class UploadImage(object): ...@@ -133,11 +134,11 @@ class UploadImage(object):
print('upload ..... error') print('upload ..... error')
return None return None
def picture_download_and_cut(self, path, new_path, table, key_id): def picture_download_and_cut(self, path, new_path, table, key_id, offset=0, count=10):
''' '''
文章图片剪切和下载 文章图片剪切和下载
''' '''
sql = """select {}, url from {}""".format(key_id, table) sql = """select {}, url from {} where new_url == '' limit {}, {}""".format(key_id, table, offset, count)
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
...@@ -186,7 +187,15 @@ class UploadImage(object): ...@@ -186,7 +187,15 @@ class UploadImage(object):
if __name__ == '__main__': if __name__ == '__main__':
mark = int(sys.argv[1]) or 0
offset = int(sys.argv[2]) or 0
count = int(sys.argv[3]) or 10
print(datetime.now()) print(datetime.now())
a = UploadImage() a = UploadImage()
a.picture_download_and_cut_process() if mark == 0:
a.picture_download_and_cut(a.ANSWER_PICTURE_PATH, a.ANSWER_PICTURE_CUT_PATH, 'zhihu_answer_picture_url', 'answer_id', offset, count)
if mark == 1:
a.picture_download_and_cut(a.ARTICLE_PICTURE_PATH, a.ARTICLE_PICTURE_CUT_PATH, 'zhihu_article_picture_url', 'article_id', offset, count)
if mark == 2:
a.picture_download_and_cut(a.THOUGHT_PICTURE_PATH, a.THOUGHT_PICTURE_CUT_PATH, 'zhihu_thought_picture_url', 'thought_id', offset, count)
print(datetime.now()) print(datetime.now())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment