Commit 9bde750f authored by haowang

modify zhihu spider

parent 7c2c462d
......@@ -24,6 +24,7 @@ class RefreshContent(object):
初始化数据库,调整js规则
'''
self.update_error_content_id = []
self.update_error_url_content_id = {}
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
......@@ -53,49 +54,75 @@ class RefreshContent(object):
def create_new_content(self, content_id, content, pic_dict):
    """Rewrite every ``<img src>`` in *content* to its re-hosted URL.

    Args:
        content_id: Id of the row being rewritten; recorded in the error
            trackers when one of its images cannot be mapped.
        content: HTML of the row; first normalised by
            ``replace_html_image_to_url``.
        pic_dict: Maps an original image URL with its first 23 characters
            removed to the replacement URL.

    Returns:
        ``(html, update_error)``: the rewritten HTML passed through
        ``escape_string``, and True when at least one image had no entry
        in *pic_dict*.
    """
    content = self.replace_html_image_to_url(content)
    rich_obj = BeautifulSoup(content, features="html.parser")
    update_error = False
    for item in rich_obj.find_all("img"):
        src = item.get("src")
        if not src:
            # An <img> without src has nothing to rewrite, and slicing
            # None would raise TypeError.
            continue
        # Drop the first 23 characters so the key matches pic_dict
        # (presumably the scheme + image-host prefix -- TODO confirm).
        url = src[23:]
        new_url = pic_dict.get(url)
        if not new_url:
            # Remember each failing content id once, plus the last id seen
            # for every unmapped URL.
            if content_id not in self.update_error_content_id:
                self.update_error_content_id.append(content_id)
            self.update_error_url_content_id[url] = content_id
            print({content_id: url})
            update_error = True
            continue
        item['src'] = new_url + '-w'
    return escape_string(rich_obj.decode()), update_error
def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
    """Return the ids of the rows whose content should be refreshed.

    Args:
        table: Content table, read when paging (``offset`` != 0).
        pic_table: Picture-URL table, read when ``offset`` is 0.
        key_id: Name of the id column present in both tables.
        offset: 0 selects every distinct id found in ``pic_table``; any
            other value selects one page from ``table`` starting there.
        count: Page size used when ``offset`` is not 0.

    Returns:
        A list of ids; empty when the query matches no rows.
    """
    if offset == 0:
        sql = """select distinct {} from {}""".format(key_id, pic_table)
    else:
        # Select the caller's key column (a hard-coded ``answer_id`` would
        # break every non-answer table).
        # NOTE: there is no ORDER BY, so page boundaries follow whatever
        # row order the server returns.
        sql = """select {} from {} limit {}, {}""".format(
            key_id, table, int(offset), int(count))
    print(sql)
    self.cur.execute(sql)
    res = self.cur.fetchall()
    self.conn.commit()
    # Always hand back a list so callers can iterate without a None check.
    return [item[0] for item in res or ()]
def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
    """Replace image URLs in stored content and persist the result.

    For every id returned by ``get_all_content_ids`` whose row still has
    ``is_new = 0``, the content is rewritten with the url -> new_url pairs
    stored in ``pic_table`` and written back to ``new_content``.
    ``is_new`` becomes 1 only when every image could be mapped.

    Args:
        table: Content table to read from and update.
        pic_table: Table holding ``url`` / ``new_url`` pairs per content id.
        key_id: Name of the id column shared by both tables.
        offset: Forwarded to ``get_all_content_ids``.
        count: Forwarded to ``get_all_content_ids``.
    """
    content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count)
    # The id lookup may yield None when nothing matches; treat it as empty.
    for content_id in content_ids or []:
        print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
        # Bind the id as a query parameter so string ids (thought_id is a
        # varchar column) are quoted correctly by the driver.
        sql = """select content from {} where {} = %s and is_new = 0""".format(table, key_id)
        print(sql, content_id)
        self.cur.execute(sql, (content_id,))
        rows = self.cur.fetchall()
        self.conn.commit()
        if not rows:
            # Already refreshed, or no such row.
            continue
        content = rows[0][0]

        sql = """select url, new_url from {} where {} = %s and new_url is not null""".format(
            pic_table, key_id)
        self.cur.execute(sql, (content_id,))
        rows = self.cur.fetchall()
        self.conn.commit()
        # Keys drop the first 23 characters of the original URL, matching
        # the lookup done in create_new_content.
        pic_dict = {old_url[23:]: new_url for old_url, new_url in rows}

        new_content, update_error = self.create_new_content(content_id, content, pic_dict)
        update_code = 0 if update_error else 1
        # new_content comes back already passed through escape_string, so
        # it is interpolated as-is rather than bound as a parameter.
        sql = """update {} set new_content = '{}', is_new = {} WHERE {} = '{}' """.format(
            table, new_content, update_code, key_id, content_id)
        self.cur.execute(sql)
        self.conn.commit()
        print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
......@@ -104,16 +131,19 @@ class RefreshContent(object):
if __name__ == '__main__':
    # Usage: python script_file mark [offset count]
    #   mark:   0 = answers, 1 = articles, 2 = thoughts
    #   offset: paging start for answers (default 0)
    #   count:  page size for answers (default 10)
    print('参数个数为:', len(sys.argv), '个参数。')
    if len(sys.argv) < 2:
        sys.exit('usage: python script_file mark [offset count]')
    # Report only the arguments actually supplied instead of indexing
    # positions that may not exist.
    shown = []
    for arg in sys.argv[:4]:
        shown.extend((type(arg), arg))
    print('参数列表:', *shown)
    mark = int(sys.argv[1])
    offset = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    count = int(sys.argv[3]) if len(sys.argv) > 3 else 10
    print(datetime.now())
    refresh = RefreshContent()
    if mark == 0:
        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
    elif mark == 1:
        refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
    elif mark == 2:
        refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
    # Ids and URLs whose images could not be mapped during this run.
    print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
    print('update_error_content_ids : ', refresh.update_error_content_id)
    print(datetime.now())
This diff is collapsed.
......@@ -10,6 +10,7 @@ import execjs
from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
# pip3 install "requests[security]" -i https://pypi.tuna.tsinghua.edu.cn/simple
# DATA_OS_PATH = '/data'
......@@ -109,15 +110,16 @@ class UploadImage(object):
return headers_search, cookies_dict
@staticmethod
def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
def retry_get_url(url, retrys=5, proxies=None, timeout=10, **kwargs):
retry_c = 0
while retry_c < retrys:
try:
get_resp = requests.get(url, timeout=timeout, **kwargs)
requests.packages.urllib3.disable_warnings()
get_resp = requests.get(url, verify=False, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
time.sleep(2)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
......@@ -142,9 +144,9 @@ class UploadImage(object):
文章图片剪切和下载
'''
def _deal_image_by_path(file_path, old_url):
def _deal_image_by_path(res, file_path, old_url, i):
img = cv2.imread(file_path)
if img:
if img is not None:
high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width]
pathes = new_path + "num" + str(i) + ".jpg"
......@@ -152,7 +154,7 @@ class UploadImage(object):
new_url = self.upload_image_with_path(pathes)
sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
table, str(new_url), str(tuple[i][1]))
table, str(new_url), str(res[i][1]))
self.cur.execute(sql)
self.conn.commit()
else:
......@@ -167,36 +169,44 @@ class UploadImage(object):
self.cur.execute(sql)
self.conn.commit()
def _download_picture():
    """Download each picture of the current row that has no ``new_url`` yet.

    GIFs are saved and uploaded unchanged, and their ``new_url`` is stored
    here. Every other image is saved as ``.jpg`` and handed to
    ``_deal_image_by_path``. Uses ``self``, ``key_id``, ``table``,
    ``content_id`` and ``path`` from the enclosing scope.
    """
    # Values are bound as query parameters (%s placeholders, as used by
    # MySQL DB-API drivers such as the pymysql this project connects with)
    # because the URLs come from scraped content.
    sql = """select {}, url from {} where {} = %s and new_url is null""".format(key_id, table, key_id)
    self.cur.execute(sql, (content_id,))
    res = self.cur.fetchall()
    self.conn.commit()
    for i, row in enumerate(res):
        url = row[1]
        is_gif = 'gif' in url
        headers_search, cookies_dict = self.headers_handle(url)
        r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
        if not r:
            # Nothing usable was fetched; new_url stays NULL for this row.
            continue
        if is_gif:
            pathes = path + 'num' + str(i) + '.gif'
            with open(pathes, 'wb') as f:
                f.write(r.content)
            new_url = self.upload_image_with_path(pathes)
            sql = """UPDATE {} SET new_url = %s WHERE url = %s """.format(table)
            self.cur.execute(sql, (str(new_url), str(url)))
            self.conn.commit()
        else:
            pathes = path + 'num' + str(i) + '.jpg'
            with open(pathes, 'wb') as f:
                f.write(r.content)
            # The helper receives the full result set plus the index of
            # this row.
            _deal_image_by_path(res, pathes, url, i)
urls = self.find_all_url(content)
self.insert_picture_urls(table, urls, content_id, key_id)
sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
gif_patt = r'gif'
for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1])
url = tuple[i][1]
[headers_search, cookies_dict] = self.headers_handle(url)
r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
if mark:
pathes = path + str('num') + str(i) + '.gif'
with open(pathes, 'wb') as f: # 打开写入到path路径里-二进制文件,返回的句柄名为f
f.write(r.content) # 往f里写入r对象的二进制文件
f.close()
new_url = self.upload_image_with_path(pathes)
sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
table, str(new_url), str(url))
self.cur.execute(sql)
self.conn.commit()
else:
pathes = path + str('num') + str(i) + '.jpg'
with open(pathes, 'wb') as f: # 打开写入到path路径里-二进制文件,返回的句柄名为f
f.write(r.content) # 往f里写入r对象的二进制文件
f.close()
_deal_image_by_path(pathes, url)
_download_picture()
_download_picture()
def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
content_dict = self.gets_content_dict(table, key_id, offset, count)
......
-- Zhihu answers. `content` holds the scraped HTML; `new_content` holds the
-- copy with rewritten image URLs, and `is_new` flags rows already rewritten.
CREATE TABLE `zhihu_answer` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`title` varchar(40) DEFAULT NULL COMMENT '标题',
`content` mediumtext COMMENT '内容',
`answer_id` int(11) DEFAULT NULL COMMENT 'id',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`comment_count` int(11) DEFAULT NULL COMMENT '评论数',
`new_content` mediumtext COMMENT '新内容',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT '是否已更新',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎问答';
-- Migration for databases whose `zhihu_answer` table predates `is_new`.
-- Kept commented out: the CREATE TABLE above already defines the column, so
-- running this on a fresh install fails with a duplicate-column error.
-- alter table `zhihu_answer` add column `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT '是否已更新';
-- Root (top-level) comments on Zhihu answers; `answerid` is the id of the
-- answer the comment belongs to.
CREATE TABLE `zhihu_answer_root_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`root_comment_id` int(11) DEFAULT NULL COMMENT '父评论id',
`author_name` varchar(40) DEFAULT NULL COMMENT '作者名',
`content` mediumtext COMMENT '内容',
`answerid` int(11) DEFAULT NULL COMMENT '回答ID',
`child_comment_count` int(11) DEFAULT NULL COMMENT '子评论数量',
`featured` varchar(5) DEFAULT NULL COMMENT '是否精彩评论',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`author_id` varchar(50) DEFAULT NULL COMMENT '作者id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎问答父评论';
-- Zhihu articles. Same layout as `zhihu_answer`, keyed by `article_id`:
-- `content` is the original text, `new_content` the rewritten copy and
-- `is_new` the "already updated" flag.
CREATE TABLE `zhihu_article` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`title` varchar(40) DEFAULT NULL COMMENT '标题',
`content` mediumtext COMMENT '内容',
`article_id` int(11) DEFAULT NULL COMMENT 'id',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`comment_count` int(11) DEFAULT NULL COMMENT '评论数量',
`new_content` mediumtext COMMENT '新内容',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT '是否已更新',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎文章';
-- Child (reply) comments; `root_comment_id` is the id of the parent comment
-- and `reply_name` the name of the replier.
CREATE TABLE `zhihu_child_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`root_comment_id` int(11) DEFAULT NULL COMMENT '父评论id',
`author_name` varchar(40) DEFAULT NULL COMMENT '作者名',
`content` mediumtext COMMENT '内容',
`reply_name` varchar(40) DEFAULT NULL COMMENT '回复者名字',
`child_comment_id` int(11) DEFAULT NULL COMMENT '子评论id',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`author_id` varchar(50) DEFAULT NULL COMMENT '作者id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎子评论';
-- Root (top-level) comments on Zhihu articles.
-- NOTE: despite its name, `answerid` stores the article id here (see the
-- column comment).
CREATE TABLE `zhihu_article_root_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`root_comment_id` int(11) DEFAULT NULL COMMENT '父评论id',
`author_name` varchar(40) DEFAULT NULL COMMENT '作者名',
`content` mediumtext COMMENT '内容',
`answerid` int(11) DEFAULT NULL COMMENT '文章id',
`child_comment_count` int(11) DEFAULT NULL COMMENT '子评论数',
`featured` varchar(5) DEFAULT NULL COMMENT '是否精彩评论',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`author_id` varchar(50) DEFAULT NULL COMMENT '作者id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎文章评论';
-- Pictures referenced by Zhihu answers: `url` is the original image URL and
-- `new_url` its replacement, left NULL until one has been stored.
CREATE TABLE `zhihu_answer_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`answer_id` int(11) DEFAULT NULL COMMENT '问答id',
`url` mediumtext COMMENT 'url',
`new_url` mediumtext COMMENT '新url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎问答图片';
-- Pictures referenced by Zhihu articles: `url` is the original image URL and
-- `new_url` its replacement, left NULL until one has been stored.
CREATE TABLE `zhihu_article_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`article_id` int(11) DEFAULT NULL COMMENT '文章id',
`url` mediumtext COMMENT 'url',
`new_url` mediumtext COMMENT '新url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎文章图片';
-- Zhihu "thoughts". Unlike answers and articles, the key `thought_id` is a
-- string (varchar) and there is no title column.
CREATE TABLE `zhihu_thought` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`thought_id` varchar(50) DEFAULT NULL COMMENT '想法id',
`content` text COMMENT '内容',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`comment_count` int(11) DEFAULT NULL COMMENT '评论数量',
`new_content` mediumtext COMMENT '新内容',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT '是否已更新',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎想法';
-- Comments on Zhihu thoughts.
-- NOTE: despite its name, `answerid` stores the (string) thought id here.
-- The `author_id` column comment now reads "author id", matching the other
-- comment tables; it previously repeated the "author name" label.
CREATE TABLE `zhihu_thought_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`thought_comment_id` int(11) DEFAULT NULL COMMENT '想法评论id',
`author_name` varchar(40) DEFAULT NULL COMMENT '作者名',
`content` text COMMENT '内容',
`answerid` varchar(50) DEFAULT NULL COMMENT '想法id',
`created_time` int(11) DEFAULT NULL COMMENT '创建时间',
`author_id` varchar(50) DEFAULT NULL COMMENT '作者id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎想法评论';
-- Pictures referenced by Zhihu thoughts: `url` is the original image URL and
-- `new_url` its replacement. Keyed by the string `thought_id`.
CREATE TABLE `zhihu_thought_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`thought_id` varchar(50) DEFAULT NULL COMMENT '想法ID',
`url` text COMMENT 'url',
`new_url` text COMMENT 'new url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='知乎想法图片';
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment