Commit 8a17245b authored by haowang's avatar haowang

modify

parent aff367ce
......@@ -24,94 +24,96 @@ from pymysql import escape_string
class Crawler_zhihu():
#初始化数据库,调整js规则
def __init__(self):
'''
初始化数据库,调整js规则
'''
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1',
db='mimas_dev', charset='utf8')
self.cur = self.conn.cursor()
self.cur.execute("drop table if exists zhihu_answer")
sql = """create table zhihu_answer(title char(40),
content text(59999),
id int,
created_time int,
comment_count int)"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_article")
sql = """create table zhihu_article(title char(40),
content text(59999),
id int,
created_time int,
comment_count int)"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_answer_root_comment")
sql = """create table zhihu_answer_root_comment(root_comment_id int,
author_name char(40),
content text(59999),
answerid int,
child_comment_count int,
featured char(5),
created_time int,
author_id char(50))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_child_comment")
sql = """create table zhihu_child_comment(root_comment_id int,
author_name char(40),
content text(59999),
reply_name char(40),
child_comment_id int,
created_time int,
author_id char(50))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_article_root_comment")
sql = """create table zhihu_article_root_comment(root_comment_id int,
author_name char(40),
content text(59999),
answerid int,
child_comment_count int,
featured char(5),
created_time int,
author_id char(50))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_answer_picture_url")
sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_article_picture_url")
sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_thought")
sql = """create table zhihu_thought(id char(50),
content text(59999),
created_time int,
comment_count int)"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_thought_comment")
sql = """create table zhihu_thought_comment(thought_comment_id int,
author_name char(40),
content text(59999),
answerid char(50),
created_time int,
author_id char(50))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_thought_picture_url")
sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
self.cur.execute(sql)
self.conn.commit()
# self.cur.execute("drop table if exists zhihu_answer")
# sql = """create table zhihu_answer(title char(40),
# content text(59999),
# id int,
# created_time int,
# comment_count int)"""
# self.cur.execute(sql)
# self.conn.commit()
# self.cur.execute("drop table if exists zhihu_article")
# sql = """create table zhihu_article(title char(40),
# content text(59999),
# id int,
# created_time int,
# comment_count int)"""
# self.cur.execute(sql)
# self.conn.commit()
# self.cur.execute("drop table if exists zhihu_answer_root_comment")
# sql = """create table zhihu_answer_root_comment(root_comment_id int,
# author_name char(40),
# content text(59999),
# answerid int,
# child_comment_count int,
# featured char(5),
# created_time int,
# author_id char(50))"""
# self.cur.execute(sql)
# self.conn.commit()
# self.cur.execute("drop table if exists zhihu_child_comment")
# sql = """create table zhihu_child_comment(root_comment_id int,
# author_name char(40),
# content text(59999),
# reply_name char(40),
# child_comment_id int,
# created_time int,
# author_id char(50))"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_article_root_comment")
# sql = """create table zhihu_article_root_comment(root_comment_id int,
# author_name char(40),
# content text(59999),
# answerid int,
# child_comment_count int,
# featured char(5),
# created_time int,
# author_id char(50))"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_answer_picture_url")
# sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_article_picture_url")
# sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_thought")
# sql = """create table zhihu_thought(id char(50),
# content text(59999),
# created_time int,
# comment_count int)"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_thought_comment")
# sql = """create table zhihu_thought_comment(thought_comment_id int,
# author_name char(40),
# content text(59999),
# answerid char(50),
# created_time int,
# author_id char(50))"""
# self.cur.execute(sql)
# self.conn.commit()
#
# self.cur.execute("drop table if exists zhihu_thought_picture_url")
# sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
# self.cur.execute(sql)
# self.conn.commit()
os.environ["EXECJS_RUNTIME"] = 'Node'
try:
......@@ -123,8 +125,11 @@ class Crawler_zhihu():
# print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
#cookies更新
def get_serach_page_cookies(self):
'''
cookies更新
'''
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
......@@ -143,8 +148,10 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict()
#插入主要内容数据和图片的url,寻找评论
def parse_sigle_page(self, data_dict, mark):
'''
插入主要内容数据和图片的url,寻找评论
'''
if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
......@@ -175,8 +182,10 @@ class Crawler_zhihu():
return
#函数主入口
def search_page(self, answer_page_max, article_page_max, thought_page_max):
'''
函数主入口
'''
offset = 0
for i in range(answer_page_max):
......@@ -193,18 +202,18 @@ class Crawler_zhihu():
self.search_thought_page(offset)
offset = offset + 20
self.answer_picture_doneload_and_cut()
self.answer_refresh_content()
self.article_picture_doneload_and_cut()
self.article_refresh_content()
self.answer_data_complex()
# self.answer_picture_doneload_and_cut()
# self.answer_refresh_content()
# self.article_picture_doneload_and_cut()
# self.article_refresh_content()
# self.answer_data_complex()
self.conn.close()
return
#实现文章和回答的数据包请求
def search_answer_article_page(self, offset, mark, proxies_num=0):
'''
实现文章和回答的数据包请求
'''
offset = str(offset)
if mark == 0:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
......@@ -236,8 +245,10 @@ class Crawler_zhihu():
return
#实现父评论的数据包请求
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
'''
实现父评论的数据包请求
'''
offset = str(offset)
answerid = str(answerid)
if mark == 0:
......@@ -270,8 +281,10 @@ class Crawler_zhihu():
return next
#插入父评论相关信息并关联子评论
def root_comment_data(self, data_dict, answerid, mark):
'''
插入父评论相关信息并关联子评论
'''
if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
elif mark == 1:
......@@ -289,8 +302,10 @@ class Crawler_zhihu():
return
#文章和回答的数据包请求
def search_child_comment(self, root_comment_id, offset, proxies_num=0):
'''
文章和回答的数据包请求
'''
root_comment_id = str(root_comment_id)
offsets = offset
offset = str(offset)
......@@ -321,8 +336,10 @@ class Crawler_zhihu():
next = 1
return next
#子评论数据插入
def child_comment_data(self, data_dict, root_comment_id):
'''
子评论数据插入
'''
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["name"])
......@@ -331,8 +348,10 @@ class Crawler_zhihu():
return
#url请求中的头部伪装
def headers_handle(self, url):
'''
url请求中的头部伪装
'''
res_cookies_dict = self.get_serach_page_cookies()
headers_search = {
......@@ -364,8 +383,10 @@ class Crawler_zhihu():
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
return headers_search, cookies_dict
#回答图片剪切和下载
def answer_picture_doneload_and_cut(self):
'''
回答图片剪切和下载
'''
sql = """select answer_id, url from zhihu_answer_picture_url"""
self.cur.execute(sql)
tuple = self.cur.fetchall()
......@@ -407,7 +428,6 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
# for picture_deals in picture_deal:
# result = str(list[i])
# result = pattern.findall(result)
......@@ -427,8 +447,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#图片上传并得到新url
def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
'''
图片上传并得到新url
'''
'''非站内图片处理'''
try:
# with open(path, 'rb') as f:
......@@ -443,8 +465,10 @@ class Crawler_zhihu():
print('upload ..... error')
return None
#替换url,更新回答内容
def answer_refresh_content(self):
'''
替换url,更新回答内容
'''
sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
self.cur.execute(sql)
tuple = self.cur.fetchall()
......@@ -464,12 +488,14 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
#文章图片剪切和下载
def article_picture_doneload_and_cut(self):
'''
文章图片剪切和下载
'''
sql = """select article_id, url from zhihu_article_picture_url"""
self.cur.execute(sql)
tuple = self.cur.fetchall()
......@@ -511,7 +537,6 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
# for picture_deals in picture_deal:
# result = str(list[i])
# result = pattern.findall(result)
......@@ -531,8 +556,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#替换url,更新文章内容
def article_refresh_content(self):
'''
替换url,更新文章内容
'''
sql = """select article_id, url, new_url from zhihu_article_picture_url"""
self.cur.execute(sql)
tuple = self.cur.fetchall()
......@@ -552,13 +579,14 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
#想法数据包请求
def search_thought_page(self, offset, proxies_num=0):
'''
想法数据包请求
'''
offset = str(offset)
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
......@@ -584,9 +612,10 @@ class Crawler_zhihu():
return
#想法内容插入
def parse_thought_sigle_page(self, data_dict):
'''
想法内容插入
'''
for one_dict in data_dict["content"]:
if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
......@@ -607,8 +636,10 @@ class Crawler_zhihu():
return
#想法评论数据包请求
def search_thought_comment(self, answerid, offset, proxies_num=0):
'''
想法评论数据包请求
'''
offset = str(offset)
answerid = str(answerid)
url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
......@@ -638,8 +669,10 @@ class Crawler_zhihu():
return next
#想法评论数据插入
def thought_comment_data(self, data_dict, answerid):
'''
想法评论数据插入
'''
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
self.cur.execute(into, values)
......@@ -647,8 +680,10 @@ class Crawler_zhihu():
return
#想法图片剪切和下载
def thought_picture_doneload_and_cut(self):
'''
想法图片剪切和下载
'''
sql = """select thought_id, url from zhihu_thought_picture_url"""
self.cur.execute(sql)
tuple = self.cur.fetchall()
......@@ -690,7 +725,6 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
# for picture_deals in picture_deal:
# result = str(list[i])
# result = pattern.findall(result)
......@@ -710,8 +744,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#封装回答最终数据结果格式
def answer_data_complex(self):
'''
封装回答最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
topics = []
......@@ -754,8 +790,10 @@ class Crawler_zhihu():
)
return topics
#封装文章最终数据结果格式
def article_data_complex(self):
'''
封装文章最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
topics = []
......@@ -798,8 +836,10 @@ class Crawler_zhihu():
)
return topics
#封装回答最终数据结果格式
def thought_data_complex(self):
'''
封装回答最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
topics = []
......@@ -834,6 +874,19 @@ class Crawler_zhihu():
}
)
return topics
def clean_data(self):
    '''
    Post-process stored crawl data: rewrite answer content with the uploaded
    image URLs, then download/crop article images and rewrite article content.

    NOTE(review): this closes ``self.conn`` at the end, so the instance cannot
    run any further DB work (e.g. ``complex_data``) afterwards — confirm this
    single-use lifecycle is intended.
    '''
    # Run the cleanup steps in their required order.
    cleanup_steps = (
        self.answer_refresh_content,
        self.article_picture_doneload_and_cut,
        self.article_refresh_content,
    )
    for step in cleanup_steps:
        step()
    self.conn.close()
    return
def complex_data(self):
    '''
    Package the crawled answers, articles and thoughts into their final
    result format, then release the database connection.

    NOTE(review): the lists returned by the ``*_data_complex`` helpers are
    discarded here, and ``self.conn`` is closed — presumably the helpers
    persist/emit their results elsewhere; verify against callers.
    '''
    for build in (
        self.answer_data_complex,
        self.article_data_complex,
        self.thought_data_complex,
    ):
        build()
    self.conn.close()
if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment