Commit 8a17245b authored by haowang

modify

parent aff367ce
...@@ -24,94 +24,96 @@ from pymysql import escape_string ...@@ -24,94 +24,96 @@ from pymysql import escape_string
class Crawler_zhihu(): class Crawler_zhihu():
#初始化数据库,调整js规则
def __init__(self): def __init__(self):
'''
初始化数据库,调整js规则
'''
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work', self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1', passwd='Gengmei1',
db='mimas_dev', charset='utf8') db='mimas_dev', charset='utf8')
self.cur = self.conn.cursor() self.cur = self.conn.cursor()
self.cur.execute("drop table if exists zhihu_answer") # self.cur.execute("drop table if exists zhihu_answer")
sql = """create table zhihu_answer(title char(40), # sql = """create table zhihu_answer(title char(40),
content text(59999), # content text(59999),
id int, # id int,
created_time int, # created_time int,
comment_count int)""" # comment_count int)"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
self.cur.execute("drop table if exists zhihu_article") # self.cur.execute("drop table if exists zhihu_article")
sql = """create table zhihu_article(title char(40), # sql = """create table zhihu_article(title char(40),
content text(59999), # content text(59999),
id int, # id int,
created_time int, # created_time int,
comment_count int)""" # comment_count int)"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
self.cur.execute("drop table if exists zhihu_answer_root_comment") # self.cur.execute("drop table if exists zhihu_answer_root_comment")
sql = """create table zhihu_answer_root_comment(root_comment_id int, # sql = """create table zhihu_answer_root_comment(root_comment_id int,
author_name char(40), # author_name char(40),
content text(59999), # content text(59999),
answerid int, # answerid int,
child_comment_count int, # child_comment_count int,
featured char(5), # featured char(5),
created_time int, # created_time int,
author_id char(50))""" # author_id char(50))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
self.cur.execute("drop table if exists zhihu_child_comment") # self.cur.execute("drop table if exists zhihu_child_comment")
sql = """create table zhihu_child_comment(root_comment_id int, # sql = """create table zhihu_child_comment(root_comment_id int,
author_name char(40), # author_name char(40),
content text(59999), # content text(59999),
reply_name char(40), # reply_name char(40),
child_comment_id int, # child_comment_id int,
created_time int, # created_time int,
author_id char(50))""" # author_id char(50))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_article_root_comment") # self.cur.execute("drop table if exists zhihu_article_root_comment")
sql = """create table zhihu_article_root_comment(root_comment_id int, # sql = """create table zhihu_article_root_comment(root_comment_id int,
author_name char(40), # author_name char(40),
content text(59999), # content text(59999),
answerid int, # answerid int,
child_comment_count int, # child_comment_count int,
featured char(5), # featured char(5),
created_time int, # created_time int,
author_id char(50))""" # author_id char(50))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_answer_picture_url") # self.cur.execute("drop table if exists zhihu_answer_picture_url")
sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))""" # sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_article_picture_url") # self.cur.execute("drop table if exists zhihu_article_picture_url")
sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))""" # sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_thought") # self.cur.execute("drop table if exists zhihu_thought")
sql = """create table zhihu_thought(id char(50), # sql = """create table zhihu_thought(id char(50),
content text(59999), # content text(59999),
created_time int, # created_time int,
comment_count int)""" # comment_count int)"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_thought_comment") # self.cur.execute("drop table if exists zhihu_thought_comment")
sql = """create table zhihu_thought_comment(thought_comment_id int, # sql = """create table zhihu_thought_comment(thought_comment_id int,
author_name char(40), # author_name char(40),
content text(59999), # content text(59999),
answerid char(50), # answerid char(50),
created_time int, # created_time int,
author_id char(50))""" # author_id char(50))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
#
self.cur.execute("drop table if exists zhihu_thought_picture_url") # self.cur.execute("drop table if exists zhihu_thought_picture_url")
sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))""" # sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
self.cur.execute(sql) # self.cur.execute(sql)
self.conn.commit() # self.conn.commit()
os.environ["EXECJS_RUNTIME"] = 'Node' os.environ["EXECJS_RUNTIME"] = 'Node'
try: try:
...@@ -123,8 +125,11 @@ class Crawler_zhihu(): ...@@ -123,8 +125,11 @@ class Crawler_zhihu():
# print(js) # print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules') self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
#cookies更新
def get_serach_page_cookies(self): def get_serach_page_cookies(self):
'''
cookies更新
'''
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1" url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
...@@ -143,8 +148,10 @@ class Crawler_zhihu(): ...@@ -143,8 +148,10 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers) requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict() return requests_res.cookies.get_dict()
#插入主要内容数据和图片的url,寻找评论
def parse_sigle_page(self, data_dict, mark): def parse_sigle_page(self, data_dict, mark):
'''
插入主要内容数据和图片的url,寻找评论
'''
if mark == 0: if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)" into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
...@@ -175,8 +182,10 @@ class Crawler_zhihu(): ...@@ -175,8 +182,10 @@ class Crawler_zhihu():
return return
#函数主入口
def search_page(self, answer_page_max, article_page_max, thought_page_max): def search_page(self, answer_page_max, article_page_max, thought_page_max):
'''
函数主入口
'''
offset = 0 offset = 0
for i in range(answer_page_max): for i in range(answer_page_max):
...@@ -193,18 +202,18 @@ class Crawler_zhihu(): ...@@ -193,18 +202,18 @@ class Crawler_zhihu():
self.search_thought_page(offset) self.search_thought_page(offset)
offset = offset + 20 offset = offset + 20
# self.answer_picture_doneload_and_cut()
self.answer_picture_doneload_and_cut() # self.answer_refresh_content()
self.answer_refresh_content() # self.article_picture_doneload_and_cut()
self.article_picture_doneload_and_cut() # self.article_refresh_content()
self.article_refresh_content() # self.answer_data_complex()
self.answer_data_complex()
self.conn.close() self.conn.close()
return return
#实现文章和回答的数据包请求
def search_answer_article_page(self, offset, mark, proxies_num=0): def search_answer_article_page(self, offset, mark, proxies_num=0):
'''
实现文章和回答的数据包请求
'''
offset = str(offset) offset = str(offset)
if mark == 0: if mark == 0:
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset) url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
...@@ -236,8 +245,10 @@ class Crawler_zhihu(): ...@@ -236,8 +245,10 @@ class Crawler_zhihu():
return return
#实现父评论的数据包请求
def search_root_comment(self, answerid, offset, mark, proxies_num=0): def search_root_comment(self, answerid, offset, mark, proxies_num=0):
'''
实现父评论的数据包请求
'''
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
if mark == 0: if mark == 0:
...@@ -270,8 +281,10 @@ class Crawler_zhihu(): ...@@ -270,8 +281,10 @@ class Crawler_zhihu():
return next return next
#插入父评论相关信息并关联子评论
def root_comment_data(self, data_dict, answerid, mark): def root_comment_data(self, data_dict, answerid, mark):
'''
插入父评论相关信息并关联子评论
'''
if mark == 0: if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)" into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
elif mark == 1: elif mark == 1:
...@@ -289,8 +302,10 @@ class Crawler_zhihu(): ...@@ -289,8 +302,10 @@ class Crawler_zhihu():
return return
#文章和回答的数据包请求
def search_child_comment(self, root_comment_id, offset, proxies_num=0): def search_child_comment(self, root_comment_id, offset, proxies_num=0):
'''
文章和回答的子评论数据包请求
'''
root_comment_id = str(root_comment_id) root_comment_id = str(root_comment_id)
offsets = offset offsets = offset
offset = str(offset) offset = str(offset)
...@@ -321,8 +336,10 @@ class Crawler_zhihu(): ...@@ -321,8 +336,10 @@ class Crawler_zhihu():
next = 1 next = 1
return next return next
#子评论数据插入
def child_comment_data(self, data_dict, root_comment_id): def child_comment_data(self, data_dict, root_comment_id):
'''
子评论数据插入
'''
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)" into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["name"]) values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"], data_dict["reply_to_author"]["member"]["name"], data_dict["id"], data_dict["created_time"], data_dict["author"]["member"]["name"])
...@@ -331,8 +348,10 @@ class Crawler_zhihu(): ...@@ -331,8 +348,10 @@ class Crawler_zhihu():
return return
#url请求中的头部伪装
def headers_handle(self, url): def headers_handle(self, url):
'''
url请求中的头部伪装
'''
res_cookies_dict = self.get_serach_page_cookies() res_cookies_dict = self.get_serach_page_cookies()
headers_search = { headers_search = {
...@@ -364,8 +383,10 @@ class Crawler_zhihu(): ...@@ -364,8 +383,10 @@ class Crawler_zhihu():
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5) headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
return headers_search, cookies_dict return headers_search, cookies_dict
#回答图片剪切和下载
def answer_picture_doneload_and_cut(self): def answer_picture_doneload_and_cut(self):
'''
回答图片剪切和下载
'''
sql = """select answer_id, url from zhihu_answer_picture_url""" sql = """select answer_id, url from zhihu_answer_picture_url"""
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
...@@ -407,7 +428,6 @@ class Crawler_zhihu(): ...@@ -407,7 +428,6 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
# for picture_deals in picture_deal: # for picture_deals in picture_deal:
# result = str(list[i]) # result = str(list[i])
# result = pattern.findall(result) # result = pattern.findall(result)
...@@ -427,8 +447,10 @@ class Crawler_zhihu(): ...@@ -427,8 +447,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg" # paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped) # cv2.imwrite(paths, cropped)
#图片上传并得到新url
def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE): def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
'''
图片上传并得到新url
'''
'''非站内图片处理''' '''非站内图片处理'''
try: try:
# with open(path, 'rb') as f: # with open(path, 'rb') as f:
...@@ -443,8 +465,10 @@ class Crawler_zhihu(): ...@@ -443,8 +465,10 @@ class Crawler_zhihu():
print('upload ..... error') print('upload ..... error')
return None return None
#替换url,更新回答内容
def answer_refresh_content(self): def answer_refresh_content(self):
'''
替换url,更新回答内容
'''
sql = """select answer_id, url, new_url from zhihu_answer_picture_url""" sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
...@@ -464,12 +488,14 @@ class Crawler_zhihu(): ...@@ -464,12 +488,14 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
#文章图片剪切和下载
def article_picture_doneload_and_cut(self): def article_picture_doneload_and_cut(self):
'''
文章图片剪切和下载
'''
sql = """select article_id, url from zhihu_article_picture_url""" sql = """select article_id, url from zhihu_article_picture_url"""
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
...@@ -511,7 +537,6 @@ class Crawler_zhihu(): ...@@ -511,7 +537,6 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
# for picture_deals in picture_deal: # for picture_deals in picture_deal:
# result = str(list[i]) # result = str(list[i])
# result = pattern.findall(result) # result = pattern.findall(result)
...@@ -531,8 +556,10 @@ class Crawler_zhihu(): ...@@ -531,8 +556,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg" # paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped) # cv2.imwrite(paths, cropped)
#替换url,更新文章内容
def article_refresh_content(self): def article_refresh_content(self):
'''
替换url,更新文章内容
'''
sql = """select article_id, url, new_url from zhihu_article_picture_url""" sql = """select article_id, url, new_url from zhihu_article_picture_url"""
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
...@@ -552,13 +579,14 @@ class Crawler_zhihu(): ...@@ -552,13 +579,14 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
#想法数据包请求
def search_thought_page(self, offset, proxies_num=0): def search_thought_page(self, offset, proxies_num=0):
'''
想法数据包请求
'''
offset = str(offset) offset = str(offset)
url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset) url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
[headers_search, cookies_dict] = self.headers_handle(url) [headers_search, cookies_dict] = self.headers_handle(url)
...@@ -584,9 +612,10 @@ class Crawler_zhihu(): ...@@ -584,9 +612,10 @@ class Crawler_zhihu():
return return
#想法内容插入
def parse_thought_sigle_page(self, data_dict): def parse_thought_sigle_page(self, data_dict):
'''
想法内容插入
'''
for one_dict in data_dict["content"]: for one_dict in data_dict["content"]:
if one_dict["type"] == "text": if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)" into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
...@@ -607,8 +636,10 @@ class Crawler_zhihu(): ...@@ -607,8 +636,10 @@ class Crawler_zhihu():
return return
#想法评论数据包请求
def search_thought_comment(self, answerid, offset, proxies_num=0): def search_thought_comment(self, answerid, offset, proxies_num=0):
'''
想法评论数据包请求
'''
offset = str(offset) offset = str(offset)
answerid = str(answerid) answerid = str(answerid)
url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset) url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
...@@ -638,8 +669,10 @@ class Crawler_zhihu(): ...@@ -638,8 +669,10 @@ class Crawler_zhihu():
return next return next
#想法评论数据插入
def thought_comment_data(self, data_dict, answerid): def thought_comment_data(self, data_dict, answerid):
'''
想法评论数据插入
'''
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)" into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"]) values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
self.cur.execute(into, values) self.cur.execute(into, values)
...@@ -647,8 +680,10 @@ class Crawler_zhihu(): ...@@ -647,8 +680,10 @@ class Crawler_zhihu():
return return
#想法图片剪切和下载
def thought_picture_doneload_and_cut(self): def thought_picture_doneload_and_cut(self):
'''
想法图片剪切和下载
'''
sql = """select thought_id, url from zhihu_thought_picture_url""" sql = """select thought_id, url from zhihu_thought_picture_url"""
self.cur.execute(sql) self.cur.execute(sql)
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
...@@ -690,7 +725,6 @@ class Crawler_zhihu(): ...@@ -690,7 +725,6 @@ class Crawler_zhihu():
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
# for picture_deals in picture_deal: # for picture_deals in picture_deal:
# result = str(list[i]) # result = str(list[i])
# result = pattern.findall(result) # result = pattern.findall(result)
...@@ -710,8 +744,10 @@ class Crawler_zhihu(): ...@@ -710,8 +744,10 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg" # paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped) # cv2.imwrite(paths, cropped)
#封装回答最终数据结果格式
def answer_data_complex(self): def answer_data_complex(self):
'''
封装回答最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_answer""" sql = """select id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -754,8 +790,10 @@ class Crawler_zhihu(): ...@@ -754,8 +790,10 @@ class Crawler_zhihu():
) )
return topics return topics
#封装文章最终数据结果格式
def article_data_complex(self): def article_data_complex(self):
'''
封装文章最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_article""" sql = """select id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -798,8 +836,10 @@ class Crawler_zhihu(): ...@@ -798,8 +836,10 @@ class Crawler_zhihu():
) )
return topics return topics
#封装回答最终数据结果格式
def thought_data_complex(self): def thought_data_complex(self):
'''
封装想法最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_thought""" sql = """select id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -835,6 +875,19 @@ class Crawler_zhihu(): ...@@ -835,6 +875,19 @@ class Crawler_zhihu():
) )
return topics return topics
def clean_data(self):
    """Run the post-crawl clean-up steps, then close the database connection.

    Calls, in order, ``answer_refresh_content``,
    ``article_picture_doneload_and_cut`` and ``article_refresh_content``.

    The connection is closed even when one of the steps raises; the
    exception still propagates to the caller.

    Returns:
        None.
    """
    # NOTE(review): answer_picture_doneload_and_cut() is not called here
    # although its article counterpart is -- confirm this is intended.
    try:
        self.answer_refresh_content()
        self.article_picture_doneload_and_cut()
        self.article_refresh_content()
    finally:
        # Release the connection on failure as well as on success.
        self.conn.close()
def complex_data(self):
    """Build the answer, article and thought result lists, then close the connection.

    The connection is closed even when one of the builders raises; the
    exception still propagates to the caller.

    Returns:
        tuple: ``(answers, articles, thoughts)`` -- the values returned by
        ``answer_data_complex``, ``article_data_complex`` and
        ``thought_data_complex`` respectively.
    """
    try:
        answers = self.answer_data_complex()
        articles = self.article_data_complex()
        thoughts = self.thought_data_complex()
    finally:
        # Release the connection on failure as well as on success.
        self.conn.close()
    # Hand the built lists back instead of discarding them.
    return answers, articles, thoughts
if __name__ == '__main__': if __name__ == '__main__':
#a = Crawler_zhihu() #a = Crawler_zhihu()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment