Commit b1359e35 authored by 向万

add notice

parent d5e464a2
@@ -24,6 +24,7 @@ from pymysql import escape_string
class Crawler_zhihu():
# Initialize the database connection and adjust the JS rules
def __init__(self):
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1',
@@ -122,6 +123,7 @@ class Crawler_zhihu():
# print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
# Refresh cookies
def get_serach_page_cookies(self):
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = {
@@ -141,6 +143,7 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict()
# Insert the main content data and picture URLs, then fetch the comments
def parse_sigle_page(self, data_dict, mark):
if mark == 0:
@@ -172,6 +175,7 @@ class Crawler_zhihu():
return
# Main entry point
def search_page(self, answer_page_max, article_page_max, thought_page_max):
offset = 0
@@ -192,12 +196,13 @@ class Crawler_zhihu():
self.answer_picture_doneload_and_cut()
self.answer_refresh_content()
self.article_picture_doneload_and_cut()
self.article_refresh_content()
self.answer_data_complex()
self.conn.close()
return
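A minimal usage sketch, inferred only from the signatures visible in this diff (the page maxima below are arbitrary, illustrative values):

crawler = Crawler_zhihu()
crawler.search_page(answer_page_max=10, article_page_max=5, thought_page_max=5)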
# Request the data packets for articles and answers
def search_answer_article_page(self, offset, mark, proxies_num=0):
offset = str(offset)
@@ -231,6 +236,7 @@ class Crawler_zhihu():
return
# Request the data packets for root (parent) comments
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
@@ -264,6 +270,7 @@ class Crawler_zhihu():
return next
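The return value acts as a has-more flag, so the caller presumably pages through comments in a loop; a hedged sketch of that loop (the helper name and the offset step of 20 are assumptions):

def crawl_root_comments(crawler, answerid, mark, page_size=20):
    # Keep requesting root-comment pages until search_root_comment reports
    # that no further page exists. page_size is an assumed value.
    offset = 0
    while crawler.search_root_comment(answerid, offset, mark):
        offset += page_size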
# Insert root-comment data and link the child comments
def root_comment_data(self, data_dict, answerid, mark):
if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
@@ -282,6 +289,7 @@ class Crawler_zhihu():
return
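The values/execute half of this method falls outside the hunk; a hedged sketch of what it presumably does (the helper name is hypothetical, and the dict field names are assumptions modelled on thought_comment_data further down in this file):

def insert_root_comment(cur, conn, into, data_dict, answerid):
    # Parameterized insert matching the %s placeholders in the statement above.
    values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"],
              answerid, data_dict["child_comment_count"], data_dict["featured"],
              data_dict["created_time"], data_dict["author"]["member"]["id"])
    cur.execute(into, values)
    conn.commit()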
# Request the data packets for child comments
def search_child_comment(self, root_comment_id, offset, proxies_num=0):
root_comment_id = str(root_comment_id)
offsets = offset
@@ -313,6 +321,7 @@ class Crawler_zhihu():
next = 1
return next
# Insert child-comment data
def child_comment_data(self, data_dict, root_comment_id):
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
@@ -322,6 +331,7 @@ class Crawler_zhihu():
return
# Disguise the request headers for each URL request
def headers_handle(self, url):
res_cookies_dict = self.get_serach_page_cookies()
headers_search = {
@@ -354,6 +364,7 @@ class Crawler_zhihu():
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
return headers_search, cookies_dict
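Only the final header assignment is visible here; a hedged sketch of how fmd5 is typically derived for this header (the helper name and the exact layout of the signed string are assumptions, only the "1.0_" prefix and the JS call to "b" appear in the diff):

import hashlib

def build_x_zse_86(path, d_c0, exec_js):
    # Assumed signature string: API version tag + request path + d_c0 cookie,
    # joined with '+', MD5-hashed, then encrypted by the compiled JS 'b' function.
    raw = "+".join(["3_2.0", path, d_c0])
    fmd5 = hashlib.md5(raw.encode("utf-8")).hexdigest()
    return "1.0_" + exec_js.call("b", fmd5)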
# Download and crop the answer pictures
def answer_picture_doneload_and_cut(self):
sql = """select answer_id, url from zhihu_answer_picture_url"""
self.cur.execute(sql)
@@ -416,6 +427,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
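Only the commented-out debug writes from the crop step survive in this hunk; a hedged sketch of a download-and-crop routine of this kind (the function name and the 60-pixel bottom strip are assumptions):

import cv2
import numpy as np
import requests

def download_and_cut(url, cut_bottom_px=60):
    # Fetch one picture and trim an assumed watermark strip from the bottom.
    res = requests.get(url, timeout=10)
    img = cv2.imdecode(np.frombuffer(res.content, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        return None
    cropped = img[:max(img.shape[0] - cut_bottom_px, 1), :]
    return cropped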
# Upload each picture and obtain its new URL
def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
'''Handle pictures hosted outside the site'''
try:
@@ -431,6 +443,7 @@ class Crawler_zhihu():
print('upload ..... error')
return None
# Replace the URLs and refresh the answer content
def answer_refresh_content(self):
sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
self.cur.execute(sql)
@@ -455,6 +468,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
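The replacement step between the SELECT and the final commit is elided; a hedged sketch of it (the helper name is hypothetical; escape_string comes from the pymysql import visible at the top of this diff):

from pymysql import escape_string

def refresh_answer_content(cur, conn, rows):
    # rows is assumed to hold the (answer_id, url, new_url) tuples selected above;
    # each original picture URL is swapped for the uploaded one and written back.
    for answer_id, old_url, new_url in rows:
        cur.execute("select content from zhihu_answer where id = %s", (answer_id,))
        content = cur.fetchone()[0]
        cur.execute("update zhihu_answer set content = '{}' where id = '{}'".format(
            escape_string(content.replace(old_url, new_url)), answer_id))
    conn.commit()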
# Download and crop the article pictures
def article_picture_doneload_and_cut(self):
sql = """select article_id, url from zhihu_article_picture_url"""
self.cur.execute(sql)
@@ -517,6 +531,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
# Replace the URLs and refresh the article content
def article_refresh_content(self):
sql = """select article_id, url, new_url from zhihu_article_picture_url"""
self.cur.execute(sql)
@@ -541,6 +556,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
# Request the data packets for thoughts
def search_thought_page(self, offset, proxies_num=0):
offset = str(offset)
@@ -568,6 +584,7 @@ class Crawler_zhihu():
return
# Insert the thought content
def parse_thought_sigle_page(self, data_dict):
for one_dict in data_dict["content"]:
@@ -590,6 +607,7 @@ class Crawler_zhihu():
return
# Request the data packets for thought comments
def search_thought_comment(self, answerid, offset, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
@@ -620,6 +638,7 @@ class Crawler_zhihu():
return next
# Insert thought-comment data
def thought_comment_data(self, data_dict, answerid):
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
@@ -628,6 +647,7 @@ class Crawler_zhihu():
return
# Download and crop the thought pictures
def thought_picture_doneload_and_cut(self):
sql = """select thought_id, url from zhihu_thought_picture_url"""
self.cur.execute(sql)
@@ -690,6 +710,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
# Assemble the final answer data structure
def answer_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
@@ -705,6 +726,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
@@ -732,6 +754,7 @@ class Crawler_zhihu():
)
return topics
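The nesting of replies under root comments is mostly elided; a hedged sketch of the likely shape (the function name, key names and column order are assumptions based on the SELECT statements above):

def build_topic(answer_row, root_comments, child_lookup):
    # answer_row: (id, content, created_time, comment_count) from zhihu_answer.
    # root_comments: rows whose first column is the root comment id and whose
    # second column is child_comment_count, matching the check above.
    # child_lookup: {root_comment_id: [child comment rows]} from zhihu_child_comment.
    answer_id, content, created_time, comment_count = answer_row
    comments = []
    for row in root_comments:
        root_id, child_count = row[0], row[1]
        reply = list(child_lookup.get(root_id, [])) if child_count else []
        comments.append({"id": root_id, "reply": reply})
    return {"id": answer_id, "content": content, "create_time": created_time,
            "comment_count": comment_count, "comment": comments}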
# Assemble the final article data structure
def article_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
@@ -747,6 +770,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
@@ -774,6 +798,7 @@ class Crawler_zhihu():
)
return topics
# Assemble the final thought data structure
def thought_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
......