Commit b1359e35 authored by 向万

add notice

parent d5e464a2
......@@ -24,6 +24,7 @@ from pymysql import escape_string
class Crawler_zhihu():
#初始化数据库,调整js规则
def __init__(self):
self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
passwd='Gengmei1',
......@@ -122,6 +123,7 @@ class Crawler_zhihu():
# print(js)
self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')
#cookies更新
def get_serach_page_cookies(self):
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = {
......@@ -141,6 +143,7 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers)
return requests_res.cookies.get_dict()
#插入主要内容数据和图片的url,寻找评论
def parse_sigle_page(self, data_dict, mark):
if mark == 0:
......@@ -172,6 +175,7 @@ class Crawler_zhihu():
return
#函数主入口
def search_page(self, answer_page_max, article_page_max, thought_page_max):
offset = 0
......@@ -192,12 +196,13 @@ class Crawler_zhihu():
self.answer_picture_doneload_and_cut()
self.answer_refresh_content()
#self.article_picture_doneload_and_cut()
#self.article_refresh_content()
self.article_picture_doneload_and_cut()
self.article_refresh_content()
self.answer_data_complex()
self.conn.close()
return
#实现文章和回答的数据包请求
def search_answer_article_page(self, offset, mark, proxies_num=0):
offset = str(offset)
......@@ -231,6 +236,7 @@ class Crawler_zhihu():
return
#实现父评论的数据包请求
def search_root_comment(self, answerid, offset, mark, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
......@@ -264,6 +270,7 @@ class Crawler_zhihu():
return next
#插入父评论相关信息并关联子评论
def root_comment_data(self, data_dict, answerid, mark):
if mark == 0:
into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
......@@ -282,6 +289,7 @@ class Crawler_zhihu():
return
#实现子评论的数据包请求
def search_child_comment(self, root_comment_id, offset, proxies_num=0):
root_comment_id = str(root_comment_id)
offsets = offset
......@@ -313,6 +321,7 @@ class Crawler_zhihu():
next = 1
return next
#子评论数据插入
def child_comment_data(self, data_dict, root_comment_id):
into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
......@@ -322,6 +331,7 @@ class Crawler_zhihu():
return
#url请求中的头部伪装
def headers_handle(self, url):
res_cookies_dict = self.get_serach_page_cookies()
headers_search = {
......@@ -354,6 +364,7 @@ class Crawler_zhihu():
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
return headers_search, cookies_dict
#回答图片剪切和下载
def answer_picture_doneload_and_cut(self):
sql = """select answer_id, url from zhihu_answer_picture_url"""
self.cur.execute(sql)
......@@ -416,6 +427,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#图片上传并得到新url
def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
'''非站内图片处理'''
try:
......@@ -431,6 +443,7 @@ class Crawler_zhihu():
print('upload ..... error')
return None
#替换url,更新回答内容
def answer_refresh_content(self):
sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
self.cur.execute(sql)
......@@ -455,6 +468,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
#文章图片剪切和下载
def article_picture_doneload_and_cut(self):
sql = """select article_id, url from zhihu_article_picture_url"""
self.cur.execute(sql)
......@@ -517,6 +531,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#替换url,更新文章内容
def article_refresh_content(self):
sql = """select article_id, url, new_url from zhihu_article_picture_url"""
self.cur.execute(sql)
......@@ -541,6 +556,7 @@ class Crawler_zhihu():
self.cur.execute(sql)
self.conn.commit()
#想法数据包请求
def search_thought_page(self, offset, proxies_num=0):
offset = str(offset)
......@@ -568,6 +584,7 @@ class Crawler_zhihu():
return
#想法内容插入
def parse_thought_sigle_page(self, data_dict):
for one_dict in data_dict["content"]:
......@@ -590,6 +607,7 @@ class Crawler_zhihu():
return
#想法评论数据包请求
def search_thought_comment(self, answerid, offset, proxies_num=0):
offset = str(offset)
answerid = str(answerid)
......@@ -620,6 +638,7 @@ class Crawler_zhihu():
return next
#想法评论数据插入
def thought_comment_data(self, data_dict, answerid):
into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"], answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
......@@ -628,6 +647,7 @@ class Crawler_zhihu():
return
#想法图片剪切和下载
def thought_picture_doneload_and_cut(self):
sql = """select thought_id, url from zhihu_thought_picture_url"""
self.cur.execute(sql)
......@@ -690,6 +710,7 @@ class Crawler_zhihu():
# paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
# cv2.imwrite(paths, cropped)
#封装回答最终数据结果格式
def answer_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
......@@ -705,6 +726,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
......@@ -732,6 +754,7 @@ class Crawler_zhihu():
)
return topics
#封装文章最终数据结果格式
def article_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
......@@ -747,6 +770,7 @@ class Crawler_zhihu():
root_comment = self.cur.fetchall()
comment = []
for j in range(len(root_comment)):
reply = []
if root_comment[j][1] != 0:
sql = """select child_comment_id, content, created_time, author_id from zhihu_child_comment as a where a.root_comment_id = '{}' """.format(root_comment[j][0])
self.cur.execute(sql)
......@@ -774,6 +798,7 @@ class Crawler_zhihu():
)
return topics
#封装想法最终数据结果格式
def thought_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment