Commit 778c9506 authored by haowang

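modify (rename the `id` columns to `answer_id` / `article_id` / `thought_id`, write `new_content` on insert, skip rows whose third column is None, limit page loops to the first item, and add PyExecJS to the requirements)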

parents 8a17245b 5761803e
...@@ -154,11 +154,11 @@ class Crawler_zhihu(): ...@@ -154,11 +154,11 @@ class Crawler_zhihu():
''' '''
if mark == 0: if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)" into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"]) values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
elif mark == 1: elif mark == 1:
into = "insert into zhihu_article(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)" into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"]) values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
self.cur.execute(into, values) self.cur.execute(into, values)
self.conn.commit() self.conn.commit()
offset = 0 offset = 0
...@@ -229,7 +229,7 @@ class Crawler_zhihu(): ...@@ -229,7 +229,7 @@ class Crawler_zhihu():
print("article_error") print("article_error")
page_dict = get_page.json() page_dict = get_page.json()
if page_dict.get("data"): if page_dict.get("data"):
for one_line in page_dict['data']: for one_line in page_dict['data'][:1]:
try: try:
if one_line["content"] != None: if one_line["content"] != None:
self.parse_sigle_page(one_line, mark) self.parse_sigle_page(one_line, mark)
...@@ -474,10 +474,12 @@ class Crawler_zhihu(): ...@@ -474,10 +474,12 @@ class Crawler_zhihu():
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
for i in range(len(tuple)): for i in range(len(tuple)):
if tuple[i][2] == None:
continue
find_id = tuple[i][0] find_id = tuple[i][0]
temp = str(tuple[i][1]) temp = str(tuple[i][1])
temp1 = temp.replace("?", "#") temp1 = temp.replace("?", "#")
sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id) sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
self.cur.execute(sql) self.cur.execute(sql)
tuples = self.cur.fetchall() tuples = self.cur.fetchall()
# tuples = str(tuples) # tuples = str(tuples)
...@@ -488,7 +490,7 @@ class Crawler_zhihu(): ...@@ -488,7 +490,7 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
...@@ -568,7 +570,7 @@ class Crawler_zhihu(): ...@@ -568,7 +570,7 @@ class Crawler_zhihu():
find_id = tuple[i][0] find_id = tuple[i][0]
temp = str(tuple[i][1]) temp = str(tuple[i][1])
temp1 = temp.replace("?", "#") temp1 = temp.replace("?", "#")
sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id) sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
self.cur.execute(sql) self.cur.execute(sql)
tuples = self.cur.fetchall() tuples = self.cur.fetchall()
# tuples = str(tuples) # tuples = str(tuples)
...@@ -579,7 +581,7 @@ class Crawler_zhihu(): ...@@ -579,7 +581,7 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
...@@ -599,7 +601,7 @@ class Crawler_zhihu(): ...@@ -599,7 +601,7 @@ class Crawler_zhihu():
print("article_error") print("article_error")
page_dict = get_page.json() page_dict = get_page.json()
if page_dict.get("data"): if page_dict.get("data"):
for one_line in page_dict['data']: for one_line in page_dict['data'][:1]:
try: try:
self.parse_thought_sigle_page(one_line) self.parse_thought_sigle_page(one_line)
print("finshed_article" + offset) print("finshed_article" + offset)
...@@ -618,7 +620,7 @@ class Crawler_zhihu(): ...@@ -618,7 +620,7 @@ class Crawler_zhihu():
''' '''
for one_dict in data_dict["content"]: for one_dict in data_dict["content"]:
if one_dict["type"] == "text": if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)" into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"]) values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
self.cur.execute(into, values) self.cur.execute(into, values)
self.conn.commit() self.conn.commit()
...@@ -748,7 +750,7 @@ class Crawler_zhihu(): ...@@ -748,7 +750,7 @@ class Crawler_zhihu():
''' '''
封装回答最终数据结果格式 封装回答最终数据结果格式
''' '''
sql = """select id, content, created_time, comment_count from zhihu_answer""" sql = """select anwser_id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -794,7 +796,7 @@ class Crawler_zhihu(): ...@@ -794,7 +796,7 @@ class Crawler_zhihu():
''' '''
封装文章最终数据结果格式 封装文章最终数据结果格式
''' '''
sql = """select id, content, created_time, comment_count from zhihu_article""" sql = """select article_id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -840,7 +842,7 @@ class Crawler_zhihu(): ...@@ -840,7 +842,7 @@ class Crawler_zhihu():
''' '''
封装回答最终数据结果格式 封装回答最终数据结果格式
''' '''
sql = """select id, content, created_time, comment_count from zhihu_thought""" sql = """select thought_id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -898,5 +900,5 @@ if __name__ == '__main__': ...@@ -898,5 +900,5 @@ if __name__ == '__main__':
print(datetime.now()) print(datetime.now())
zhihu = Crawler_zhihu() zhihu = Crawler_zhihu()
zhihu.search_page(1, 0, 0) zhihu.search_page(1, 1, 1)
print(datetime.now()) print(datetime.now())
...@@ -14,4 +14,5 @@ pymysql==0.10.0 ...@@ -14,4 +14,5 @@ pymysql==0.10.0
qiniu==7.1.4 qiniu==7.1.4
redis==3.5.3 redis==3.5.3
pymysql==0.10.0 pymysql==0.10.0
opencv-python==4.4.0.46 opencv-python==4.4.0.46
\ No newline at end of file PyExecJS==1.5.1
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment