Commit 5761803e authored by 向万

code

parent aff367ce
...@@ -33,17 +33,19 @@ class Crawler_zhihu(): ...@@ -33,17 +33,19 @@ class Crawler_zhihu():
self.cur.execute("drop table if exists zhihu_answer") self.cur.execute("drop table if exists zhihu_answer")
sql = """create table zhihu_answer(title char(40), sql = """create table zhihu_answer(title char(40),
content text(59999), content text(59999),
id int, answer_id int,
created_time int, created_time int,
comment_count int)""" comment_count int,
new_content text(59999))"""
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
self.cur.execute("drop table if exists zhihu_article") self.cur.execute("drop table if exists zhihu_article")
sql = """create table zhihu_article(title char(40), sql = """create table zhihu_article(title char(40),
content text(59999), content text(59999),
id int, article_id int,
created_time int, created_time int,
comment_count int)""" comment_count int,
new_content text(59999))"""
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
self.cur.execute("drop table if exists zhihu_answer_root_comment") self.cur.execute("drop table if exists zhihu_answer_root_comment")
...@@ -91,7 +93,7 @@ class Crawler_zhihu(): ...@@ -91,7 +93,7 @@ class Crawler_zhihu():
self.conn.commit() self.conn.commit()
self.cur.execute("drop table if exists zhihu_thought") self.cur.execute("drop table if exists zhihu_thought")
sql = """create table zhihu_thought(id char(50), sql = """create table zhihu_thought(thought_id char(50),
content text(59999), content text(59999),
created_time int, created_time int,
comment_count int)""" comment_count int)"""
...@@ -147,11 +149,11 @@ class Crawler_zhihu(): ...@@ -147,11 +149,11 @@ class Crawler_zhihu():
def parse_sigle_page(self, data_dict, mark): def parse_sigle_page(self, data_dict, mark):
if mark == 0: if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)" into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"]) values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
elif mark == 1: elif mark == 1:
into = "insert into zhihu_article(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)" into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"]) values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
self.cur.execute(into, values) self.cur.execute(into, values)
self.conn.commit() self.conn.commit()
offset = 0 offset = 0
...@@ -198,6 +200,7 @@ class Crawler_zhihu(): ...@@ -198,6 +200,7 @@ class Crawler_zhihu():
self.answer_refresh_content() self.answer_refresh_content()
self.article_picture_doneload_and_cut() self.article_picture_doneload_and_cut()
self.article_refresh_content() self.article_refresh_content()
self.thought_picture_doneload_and_cut()
self.answer_data_complex() self.answer_data_complex()
self.conn.close() self.conn.close()
return return
...@@ -220,7 +223,7 @@ class Crawler_zhihu(): ...@@ -220,7 +223,7 @@ class Crawler_zhihu():
print("article_error") print("article_error")
page_dict = get_page.json() page_dict = get_page.json()
if page_dict.get("data"): if page_dict.get("data"):
for one_line in page_dict['data']: for one_line in page_dict['data'][:1]:
try: try:
if one_line["content"] != None: if one_line["content"] != None:
self.parse_sigle_page(one_line, mark) self.parse_sigle_page(one_line, mark)
...@@ -450,10 +453,12 @@ class Crawler_zhihu(): ...@@ -450,10 +453,12 @@ class Crawler_zhihu():
tuple = self.cur.fetchall() tuple = self.cur.fetchall()
self.conn.commit() self.conn.commit()
for i in range(len(tuple)): for i in range(len(tuple)):
if tuple[i][2] == None:
continue
find_id = tuple[i][0] find_id = tuple[i][0]
temp = str(tuple[i][1]) temp = str(tuple[i][1])
temp1 = temp.replace("?", "#") temp1 = temp.replace("?", "#")
sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id) sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
self.cur.execute(sql) self.cur.execute(sql)
tuples = self.cur.fetchall() tuples = self.cur.fetchall()
# tuples = str(tuples) # tuples = str(tuples)
...@@ -464,7 +469,7 @@ class Crawler_zhihu(): ...@@ -464,7 +469,7 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
...@@ -541,7 +546,7 @@ class Crawler_zhihu(): ...@@ -541,7 +546,7 @@ class Crawler_zhihu():
find_id = tuple[i][0] find_id = tuple[i][0]
temp = str(tuple[i][1]) temp = str(tuple[i][1])
temp1 = temp.replace("?", "#") temp1 = temp.replace("?", "#")
sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id) sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
self.cur.execute(sql) self.cur.execute(sql)
tuples = self.cur.fetchall() tuples = self.cur.fetchall()
# tuples = str(tuples) # tuples = str(tuples)
...@@ -552,7 +557,7 @@ class Crawler_zhihu(): ...@@ -552,7 +557,7 @@ class Crawler_zhihu():
temp_tuples) temp_tuples)
new_content = r'%s' % (new_content) new_content = r'%s' % (new_content)
new_content = escape_string(new_content) new_content = escape_string(new_content)
sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0]) sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql) self.cur.execute(sql)
self.conn.commit() self.conn.commit()
...@@ -571,7 +576,7 @@ class Crawler_zhihu(): ...@@ -571,7 +576,7 @@ class Crawler_zhihu():
print("article_error") print("article_error")
page_dict = get_page.json() page_dict = get_page.json()
if page_dict.get("data"): if page_dict.get("data"):
for one_line in page_dict['data']: for one_line in page_dict['data'][:1]:
try: try:
self.parse_thought_sigle_page(one_line) self.parse_thought_sigle_page(one_line)
print("finshed_article" + offset) print("finshed_article" + offset)
...@@ -589,7 +594,7 @@ class Crawler_zhihu(): ...@@ -589,7 +594,7 @@ class Crawler_zhihu():
for one_dict in data_dict["content"]: for one_dict in data_dict["content"]:
if one_dict["type"] == "text": if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)" into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"]) values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
self.cur.execute(into, values) self.cur.execute(into, values)
self.conn.commit() self.conn.commit()
...@@ -712,7 +717,7 @@ class Crawler_zhihu(): ...@@ -712,7 +717,7 @@ class Crawler_zhihu():
#封装回答最终数据结果格式 #封装回答最终数据结果格式
def answer_data_complex(self): def answer_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_answer""" sql = """select anwser_id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -756,7 +761,7 @@ class Crawler_zhihu(): ...@@ -756,7 +761,7 @@ class Crawler_zhihu():
#封装文章最终数据结果格式 #封装文章最终数据结果格式
def article_data_complex(self): def article_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_article""" sql = """select article_id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -800,7 +805,7 @@ class Crawler_zhihu(): ...@@ -800,7 +805,7 @@ class Crawler_zhihu():
#封装回答最终数据结果格式 #封装回答最终数据结果格式
def thought_data_complex(self): def thought_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_thought""" sql = """select thought_id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql) self.cur.execute(sql)
topics = [] topics = []
...@@ -845,5 +850,5 @@ if __name__ == '__main__': ...@@ -845,5 +850,5 @@ if __name__ == '__main__':
print(datetime.now()) print(datetime.now())
zhihu = Crawler_zhihu() zhihu = Crawler_zhihu()
zhihu.search_page(1, 0, 0) zhihu.search_page(1, 1, 1)
print(datetime.now()) print(datetime.now())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment