Commit 5761803e authored by 向万

code

parent aff367ce
......@@ -33,17 +33,19 @@ class Crawler_zhihu():
self.cur.execute("drop table if exists zhihu_answer")
sql = """create table zhihu_answer(title char(40),
content text(59999),
id int,
answer_id int,
created_time int,
comment_count int)"""
comment_count int,
new_content text(59999))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_article")
sql = """create table zhihu_article(title char(40),
content text(59999),
id int,
article_id int,
created_time int,
comment_count int)"""
comment_count int,
new_content text(59999))"""
self.cur.execute(sql)
self.conn.commit()
self.cur.execute("drop table if exists zhihu_answer_root_comment")
......@@ -91,7 +93,7 @@ class Crawler_zhihu():
self.conn.commit()
self.cur.execute("drop table if exists zhihu_thought")
sql = """create table zhihu_thought(id char(50),
sql = """create table zhihu_thought(thought_id char(50),
content text(59999),
created_time int,
comment_count int)"""
......@@ -147,11 +149,11 @@ class Crawler_zhihu():
def parse_sigle_page(self, data_dict, mark):
if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"])
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
elif mark == 1:
into = "insert into zhihu_article(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
self.cur.execute(into, values)
self.conn.commit()
offset = 0
......@@ -198,6 +200,7 @@ class Crawler_zhihu():
self.answer_refresh_content()
self.article_picture_doneload_and_cut()
self.article_refresh_content()
self.thought_picture_doneload_and_cut()
self.answer_data_complex()
self.conn.close()
return
......@@ -220,7 +223,7 @@ class Crawler_zhihu():
print("article_error")
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
for one_line in page_dict['data'][:1]:
try:
if one_line["content"] != None:
self.parse_sigle_page(one_line, mark)
......@@ -450,10 +453,12 @@ class Crawler_zhihu():
tuple = self.cur.fetchall()
self.conn.commit()
for i in range(len(tuple)):
if tuple[i][2] == None:
continue
find_id = tuple[i][0]
temp = str(tuple[i][1])
temp1 = temp.replace("?", "#")
sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id)
sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
self.cur.execute(sql)
tuples = self.cur.fetchall()
# tuples = str(tuples)
......@@ -464,7 +469,7 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
......@@ -541,7 +546,7 @@ class Crawler_zhihu():
find_id = tuple[i][0]
temp = str(tuple[i][1])
temp1 = temp.replace("?", "#")
sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id)
sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
self.cur.execute(sql)
tuples = self.cur.fetchall()
# tuples = str(tuples)
......@@ -552,7 +557,7 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
......@@ -571,7 +576,7 @@ class Crawler_zhihu():
print("article_error")
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
for one_line in page_dict['data'][:1]:
try:
self.parse_thought_sigle_page(one_line)
print("finshed_article" + offset)
......@@ -589,7 +594,7 @@ class Crawler_zhihu():
for one_dict in data_dict["content"]:
if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
self.cur.execute(into, values)
self.conn.commit()
......@@ -712,7 +717,7 @@ class Crawler_zhihu():
# Assemble the final result format for answer data
def answer_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_answer"""
sql = """select answer_id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
topics = []
......@@ -756,7 +761,7 @@ class Crawler_zhihu():
# Assemble the final result format for article data
def article_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_article"""
sql = """select article_id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
topics = []
......@@ -800,7 +805,7 @@ class Crawler_zhihu():
# Assemble the final result format for thought data
def thought_data_complex(self):
sql = """select id, content, created_time, comment_count from zhihu_thought"""
sql = """select thought_id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
topics = []
......@@ -845,5 +850,5 @@ if __name__ == '__main__':
print(datetime.now())
zhihu = Crawler_zhihu()
zhihu.search_page(1, 0, 0)
zhihu.search_page(1, 1, 1)
print(datetime.now())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment