Commit 778c9506 authored by haowang's avatar haowang

modify

parents 8a17245b 5761803e
......@@ -154,11 +154,11 @@ class Crawler_zhihu():
'''
if mark == 0:
into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"])
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"], data_dict["comment_count"], data_dict["content"])
elif mark == 1:
into = "insert into zhihu_article(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"], data_dict["content"])
self.cur.execute(into, values)
self.conn.commit()
offset = 0
......@@ -229,7 +229,7 @@ class Crawler_zhihu():
print("article_error")
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
for one_line in page_dict['data'][:1]:
try:
if one_line["content"] != None:
self.parse_sigle_page(one_line, mark)
......@@ -474,10 +474,12 @@ class Crawler_zhihu():
tuple = self.cur.fetchall()
self.conn.commit()
for i in range(len(tuple)):
if tuple[i][2] == None:
continue
find_id = tuple[i][0]
temp = str(tuple[i][1])
temp1 = temp.replace("?", "#")
sql = """select content from zhihu_answer where zhihu_answer.id = '{}' """.format(find_id)
sql = """select new_content from zhihu_answer where zhihu_answer.answer_id = '{}' """.format(find_id)
self.cur.execute(sql)
tuples = self.cur.fetchall()
# tuples = str(tuples)
......@@ -488,7 +490,7 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_answer set new_content = '{}' WHERE answer_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
......@@ -568,7 +570,7 @@ class Crawler_zhihu():
find_id = tuple[i][0]
temp = str(tuple[i][1])
temp1 = temp.replace("?", "#")
sql = """select content from zhihu_article where zhihu_article.id = '{}' """.format(find_id)
sql = """select new_content from zhihu_article where zhihu_article.article_id = '{}' """.format(find_id)
self.cur.execute(sql)
tuples = self.cur.fetchall()
# tuples = str(tuples)
......@@ -579,7 +581,7 @@ class Crawler_zhihu():
temp_tuples)
new_content = r'%s' % (new_content)
new_content = escape_string(new_content)
sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
sql = """update zhihu_article set new_content = '{}' WHERE article_id = '{}' """.format(new_content, tuple[i][0])
self.cur.execute(sql)
self.conn.commit()
......@@ -599,7 +601,7 @@ class Crawler_zhihu():
print("article_error")
page_dict = get_page.json()
if page_dict.get("data"):
for one_line in page_dict['data']:
for one_line in page_dict['data'][:1]:
try:
self.parse_thought_sigle_page(one_line)
print("finished_article" + str(offset))
......@@ -618,7 +620,7 @@ class Crawler_zhihu():
'''
for one_dict in data_dict["content"]:
if one_dict["type"] == "text":
into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
into = "insert into zhihu_thought(content, thought_id, created_time, comment_count) value(%s, %s, %s, %s)"
values = (one_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"])
self.cur.execute(into, values)
self.conn.commit()
......@@ -748,7 +750,7 @@ class Crawler_zhihu():
'''
封装回答最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_answer"""
sql = """select answer_id, content, created_time, comment_count from zhihu_answer"""
self.cur.execute(sql)
topics = []
......@@ -794,7 +796,7 @@ class Crawler_zhihu():
'''
封装文章最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_article"""
sql = """select article_id, content, created_time, comment_count from zhihu_article"""
self.cur.execute(sql)
topics = []
......@@ -840,7 +842,7 @@ class Crawler_zhihu():
'''
封装回答最终数据结果格式
'''
sql = """select id, content, created_time, comment_count from zhihu_thought"""
sql = """select thought_id, content, created_time, comment_count from zhihu_thought"""
self.cur.execute(sql)
topics = []
......@@ -898,5 +900,5 @@ if __name__ == '__main__':
print(datetime.now())
zhihu = Crawler_zhihu()
zhihu.search_page(1, 0, 0)
zhihu.search_page(1, 1, 1)
print(datetime.now())
......@@ -14,4 +14,5 @@ pymysql==0.10.0
qiniu==7.1.4
redis==3.5.3
pymysql==0.10.0
opencv-python==4.4.0.46
\ No newline at end of file
opencv-python==4.4.0.46
PyExecJS==1.5.1
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment