Commit 5512a01d authored by 段英荣

获取回答列表

parent 66e74343
......@@ -264,7 +264,7 @@ class ZhihuAccount(object):
return content,cur_image_index
# 知乎搜索词搜索
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,cur_image_index):
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,zhihu_spider_question_fd,cur_image_index,question_answer_dict):
for begin_index in range(0,200,10):
query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
......@@ -285,9 +285,10 @@ class ZhihuAccount(object):
data_type = data_item["object"]["type"]
content = data_item["object"]["content"]
# content = copy.deepcopy(tmp_content)
platform_id = data_item["object"]["id"]
platform_id = str(data_item["object"]["id"])
user_id = random.choice(majia_user_list)
question_id = ""
have_saved_this_answer = False
img_url_list = re.findall('src="(.*?)"', content)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index)
......@@ -298,23 +299,47 @@ class ZhihuAccount(object):
if data_type == "article":
title = data_item["object"]["title"]
title = title.replace("<em>","")
title = title.replace("</em>", "")
elif data_type == "answer":
title = data_item["object"]["question"]["name"]
question_id = data_item["object"]["question"]["id"]
title = title.replace("<em>","")
title = title.replace("</em>", "")
question_id = str(data_item["object"]["question"]["id"])
if question_id not in question_answer_dict:
question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id)
self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index)
question_item_dict = {
"user_id": user_id,
"platform_id": question_id,
"title": title,
"content": content,
"type": data_type,
"question_id": ""
}
zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
elif platform_id not in question_answer_dict[question_id]:
question_answer_dict[question_id].add(platform_id)
else:
have_saved_this_answer = True
else:
print("type is:%s" % data_type)
title = ""
item_dict = {
"user_id": user_id,
"platform_id": platform_id,
"title": title,
"content": content,
"type": data_type,
"question_id": question_id
}
if not have_saved_this_answer:
item_dict = {
"user_id": user_id,
"platform_id": platform_id,
"title": title,
"content": content,
"type": data_type,
"question_id": question_id
}
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
except:
print(traceback.format_exc())
print(str(data_item))
......@@ -323,9 +348,66 @@ class ZhihuAccount(object):
# 知乎问题对应的回答列表
def zhihu_answers_list_by_question(self,question_id):
answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=0&limit=10&sort_by=default&platform=desktop"
def zhihu_answers_list_by_question(self, question_id, question_answer_dict, zhihu_spider_fd, cur_image_index):
    """Fetch the answers of one Zhihu question and write the unseen ones to a file.

    Requests the question's answer list in pages of 10 (offsets 0..190) and
    writes one JSON line per answer that is not already recorded in
    ``question_answer_dict``.

    Args:
        question_id: Id of the question whose answers are requested; it is
            converted with ``str()`` when the URL is built.
        question_answer_dict: Mapping of question id (str) to the set of
            answer ids (str) already written. Updated in place so callers
            sharing the dict do not write the same answer twice.
        zhihu_spider_fd: Writable text file object receiving the JSON lines.
        cur_image_index: Image counter passed to ``self._dispose_content_url``
            and replaced by the value that helper returns.

    Returns:
        None.
    """
    # NOTE(review): the advanced cur_image_index is not returned, so the
    # caller's counter is unchanged after this call -- confirm whether the
    # helper relies on it to keep image names unique.
    for begin_index in range(0, 200, 10):
        answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + \
            "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&" \
            "sort_by=default&platform=desktop" + "&offset=" + str(begin_index) + "&limit=10"
        res = self.session.get(answers_list_by_question_url, allow_redirects=False)
        print(10*"*")
        # The response body is decompressed explicitly with brotli before parsing.
        raw_content = brotli.decompress(res.content)
        print(type(raw_content))
        raw_content_dict = json.loads(str(raw_content, encoding="utf-8"))
        if "data" not in raw_content_dict:
            continue
        for data_item in raw_content_dict["data"]:
            try:
                user_id = random.choice(majia_user_list)
                data_type = data_item["type"]
                platform_id = str(data_item["id"])
                data_content = data_item["content"]
                # Kept in its own variable: reusing the ``question_id``
                # parameter here would corrupt the URL of every later page.
                item_question_id = ""
                question_title = ""
                have_saved_this_answer = False

                # Rewrite image URLs found in both attribute forms.
                img_url_list = re.findall('src="(.*?)"', data_content)
                data_content, cur_image_index = self._dispose_content_url(
                    content=data_content, img_url_list=img_url_list,
                    cur_image_index=cur_image_index)
                img_url_list = re.findall('data-original="(.*?)"', data_content)
                data_content, cur_image_index = self._dispose_content_url(
                    content=data_content, img_url_list=img_url_list,
                    cur_image_index=cur_image_index)

                if data_type == "answer" and "question" in data_item:
                    item_question_id = str(data_item["question"]["id"])
                    question_title = data_item["question"]["title"]
                    saved_answers = question_answer_dict.setdefault(item_question_id, set())
                    if platform_id in saved_answers:
                        have_saved_this_answer = True
                    else:
                        saved_answers.add(platform_id)

                if not have_saved_this_answer:
                    item_dict = {
                        "user_id": user_id,
                        "platform_id": platform_id,
                        "title": question_title,
                        "content": data_content,
                        "type": data_type,
                        "question_id": item_question_id
                    }
                    zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
            except Exception:
                # Best effort per item: log and move on, but let
                # KeyboardInterrupt/SystemExit propagate.
                print(traceback.format_exc())
                print(str(data_item))
    return
if __name__ == '__main__':
......@@ -336,10 +418,18 @@ if __name__ == '__main__':
account.login(captcha_lang='en', load_cookies=True)
#account.test_member_article()
zhihu_spider_data = "./zhihu_spider_data.txt"
zhihu_spider_fd = open(zhihu_spider_data,"w")
zhihu_spider_data_file = "./zhihu_spider_data.txt"
zhihu_spider_fd = open(zhihu_spider_data_file,"w")
zhihu_spider_question_data_file = "./zhihu_spider_question_data.txt"
zhihu_spider_question_fd = open(zhihu_spider_question_data_file,"w")
# 问题回答映射词典
question_answer_dict = dict()
cur_image_index = 0
account.zhihu_query_by_word(top_query_list[0],zhihu_spider_fd,cur_image_index)
account.zhihu_query_by_word(top_query_list[0],zhihu_spider_fd,zhihu_spider_question_fd,cur_image_index,question_answer_dict)
zhihu_spider_fd.close()
\ No newline at end of file
zhihu_spider_fd.close()
zhihu_spider_question_fd.close()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment