Commit b7b19ad0 authored by 段英荣's avatar 段英荣

应用用户搜索词爬取

parent b480d5d9
......@@ -363,108 +363,117 @@ class ZhihuAccount(object):
# Zhihu keyword search: crawl search_v3 results for a query word and persist
# answers/articles (and newly seen questions) as JSON lines.
def zhihu_query_by_word(self, query_word, zhihu_spider_fd, zhihu_spider_question_fd, question_answer_dict, cur_word_index):
    """Search Zhihu for ``query_word`` (10 pages x 10 results), localize image
    URLs embedded in each hit's HTML content, and append one JSON object per
    saved item to the spider output files.

    Args:
        query_word: keyword sent to the Zhihu search_v3 API.
        zhihu_spider_fd: writable file object for answer/article JSON lines.
        zhihu_spider_question_fd: writable file object for question JSON lines.
        question_answer_dict: dedup state mapping question_id -> set of saved
            answer platform ids; mutated in place.
        cur_word_index: index of the current search word, forwarded to image
            handling so downloaded images get unique names.

    Errors are logged and swallowed: per-item failures skip that item, and any
    other failure aborts the search for this word without raising.
    """
    try:
        cur_image_index = 0
        for begin_index in range(0, 100, 10):
            # Build the search_v3 request URL for this page of results.
            query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?"
            query_params_dict = {
                "q": query_word,
                "offset": begin_index,
                "limit": 10,
                "lc_idx": 22,
                "show_all_topics": 0,
                "search_hash_id": "dc4a11848e2540981cf28634ff3609c0",
                "vertical_info": "0,0,0,0,0,0,0,0,0,1",
                "correction": 1,
                "t": "general"
            }
            query_by_word_url += urllib.parse.urlencode(query_params_dict)
            res = self.session.get(query_by_word_url, allow_redirects=False)
            print(10 * "*")
            print(query_by_word_url)
            print(res)
            # Response body is brotli-compressed JSON.
            raw_content = brotli.decompress(res.content)
            print(type(raw_content))
            raw_content_dict = json.loads(str(raw_content, encoding="utf-8"))
            if "data" not in raw_content_dict:
                continue
            for data_item in raw_content_dict["data"]:
                if data_item["type"] != "search_result":
                    continue
                try:
                    data_type = data_item["object"]["type"]
                    content = data_item["object"].get("content", "")
                    platform_id = str(data_item["object"]["id"])
                    user_id = random.choice(majia_user_list)
                    question_id = ""
                    have_saved_this_answer = False
                    # Rewrite both <img src=...> and lazy-loaded
                    # data-original=... image URLs inside the HTML content.
                    img_url_list = re.findall('src="(.*?)"', content)
                    content, cur_image_index = self._dispose_content_url(
                        content=content,
                        img_url_list=img_url_list,
                        cur_image_index=cur_image_index,
                        cur_word_index=cur_word_index)
                    img_url_list = re.findall('data-original="(.*?)"', content)
                    content, cur_image_index = self._dispose_content_url(
                        content=content,
                        img_url_list=img_url_list,
                        cur_image_index=cur_image_index,
                        cur_word_index=cur_word_index)
                    if data_type == "article":
                        # Strip the search-highlight markers from the title.
                        title = data_item["object"]["title"]
                        title = title.replace("<em>", "")
                        title = title.replace("</em>", "")
                    elif data_type == "answer":
                        title = data_item["object"]["question"]["name"]
                        title = title.replace("<em>", "")
                        title = title.replace("</em>", "")
                        question_id = str(data_item["object"]["question"]["id"])
                        if question_id not in question_answer_dict:
                            # First time this question is seen: record it,
                            # crawl its other answers, and persist the
                            # question itself.
                            question_answer_dict[question_id] = set()
                            question_answer_dict[question_id].add(platform_id)
                            cur_image_index = self.zhihu_answers_list_by_question(
                                question_id,
                                question_answer_dict,
                                zhihu_spider_fd,
                                cur_image_index,
                                cur_word_index)
                            question_item_dict = {
                                "user_id": user_id,
                                "platform_id": question_id,
                                "title": title,
                                "content": content,
                                "type": data_type,
                                "question_id": "",
                                "tags": self.get_tfidf_words_from_content(content)
                            }
                            zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
                        elif platform_id not in question_answer_dict[question_id]:
                            question_answer_dict[question_id].add(platform_id)
                        else:
                            # Answer already persisted while crawling this
                            # question's answer list — do not save it twice.
                            have_saved_this_answer = True
                    else:
                        print("type is:%s" % data_type)
                        title = ""
                    if not have_saved_this_answer:
                        item_dict = {
                            "user_id": user_id,
                            "platform_id": platform_id,
                            "title": title,
                            "content": content,
                            "type": data_type,
                            "question_id": question_id,
                            "tags": self.get_tfidf_words_from_content(content)
                        }
                        zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
                except Exception:
                    # Per-item failure (missing keys, bad HTML, ...): log and
                    # continue with the next search hit. Narrowed from a bare
                    # except so Ctrl-C / SystemExit still propagate.
                    print(traceback.format_exc())
                    print(str(data_item))
    except Exception:
        print(traceback.format_exc())
# 知乎问题对应的回答列表
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment