Commit b7b19ad0 authored by 段英荣

应用用户搜索词爬取

parent b480d5d9
......@@ -363,9 +363,9 @@ class ZhihuAccount(object):
# 知乎搜索词搜索
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict,cur_word_index):
try:
cur_image_index = 0
for begin_index in range(0,100,10):
for begin_index in range(0, 100, 10):
# query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
# "show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
# "vertical_info=0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1" + "&q=" + str(query_word) + "&offset=" + str(begin_index) + "&limit=10"
......@@ -384,14 +384,14 @@ class ZhihuAccount(object):
}
query_by_word_url += urllib.parse.urlencode(query_params_dict)
res = self.session.get(query_by_word_url,allow_redirects=False)
print(10*"*")
res = self.session.get(query_by_word_url, allow_redirects=False)
print(10 * "*")
print(query_by_word_url)
print(res)
raw_content = brotli.decompress(res.content)
print(type(raw_content))
raw_content_dict = json.loads(str(raw_content,encoding="utf-8"))
raw_content_dict = json.loads(str(raw_content, encoding="utf-8"))
if "data" in raw_content_dict:
for data_item in raw_content_dict["data"]:
......@@ -406,19 +406,24 @@ class ZhihuAccount(object):
have_saved_this_answer = False
img_url_list = re.findall('src="(.*?)"', content)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index,cur_word_index=cur_word_index)
content, cur_image_index = self._dispose_content_url(content=content,
img_url_list=img_url_list,
cur_image_index=cur_image_index,
cur_word_index=cur_word_index)
img_url_list = re.findall('data-original="(.*?)"', content)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index,cur_word_index=cur_word_index)
content, cur_image_index = self._dispose_content_url(content=content,
img_url_list=img_url_list,
cur_image_index=cur_image_index,
cur_word_index=cur_word_index)
if data_type == "article":
title = data_item["object"]["title"]
title = title.replace("<em>","")
title = title.replace("<em>", "")
title = title.replace("</em>", "")
elif data_type == "answer":
title = data_item["object"]["question"]["name"]
title = title.replace("<em>","")
title = title.replace("<em>", "")
title = title.replace("</em>", "")
question_id = str(data_item["object"]["question"]["id"])
......@@ -426,7 +431,11 @@ class ZhihuAccount(object):
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id)
cur_image_index = self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index)
cur_image_index = self.zhihu_answers_list_by_question(question_id,
question_answer_dict,
zhihu_spider_fd,
cur_image_index,
cur_word_index)
question_item_dict = {
"user_id": user_id,
......@@ -463,8 +472,8 @@ class ZhihuAccount(object):
except:
print(traceback.format_exc())
print(str(data_item))
# time.sleep(2)
except:
print(traceback.format_exc())
# 知乎问题对应的回答列表
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment