Commit e741a98a authored by 段英荣

增加图片存储目录

parent 75c11956
......@@ -35,6 +35,7 @@ import traceback
import pymysql
import jieba
import jieba.analyse
import os
......@@ -293,20 +294,24 @@ class ZhihuAccount(object):
print(50*"*")
def _dispose_content_url(self,content,img_url_list,cur_image_index):
def _dispose_content_url(self,content,img_url_list,cur_image_index,cur_word_index):
try:
img_dir = "./img_" + str(cur_word_index)
if not os.path.exists(img_dir):
os.makedirs(img_dir)
for ori_img_url in img_url_list:
if ori_img_url.find(".jpg") >= 0 or ori_img_url.find(".png") >= 0:
cur_image_index += 1
local_img_url_path = "./image/img_" + str(cur_image_index) + ".png"
local_img_url_path = img_dir + "/img_" + str(cur_image_index) + ".png"
print(ori_img_url, local_img_url_path)
urlretrieve(ori_img_url, local_img_url_path)
local_cv2_img = cv2.imread(local_img_url_path)
height, weidth, channel = local_cv2_img.shape
local_cropped_img = local_cv2_img[0:(height - 100), 0:weidth]
local_cropped_img_url_path = "./image/cropped_image_" + str(cur_image_index) + ".png"
local_cropped_img_url_path = img_dir + "/cropped_image_" + str(cur_image_index) + ".png"
cv2.imwrite(local_cropped_img_url_path, local_cropped_img)
qiniu_url = upload_file(local_cropped_img_url_path)
content = content.replace(ori_img_url, qiniu_url)
......@@ -317,8 +322,9 @@ class ZhihuAccount(object):
return content,cur_image_index
# 知乎搜索词搜索
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,zhihu_spider_question_fd,cur_image_index,question_answer_dict):
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict,cur_word_index):
cur_image_index = 0
for begin_index in range(0,200,10):
query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
"show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
......@@ -344,10 +350,10 @@ class ZhihuAccount(object):
have_saved_this_answer = False
img_url_list = re.findall('src="(.*?)"', content)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index,cur_word_index=cur_word_index)
img_url_list = re.findall('data-original="(.*?)"', content)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index)
content,cur_image_index = self._dispose_content_url(content=content,img_url_list=img_url_list,cur_image_index=cur_image_index,cur_word_index=cur_word_index)
if data_type == "article":
......@@ -363,7 +369,7 @@ class ZhihuAccount(object):
if question_id not in question_answer_dict:
question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id)
self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index)
self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index)
question_item_dict = {
"user_id": user_id,
......@@ -403,7 +409,7 @@ class ZhihuAccount(object):
# 知乎问题对应的回答列表
def zhihu_answers_list_by_question(self,question_id,question_answer_dict,zhihu_spider_fd,cur_image_index):
def zhihu_answers_list_by_question(self,question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index):
for begin_index in range(0,200,10):
answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + \
......@@ -430,11 +436,11 @@ class ZhihuAccount(object):
img_url_list = re.findall('src="(.*?)"', data_content)
data_content, cur_image_index = self._dispose_content_url(content=data_content, img_url_list=img_url_list,
cur_image_index=cur_image_index)
cur_image_index=cur_image_index,cur_word_index=cur_word_index)
img_url_list = re.findall('data-original="(.*?)"', data_content)
data_content, cur_image_index = self._dispose_content_url(content=data_content, img_url_list=img_url_list,
cur_image_index=cur_image_index)
cur_image_index=cur_image_index,cur_word_index=cur_word_index)
if data_type == "answer" and "question" in data_item:
question_id = str(data_item["question"]["id"])
......@@ -483,11 +489,12 @@ if __name__ == '__main__':
# 问题回答映射词典
question_answer_dict = dict()
cur_image_index = 0
cur_word_index = 0
for query_word in top_query_list:
print("query_word:%s" % query_word)
account.zhihu_query_by_word(query_word,zhihu_spider_fd,zhihu_spider_question_fd,cur_image_index,question_answer_dict)
cur_word_index += 1
print("query_word:%s" % query_word,flush=True)
account.zhihu_query_by_word(query_word=query_word,zhihu_spider_fd=zhihu_spider_fd,zhihu_spider_question_fd=zhihu_spider_question_data_file,question_answer_dict=question_answer_dict,cur_word_index=cur_word_index)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment