Commit 462cd16e authored by 段英荣

增加图片存储目录

parent 36b96254
...@@ -36,9 +36,18 @@ import pymysql ...@@ -36,9 +36,18 @@ import pymysql
import jieba import jieba
import jieba.analyse import jieba.analyse
import os import os
from gevent import monkey; monkey.patch_socket()
import gevent
from threading import Thread, Lock
# Held by get_query_word() while it pops from top_query_list and advances
# g_cur_word_index.
mutex_for_get_query_word = Lock()
# Held while a platform id is added to the shared question -> answer-ids dict
# (see ZhihuAccount.mutex_for_zhihu_save_question_info, which shares this name).
mutex_for_zhihu_save_question_info = Lock()
# Held while a JSON line is written to one of the shared output files
# (see ZhihuAccount.mutex_for_zhihu_save_file_info).
mutex_for_zhihu_save_file= Lock()
# Counter advanced by get_query_word(); its value is passed to the crawler as
# cur_word_index.
g_cur_word_index = 0
top_query_list = [ top_query_list = [
"瘦脸针","双眼皮","水光针","手术面部提升","鼻翼缩小","玻尿酸","吸脂","线雕","鼻综合","光子嫩肤","瘦腿针","美白针","热玛吉","隆鼻","超声刀","脱毛","祛斑","果酸焕肤","面部吸脂","皮秒","瘦肩针","自体脂肪填充面部","牙齿矫正","微针","热拉提","鼻翼缩小","瘦脸","下巴","植发","溶脂针","点阵激光","双眼皮修复","小气泡","鼻基底","祛眼袋","隆胸","祛痘","开眼角","除皱","牙齿美白","埋线双眼皮","颧骨","下颌角","纹眉","激光脱毛","玻尿酸丰下巴","法令纹","玻尿酸隆鼻","洗牙","吸脂瘦大腿","溶脂","保妥适","黄金微针","自体脂肪填充","美白","黑眼圈","白瓷娃娃","祛疤","切开双眼皮","泪沟","光纤溶脂","磨骨","嗨体","肉毒素","丰胸(隆胸)","微针祛痘坑","激光祛斑","假体下巴","植发际线","面部提升","肋骨鼻","蜂巢皮秒","祛痘祛痘印","腰腹吸脂","瘦腿","面部填充","厚唇改薄术","下眼睑下至","溶解酶","私密","点痣","酒窝","女性私密紧致","艾莉薇","伊婉V","无针水光","自体脂肪","人中缩短","m22","激光点痣","丰唇","脸型","埋线隆鼻","埋线","收缩毛孔","黑脸娃娃","伊婉C","开外眼角","童颜针","妊娠纹" "瘦脸针","双眼皮","水光针","手术面部提升","鼻翼缩小","玻尿酸","吸脂","线雕","鼻综合","光子嫩肤","瘦腿针","美白针","热玛吉","隆鼻","超声刀","脱毛","祛斑","果酸焕肤","面部吸脂","皮秒","瘦肩针","自体脂肪填充面部","牙齿矫正","微针","热拉提","鼻翼缩小","瘦脸","下巴","植发","溶脂针","点阵激光","双眼皮修复","小气泡","鼻基底","祛眼袋","隆胸","祛痘","开眼角","除皱","牙齿美白","埋线双眼皮","颧骨","下颌角","纹眉","激光脱毛","玻尿酸丰下巴","法令纹","玻尿酸隆鼻","洗牙","吸脂瘦大腿","溶脂","保妥适","黄金微针","自体脂肪填充","美白","黑眼圈","白瓷娃娃","祛疤","切开双眼皮","泪沟","光纤溶脂","磨骨","嗨体","肉毒素","丰胸(隆胸)","微针祛痘坑","激光祛斑","假体下巴","植发际线","面部提升","肋骨鼻","蜂巢皮秒","祛痘祛痘印","腰腹吸脂","瘦腿","面部填充","厚唇改薄术","下眼睑下至","溶解酶","私密","点痣","酒窝","女性私密紧致","艾莉薇","伊婉V","无针水光","自体脂肪","人中缩短","m22","激光点痣","丰唇","脸型","埋线隆鼻","埋线","收缩毛孔","黑脸娃娃","伊婉C","开外眼角","童颜针","妊娠纹"
] ]
...@@ -277,6 +286,32 @@ class ZhihuAccount(object): ...@@ -277,6 +286,32 @@ class ZhihuAccount(object):
print(traceback.format_exc()) print(traceback.format_exc())
return [] return []
def mutex_for_zhihu_save_question_info(self, question_answer_dict, question_id, platform_id):
    """Record ``platform_id`` under ``question_id`` while holding the shared lock.

    The global name ``mutex_for_zhihu_save_question_info`` used in the body
    resolves to the module-level ``Lock`` of the same name; this method is a
    class attribute and therefore does not shadow it.

    Args:
        question_answer_dict: Dict mapping a question id to the set of
            platform ids already seen for it. Mutated in place.
        question_id: Key whose set is created on first use.
        platform_id: Value added to that set.

    Returns:
        None. Failures are printed and swallowed (best effort), as before.
    """
    # ``with`` guarantees the lock is released on every exit path, including
    # when the error handler itself fails.
    with mutex_for_zhihu_save_question_info:
        try:
            question_answer_dict.setdefault(question_id, set()).add(platform_id)
        except Exception:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # greenlet-exit signals are not silently swallowed.
            print(traceback.format_exc())
def mutex_for_zhihu_save_file_info(self, file_fd, item_dict):
    """Append ``item_dict`` as one JSON line to ``file_fd`` under the file lock.

    Args:
        file_fd: Writable text file object shared between workers.
        item_dict: JSON-serializable dict to write.

    Returns:
        None. Serialization and write failures are printed and swallowed
        (best effort), as before.
    """
    try:
        # Serialize before taking the lock so the critical section only
        # covers the write itself.
        line = json.dumps(item_dict) + "\n"
        # ``with`` releases the lock on every exit path.
        with mutex_for_zhihu_save_file:
            file_fd.write(line)
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # greenlet-exit signals are not silently swallowed.
        print(traceback.format_exc())
# 知乎个人文章列表 # 知乎个人文章列表
def test_member_article(self): def test_member_article(self):
member_article_url = "https://www.zhihu.com/api/v4/members/li-pei-rong-96/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=40&limit=20&sort_by=created" member_article_url = "https://www.zhihu.com/api/v4/members/li-pei-rong-96/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=40&limit=20&sort_by=created"
...@@ -316,6 +351,9 @@ class ZhihuAccount(object): ...@@ -316,6 +351,9 @@ class ZhihuAccount(object):
qiniu_url = upload_file(local_cropped_img_url_path) qiniu_url = upload_file(local_cropped_img_url_path)
content = content.replace(ori_img_url, qiniu_url) content = content.replace(ori_img_url, qiniu_url)
os.remove(local_img_url_path)
os.remove(local_cropped_img_url_path)
return content,cur_image_index return content,cur_image_index
except: except:
print(traceback.format_exc()) print(traceback.format_exc())
...@@ -367,8 +405,9 @@ class ZhihuAccount(object): ...@@ -367,8 +405,9 @@ class ZhihuAccount(object):
question_id = str(data_item["object"]["question"]["id"]) question_id = str(data_item["object"]["question"]["id"])
if question_id not in question_answer_dict: if question_id not in question_answer_dict:
question_answer_dict[question_id] = set() self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict[question_id].add(platform_id) # question_answer_dict[question_id] = set()
# question_answer_dict[question_id].add(platform_id)
cur_image_index = self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index) cur_image_index = self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index)
question_item_dict = { question_item_dict = {
...@@ -380,9 +419,11 @@ class ZhihuAccount(object): ...@@ -380,9 +419,11 @@ class ZhihuAccount(object):
"question_id": "", "question_id": "",
"tags": self.get_tfidf_words_from_content(content) "tags": self.get_tfidf_words_from_content(content)
} }
zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n") self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_question_fd,item_dict=question_item_dict)
# zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
elif platform_id not in question_answer_dict[question_id]: elif platform_id not in question_answer_dict[question_id]:
question_answer_dict[question_id].add(platform_id) # question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
else: else:
have_saved_this_answer = True have_saved_this_answer = True
else: else:
...@@ -399,13 +440,13 @@ class ZhihuAccount(object): ...@@ -399,13 +440,13 @@ class ZhihuAccount(object):
"question_id": question_id, "question_id": question_id,
"tags": self.get_tfidf_words_from_content(content) "tags": self.get_tfidf_words_from_content(content)
} }
zhihu_spider_fd.write(json.dumps(item_dict) + "\n") self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
# zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
except: except:
print(traceback.format_exc()) print(traceback.format_exc())
print(str(data_item)) print(str(data_item))
time.sleep(2) # time.sleep(2)
# 知乎问题对应的回答列表 # 知乎问题对应的回答列表
...@@ -447,10 +488,14 @@ class ZhihuAccount(object): ...@@ -447,10 +488,14 @@ class ZhihuAccount(object):
question_title = data_item["question"]["title"] question_title = data_item["question"]["title"]
if question_id not in question_answer_dict: if question_id not in question_answer_dict:
question_answer_dict[question_id] = set() # question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id) # question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
question_id=question_id, platform_id=platform_id)
elif platform_id not in question_answer_dict[question_id]: elif platform_id not in question_answer_dict[question_id]:
question_answer_dict[question_id].add(platform_id) # question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
question_id=question_id, platform_id=platform_id)
else: else:
have_saved_this_answer = True have_saved_this_answer = True
...@@ -465,8 +510,8 @@ class ZhihuAccount(object): ...@@ -465,8 +510,8 @@ class ZhihuAccount(object):
"tags": self.get_tfidf_words_from_content(data_content) "tags": self.get_tfidf_words_from_content(data_content)
} }
zhihu_spider_fd.write(json.dumps(item_dict) + "\n") # zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
except: except:
print(traceback.format_exc()) print(traceback.format_exc())
print(str(data_item)) print(str(data_item))
...@@ -474,6 +519,42 @@ class ZhihuAccount(object): ...@@ -474,6 +519,42 @@ class ZhihuAccount(object):
return cur_image_index return cur_image_index
def get_query_word():
    """Hand out the next query word together with its running index.

    Pops the last entry of the module-level ``top_query_list`` and advances
    ``g_cur_word_index``; both happen under ``mutex_for_get_query_word``.

    Returns:
        A ``(query_word, index)`` tuple. When the list is exhausted,
        ``query_word`` is ``""`` and the index is left unchanged; callers
        treat the empty word as the stop signal.
    """
    global g_cur_word_index
    # The return value is built while the lock is still held, so the index
    # always belongs to the word that was popped with it.
    with mutex_for_get_query_word:
        if not top_query_list:
            # Exhaustion is the normal end of work, not an error: no
            # traceback and no index bump.
            return "", g_cur_word_index
        g_cur_word_index += 1
        return top_query_list.pop(), g_cur_word_index
def concurrence_dispose_query_word(account_obj, zhihu_spider_fd, zhihu_spider_question_fd, question_answer_dict):
    """Worker loop: keep fetching query words and crawling them until none remain.

    Args:
        account_obj: Object exposing ``zhihu_query_by_word(...)``.
        zhihu_spider_fd: Output file object forwarded to the crawler.
        zhihu_spider_question_fd: Output file object forwarded to the crawler.
        question_answer_dict: Shared dict forwarded to the crawler.

    Returns:
        None. The loop ends when ``get_query_word()`` yields an empty word.
    """
    while True:
        # Local name kept distinct from the module-level g_cur_word_index.
        query_word, cur_word_index = get_query_word()
        print("query_word:%s" % query_word)
        if not query_word:
            break
        try:
            account_obj.zhihu_query_by_word(
                query_word=query_word,
                zhihu_spider_fd=zhihu_spider_fd,
                zhihu_spider_question_fd=zhihu_spider_question_fd,
                question_answer_dict=question_answer_dict,
                cur_word_index=cur_word_index,
            )
        except Exception:
            # A failure on one word is logged and the worker moves on to the
            # next word instead of stopping.
            print(traceback.format_exc())
if __name__ == '__main__': if __name__ == '__main__':
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.settings")
...@@ -492,13 +573,21 @@ if __name__ == '__main__': ...@@ -492,13 +573,21 @@ if __name__ == '__main__':
# 问题回答映射词典 # 问题回答映射词典
question_answer_dict = dict() question_answer_dict = dict()
cur_word_index = 0 # cur_word_index = 0
for query_word in top_query_list: # for query_word in top_query_list:
cur_word_index += 1 # cur_word_index += 1
print("query_word:%s" % query_word,flush=True) # print("query_word:%s" % query_word,flush=True)
account.zhihu_query_by_word(query_word=query_word,zhihu_spider_fd=zhihu_spider_fd,zhihu_spider_question_fd=zhihu_spider_question_fd,question_answer_dict=question_answer_dict,cur_word_index=cur_word_index) # account.zhihu_query_by_word(query_word=query_word,zhihu_spider_fd=zhihu_spider_fd,zhihu_spider_question_fd=zhihu_spider_question_fd,question_answer_dict=question_answer_dict,cur_word_index=cur_word_index)
gevent_spawn_obj_list = list()
for cur_index in range(0,50,1):
g_obj = gevent.spawn(concurrence_dispose_query_word,account,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict)
gevent_spawn_obj_list.append(g_obj)
for g_obj in gevent_spawn_obj_list:
g_obj.join()
zhihu_spider_fd.close() zhihu_spider_fd.close()
zhihu_spider_question_fd.close() zhihu_spider_question_fd.close()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment