Commit 3a5d56a3 authored by 段英荣

加入并发

parent 1212dc7d
......@@ -405,9 +405,9 @@ class ZhihuAccount(object):
question_id = str(data_item["object"]["question"]["id"])
if question_id not in question_answer_dict:
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
# question_answer_dict[question_id] = set()
# question_answer_dict[question_id].add(platform_id)
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id)
cur_image_index = self.zhihu_answers_list_by_question(question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index)
question_item_dict = {
......@@ -419,11 +419,11 @@ class ZhihuAccount(object):
"question_id": "",
"tags": self.get_tfidf_words_from_content(content)
}
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_question_fd,item_dict=question_item_dict)
# zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
# self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_question_fd,item_dict=question_item_dict)
zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
elif platform_id not in question_answer_dict[question_id]:
# question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict[question_id].add(platform_id)
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
else:
have_saved_this_answer = True
else:
......@@ -440,8 +440,8 @@ class ZhihuAccount(object):
"question_id": question_id,
"tags": self.get_tfidf_words_from_content(content)
}
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
# zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
# self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
except:
print(traceback.format_exc())
print(str(data_item))
......@@ -488,14 +488,14 @@ class ZhihuAccount(object):
question_title = data_item["question"]["title"]
if question_id not in question_answer_dict:
# question_answer_dict[question_id] = set()
# question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
question_id=question_id, platform_id=platform_id)
question_answer_dict[question_id] = set()
question_answer_dict[question_id].add(platform_id)
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
# question_id=question_id, platform_id=platform_id)
elif platform_id not in question_answer_dict[question_id]:
# question_answer_dict[question_id].add(platform_id)
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
question_id=question_id, platform_id=platform_id)
question_answer_dict[question_id].add(platform_id)
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
# question_id=question_id, platform_id=platform_id)
else:
have_saved_this_answer = True
......@@ -510,8 +510,8 @@ class ZhihuAccount(object):
"tags": self.get_tfidf_words_from_content(data_content)
}
# zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
# self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
except:
print(traceback.format_exc())
print(str(data_item))
......@@ -537,7 +537,7 @@ def get_query_word():
return query_word,g_cur_word_index
def concurrence_dispose_query_word(account_obj,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict):
def concurrence_dispose_query_word(account_obj):
try:
is_run = True
......@@ -545,12 +545,26 @@ def concurrence_dispose_query_word(account_obj,zhihu_spider_fd,zhihu_spider_ques
query_word,g_cur_word_index = get_query_word()
print("query_word:%s" % query_word)
zhihu_spider_data_file = "./zhihu_spider_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_fd = open(zhihu_spider_data_file, "w")
zhihu_spider_question_data_file = "./zhihu_spider_question_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_question_fd = open(zhihu_spider_question_data_file, "w")
# 问题回答映射词典
question_answer_dict = dict()
if query_word and len(query_word)>0:
account_obj.zhihu_query_by_word(query_word=query_word, zhihu_spider_fd=zhihu_spider_fd,
zhihu_spider_question_fd=zhihu_spider_question_fd,
question_answer_dict=question_answer_dict, cur_word_index=g_cur_word_index)
else:
is_run = False
zhihu_spider_fd.close()
zhihu_spider_question_fd.close()
except:
print(traceback.format_exc())
......@@ -564,15 +578,6 @@ if __name__ == '__main__':
account.add_jieba_tag_word()
#account.test_member_article()
zhihu_spider_data_file = "./zhihu_spider_data.txt"
zhihu_spider_fd = open(zhihu_spider_data_file,"w")
zhihu_spider_question_data_file = "./zhihu_spider_question_data.txt"
zhihu_spider_question_fd = open(zhihu_spider_question_data_file,"w")
# 问题回答映射词典
question_answer_dict = dict()
# cur_word_index = 0
# for query_word in top_query_list:
# cur_word_index += 1
......@@ -582,12 +587,9 @@ if __name__ == '__main__':
gevent_spawn_obj_list = list()
for cur_index in range(0,50,1):
g_obj = gevent.spawn(concurrence_dispose_query_word,account,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict)
g_obj = gevent.spawn(concurrence_dispose_query_word,account)
gevent_spawn_obj_list.append(g_obj)
for g_obj in gevent_spawn_obj_list:
g_obj.join()
zhihu_spider_fd.close()
zhihu_spider_question_fd.close()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment