Commit b480d5d9 authored by 段英荣

应用用户搜索词爬取

parent 1e32f0fc
......@@ -57,6 +57,8 @@ majia_user_list = [
"32269952","32269956","32269962","32269966","32269973","32269978","32269980","32269982","32269987","32269989","32270003","32270004","32270007","32270012","32270015","32270017","32270020","32270024","32270027","32270031","32270041","32270044","32270047","32270050","32270054","32270055","32270057","32270059","32270063","32270066","32269913","32269918","32269920","32269927","32269933","32269939","32269943","32269948","32269957","32269965","32269972","32269979","32269983","32269988","32269995","32270002","32270005","32270011","32270016","32270022","32270029","32270036","32270040","32270051","32270061","32270065","32270071","32270075","32270081","32270085","32270094","32270096","32270110","32270116","32270121","32270141","32270147","32270152","32270156","32270161","32270114","32270119","32270122","32270125","32270129","32270131","32270133","32270134","32270137","32270167","32270068","32270070","32270076","32270078","32270083","32270087","32270093","32270095","32270099","32270105","32269992","32270018","32270023","32270030","32270034","32270043","32270048","32270052","32270056","32270060"
]
# Pool of query words still waiting to be handed out; get_query_word() fills it
# from the query-word file on first use and pops words from it.
g_query_word_set = set()
# Set to True once get_query_word() has started loading the query-word file, so
# the file is read at most once even after the set has been drained.
g_if_get_query_word = False
# NOTE(review): presumably connection settings for an internal host; their use
# is not visible in this excerpt — confirm against the rest of the file.
ZHENGXING_HOST = "172.16.30.141"
ZHENGXING_USER = "work"
......@@ -363,7 +365,7 @@ class ZhihuAccount(object):
def zhihu_query_by_word(self,query_word,zhihu_spider_fd,zhihu_spider_question_fd,question_answer_dict,cur_word_index):
cur_image_index = 0
for begin_index in range(0,200,10):
for begin_index in range(0,100,10):
# query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
# "show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
# "vertical_info=0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1" + "&q=" + str(query_word) + "&offset=" + str(begin_index) + "&limit=10"
......@@ -468,7 +470,7 @@ class ZhihuAccount(object):
# 知乎问题对应的回答列表
def zhihu_answers_list_by_question(self,question_id,question_answer_dict,zhihu_spider_fd,cur_image_index,cur_word_index):
for begin_index in range(0,200,10):
for begin_index in range(0,100,10):
# answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + \
# "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&" \
# "sort_by=default&platform=desktop" + "&offset=" + str(begin_index) + "&limit=10"
......@@ -551,16 +553,35 @@ def get_query_word():
global g_cur_word_index
query_word = ""
ret_word = ""
g_cur_word_index += 1
try:
query_word = top_query_list.pop()
# query_word = top_query_list.pop()
global g_query_word_set
global g_if_get_query_word
if len(g_query_word_set) == 0 and not g_if_get_query_word:
g_if_get_query_word = True
offi_query_word_fd = open("/data/log/spider/test_service/offi_query_word_from_20190101_20200115.txt","r")
for line in offi_query_word_fd:
line = line.strip()
line = line.strip("\r")
line = line.strip("\t")
line = line.strip(" ")
query_word,query_counts = line.split("\t")
query_word = query_word.strip()
g_query_word_set.add(query_word)
offi_query_word_fd.close()
ret_word = g_query_word_set.pop()
except:
print(traceback.format_exc())
mutex_for_get_query_word.release()
return query_word,g_cur_word_index
return ret_word,g_cur_word_index
def concurrence_dispose_query_word(account_obj):
......@@ -571,10 +592,10 @@ def concurrence_dispose_query_word(account_obj):
query_word,g_cur_word_index = get_query_word()
print("query_word:%s" % query_word)
zhihu_spider_data_file = "./zhihu_spider_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_data_file = "./data/zhihu_spider_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_fd = open(zhihu_spider_data_file, "w")
zhihu_spider_question_data_file = "./zhihu_spider_question_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_question_data_file = "./data/zhihu_spider_question_data_for_query_word_" + str(query_word) + ".txt"
zhihu_spider_question_fd = open(zhihu_spider_question_data_file, "w")
# 问题回答映射词典
......@@ -612,7 +633,7 @@ if __name__ == '__main__':
gevent_spawn_obj_list = list()
for cur_index in range(0,50,1):
for cur_index in range(0,500,1):
g_obj = gevent.spawn(concurrence_dispose_query_word,account)
gevent_spawn_obj_list.append(g_obj)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment