Commit d7443fce authored by 赵威

write result

parent 04dccced
...@@ -37,24 +37,21 @@ def write_result(): ...@@ -37,24 +37,21 @@ def write_result():
embedding_dict = {} embedding_dict = {}
for item in get_answer_info_from_es(["id", "answer", "content_level"]): for item in get_answer_info_from_es(["id", "answer", "content_level"]):
count += 1 count += 1
if count < 1000: try:
try: id = int(item["_id"])
id = int(item["_id"]) soup = BeautifulSoup(item["_source"]["answer"], "html.parser")
soup = BeautifulSoup(item["_source"]["answer"], "html.parser") content = soup.get_text()
content = soup.get_text() # content_level = str(item["_source"]["content_level"])
# content_level = str(item["_source"]["content_level"]) # print(count, id, content)
print(count, id, content) embedding_dict[id] = bc.encode([content]).tolist()[0]
embedding_dict[id] = bc.encode([content]).tolist()[0] except Exception as e:
except Exception as e: pass
pass
answer_ids = np.array(list(embedding_dict.keys())).astype("int") answer_ids = np.array(list(embedding_dict.keys())).astype("int")
answer_embeddings = np.array(list(embedding_dict.values())).astype("float32") answer_embeddings = np.array(list(embedding_dict.values())).astype("float32")
print(answer_embeddings.shape) print(answer_embeddings.shape)
index = faiss.IndexFlatL2(answer_embeddings.shape[1]) index = faiss.IndexFlatL2(answer_embeddings.shape[1])
print("trained: " + str(index.is_trained))
index2 = faiss.IndexIDMap(index) index2 = faiss.IndexIDMap(index)
index2.add_with_ids(answer_embeddings, answer_ids) index2.add_with_ids(answer_embeddings, answer_ids)
print("trained: " + str(index2.is_trained)) print("trained: " + str(index2.is_trained))
...@@ -64,18 +61,6 @@ def write_result(): ...@@ -64,18 +61,6 @@ def write_result():
# faiss.write_index(index2, index_path) # faiss.write_index(index2, index_path)
# print(index_path) # print(index_path)
# id = tmp_tuple[0]
# emb = np.array([embedding_dict[id]]).astype("float32")
# print(emb)
# D, I = index2.search(emb, 10)
# distances = D.tolist()[0]
# ids = I.tolist()[0]
# res = []
# for (index, i) in enumerate(distances):
# if i <= 1.0:
# res.append(ids[index])
# print(res, "\n")
for (id, emb) in embedding_dict.items(): for (id, emb) in embedding_dict.items():
emb = np.array([emb]).astype("float32") emb = np.array([emb]).astype("float32")
D, I = index2.search(emb, 10) D, I = index2.search(emb, 10)
...@@ -83,8 +68,9 @@ def write_result(): ...@@ -83,8 +68,9 @@ def write_result():
ids = I.tolist()[0] ids = I.tolist()[0]
res = [] res = []
for (index, i) in enumerate(distances): for (index, i) in enumerate(distances):
if i <= 1.0: tmp_id = ids[index]
res.append(ids[index]) if i <= 1.0 and tmp_id != id:
res.append(str(tmp_id))
if res: if res:
data = "{}:{}".format(str(id), ",".join(res)) data = "{}:{}".format(str(id), ",".join(res))
print(data) print(data)
...@@ -121,17 +107,4 @@ def save_result(): ...@@ -121,17 +107,4 @@ def save_result():
if __name__ == "__main__": if __name__ == "__main__":
# bc = BertClient("172.16.44.82", check_length=False)
# sentence = """
# <p>做完私处整形手术,最好在一个月以后进行同房。因为过早同房,可能会对女性的私处造成损伤,甚至可能出现感染的情况。在恢复期间,女性可以适当的多吃水果蔬菜,多喝水,保持体内水分的充足。尽量不要吃刺激性过强的食物。在平时要注意私处的卫生,如果私处有瘙痒的情况,尽量不要用手直接的抓挠,坚持每天更换内裤,不要擅自用妇科清洗液,可以用温水轻轻擦拭私处。如果私处有不适感,需要及时去医院进行检查并治疗。</p>
# """
# sen1_em = bc.encode([sentence])
# sen2_em = bc.encode([sentence])
# print(type(sen1_em), sen1_em)
# print(sen2_em)
# print(cos_sim(sen1_em, sen2_em))
write_result() write_result()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment