Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_embedding
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_embedding
Commits
d7443fce
Commit
d7443fce
authored
Nov 27, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
write result
parent
04dccced
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
39 deletions
+12
-39
answer_similarity.py
doc_similarity/answer_similarity.py
+12
-39
No files found.
doc_similarity/answer_similarity.py
View file @
d7443fce
...
@@ -37,24 +37,21 @@ def write_result():
...
@@ -37,24 +37,21 @@ def write_result():
embedding_dict
=
{}
embedding_dict
=
{}
for
item
in
get_answer_info_from_es
([
"id"
,
"answer"
,
"content_level"
]):
for
item
in
get_answer_info_from_es
([
"id"
,
"answer"
,
"content_level"
]):
count
+=
1
count
+=
1
if
count
<
1000
:
try
:
try
:
id
=
int
(
item
[
"_id"
])
id
=
int
(
item
[
"_id"
])
soup
=
BeautifulSoup
(
item
[
"_source"
][
"answer"
],
"html.parser"
)
soup
=
BeautifulSoup
(
item
[
"_source"
][
"answer"
],
"html.parser"
)
content
=
soup
.
get_text
()
content
=
soup
.
get_text
()
# content_level = str(item["_source"]["content_level"])
# content_level = str(item["_source"]["content_level"])
# print(count, id, content)
print
(
count
,
id
,
content
)
embedding_dict
[
id
]
=
bc
.
encode
([
content
])
.
tolist
()[
0
]
embedding_dict
[
id
]
=
bc
.
encode
([
content
])
.
tolist
()[
0
]
except
Exception
as
e
:
except
Exception
as
e
:
pass
pass
answer_ids
=
np
.
array
(
list
(
embedding_dict
.
keys
()))
.
astype
(
"int"
)
answer_ids
=
np
.
array
(
list
(
embedding_dict
.
keys
()))
.
astype
(
"int"
)
answer_embeddings
=
np
.
array
(
list
(
embedding_dict
.
values
()))
.
astype
(
"float32"
)
answer_embeddings
=
np
.
array
(
list
(
embedding_dict
.
values
()))
.
astype
(
"float32"
)
print
(
answer_embeddings
.
shape
)
print
(
answer_embeddings
.
shape
)
index
=
faiss
.
IndexFlatL2
(
answer_embeddings
.
shape
[
1
])
index
=
faiss
.
IndexFlatL2
(
answer_embeddings
.
shape
[
1
])
print
(
"trained: "
+
str
(
index
.
is_trained
))
index2
=
faiss
.
IndexIDMap
(
index
)
index2
=
faiss
.
IndexIDMap
(
index
)
index2
.
add_with_ids
(
answer_embeddings
,
answer_ids
)
index2
.
add_with_ids
(
answer_embeddings
,
answer_ids
)
print
(
"trained: "
+
str
(
index2
.
is_trained
))
print
(
"trained: "
+
str
(
index2
.
is_trained
))
...
@@ -64,18 +61,6 @@ def write_result():
...
@@ -64,18 +61,6 @@ def write_result():
# faiss.write_index(index2, index_path)
# faiss.write_index(index2, index_path)
# print(index_path)
# print(index_path)
# id = tmp_tuple[0]
# emb = np.array([embedding_dict[id]]).astype("float32")
# print(emb)
# D, I = index2.search(emb, 10)
# distances = D.tolist()[0]
# ids = I.tolist()[0]
# res = []
# for (index, i) in enumerate(distances):
# if i <= 1.0:
# res.append(ids[index])
# print(res, "\n")
for
(
id
,
emb
)
in
embedding_dict
.
items
():
for
(
id
,
emb
)
in
embedding_dict
.
items
():
emb
=
np
.
array
([
emb
])
.
astype
(
"float32"
)
emb
=
np
.
array
([
emb
])
.
astype
(
"float32"
)
D
,
I
=
index2
.
search
(
emb
,
10
)
D
,
I
=
index2
.
search
(
emb
,
10
)
...
@@ -83,8 +68,9 @@ def write_result():
...
@@ -83,8 +68,9 @@ def write_result():
ids
=
I
.
tolist
()[
0
]
ids
=
I
.
tolist
()[
0
]
res
=
[]
res
=
[]
for
(
index
,
i
)
in
enumerate
(
distances
):
for
(
index
,
i
)
in
enumerate
(
distances
):
if
i
<=
1.0
:
tmp_id
=
ids
[
index
]
res
.
append
(
ids
[
index
])
if
i
<=
1.0
and
tmp_id
!=
id
:
res
.
append
(
str
(
tmp_id
))
if
res
:
if
res
:
data
=
"{}:{}"
.
format
(
str
(
id
),
","
.
join
(
res
))
data
=
"{}:{}"
.
format
(
str
(
id
),
","
.
join
(
res
))
print
(
data
)
print
(
data
)
...
@@ -121,17 +107,4 @@ def save_result():
...
@@ -121,17 +107,4 @@ def save_result():
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
# bc = BertClient("172.16.44.82", check_length=False)
# sentence = """
# <p>做完私处整形手术,最好在一个月以后进行同房。因为过早同房,可能会对女性的私处造成损伤,甚至可能出现感染的情况。在恢复期间,女性可以适当的多吃水果蔬菜,多喝水,保持体内水分的充足。尽量不要吃刺激性过强的食物。在平时要注意私处的卫生,如果私处有瘙痒的情况,尽量不要用手直接的抓挠,坚持每天更换内裤,不要擅自用妇科清洗液,可以用温水轻轻擦拭私处。如果私处有不适感,需要及时去医院进行检查并治疗。</p>
# """
# sen1_em = bc.encode([sentence])
# sen2_em = bc.encode([sentence])
# print(type(sen1_em), sen1_em)
# print(sen2_em)
# print(cos_sim(sen1_em, sen2_em))
write_result
()
write_result
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment