Commit 22ae3621 authored by 赵威

rewrite data with es_scan

parent 458df7cd
...@@ -2,7 +2,7 @@ git+ssh://git@git.wanmeizhensuo.com/backend/gm-rpcd.git@v0.2.5 ...@@ -2,7 +2,7 @@ git+ssh://git@git.wanmeizhensuo.com/backend/gm-rpcd.git@v0.2.5
faiss-cpu==1.6.3 faiss-cpu==1.6.3
pandas==1.1.1 pandas==1.1.1
elasticsearch==7.9.1 elasticsearch==7.0.0
opencv-python==4.4.0.42 opencv-python==4.4.0.42
pillow==7.2.0 pillow==7.2.0
dlib==19.21.0 dlib==19.21.0
......
...@@ -8,7 +8,7 @@ import faiss ...@@ -8,7 +8,7 @@ import faiss
import numpy as np import numpy as np
from utils.cache import redis_client3 from utils.cache import redis_client3
from utils.es import es_query, es_scan from utils.es import es_scan
from utils.images import face_to_vec, url_to_ndarray from utils.images import face_to_vec, url_to_ndarray
...@@ -56,23 +56,22 @@ def save_diary_image_info(save_file, face_to_vec_f): ...@@ -56,23 +56,22 @@ def save_diary_image_info(save_file, face_to_vec_f):
} }
} }
count = 0 count = 0
results = es_scan("diary", q)
with open(save_file, "w") as f: with open(save_file, "w") as f:
for start in [0, 100000, 200000, 300000]: for item in results:
res_dict = es_query("diary", q, offset=start, size=100000) diary_id = item["_id"]
# before_cover_url = item["_source"]["before_ocver_url"] + "-w"
for item in res_dict["hits"]["hits"]: after_cover_url = item["_source"]["after_ocver_url"] + "-w"
diary_id = item["_source"]["id"] print("count: " + str(count) + " " + str(diary_id))
# before_cover_url = item["_source"]["before_cover_url"] + "-w" img = url_to_ndarray(after_cover_url)
after_cover_url = item["_source"]["after_cover_url"] + "-w" if img.any():
img = url_to_ndarray(after_cover_url) count += 1
if img.any(): print("count: " + str(count) + " " + str(diary_id))
count += 1 faces = face_to_vec_f(img)
print("count: " + str(count) + " " + str(diary_id)) for face in faces:
faces = face_to_vec_f(img) line = str(diary_id) + "\t" + face["feature"] + "\n"
for face in faces: # print(line)
line = str(diary_id) + "\t" + face["feature"] + "\n" f.write(line)
# print(line)
f.write(line)
def save_faiss_index(load_file, save_path): def save_faiss_index(load_file, save_path):
...@@ -172,99 +171,53 @@ def main(): ...@@ -172,99 +171,53 @@ def main():
shape_predictor = dlib.shape_predictor(shape_model_path) shape_predictor = dlib.shape_predictor(shape_model_path)
face_to_vec_f = lambda img: face_to_vec(img, face_rec, face_detector, shape_predictor) face_to_vec_f = lambda img: face_to_vec(img, face_rec, face_detector, shape_predictor)
# save_diary_image_info(diary_after_cover_vec_file, face_to_vec_f) save_diary_image_info(diary_after_cover_vec_file, face_to_vec_f)
# save_faiss_index(diary_after_cover_vec_file, faiss_index_path) save_faiss_index(diary_after_cover_vec_file, faiss_index_path)
# faiss_index = faiss.read_index(faiss_index_path) faiss_index = faiss.read_index(faiss_index_path)
# imgs = [ imgs = [
# "https://pic.igengmei.com/2020/07/03/1437/1b9975bb0b81-w", "https://pic.igengmei.com/2020/07/01/1812/ca64827a83da-w", "https://pic.igengmei.com/2020/07/03/1437/1b9975bb0b81-w", "https://pic.igengmei.com/2020/07/01/1812/ca64827a83da-w",
# "https://pic.igengmei.com/2020/07/04/1711/24f4131a9b1e-w", "https://pic.igengmei.com/2020/07/04/1507/e17a995be219-w" "https://pic.igengmei.com/2020/07/04/1711/24f4131a9b1e-w", "https://pic.igengmei.com/2020/07/04/1507/e17a995be219-w"
# ] ]
# for img_url in imgs: for img_url in imgs:
# res = get_similar_diary_ids_by_url(img_url, faiss_index, face_to_vec_f, limit=0.18232107) res = get_similar_diary_ids_by_url(img_url, faiss_index, face_to_vec_f, limit=0.18232107)
# print(res) print(res)
# print("@@@@@@@@") print("@@@@@@@@")
# a = [ a = [
# -0.08361373096704483, 0.06760436296463013, 0.10752949863672256, -0.020746365189552307, -0.07035162299871445, -0.08361373096704483, 0.06760436296463013, 0.10752949863672256, -0.020746365189552307, -0.07035162299871445,
# -0.014547230675816536, -0.043201886117458344, -0.12196271121501923, 0.13929598033428192, -0.1360183209180832, -0.014547230675816536, -0.043201886117458344, -0.12196271121501923, 0.13929598033428192, -0.1360183209180832,
# 0.23247791826725006, -0.08867999166250229, -0.24177594482898712, -0.05600903555750847, -0.05371646583080292, 0.23247791826725006, -0.08867999166250229, -0.24177594482898712, -0.05600903555750847, -0.05371646583080292,
# 0.22015368938446045, -0.12883149087429047, -0.0822330191731453, -0.0413128100335598, 0.08704500645399094, 0.22015368938446045, -0.12883149087429047, -0.0822330191731453, -0.0413128100335598, 0.08704500645399094,
# 0.10081718862056732, -0.03764188289642334, 0.036720920354127884, 0.04766431450843811, -0.0685625970363617, 0.10081718862056732, -0.03764188289642334, 0.036720920354127884, 0.04766431450843811, -0.0685625970363617,
# -0.38336044549942017, -0.10978807508945465, -0.07328074425458908, -0.023904308676719666, -0.007438751868903637, -0.38336044549942017, -0.10978807508945465, -0.07328074425458908, -0.023904308676719666, -0.007438751868903637,
# -0.09545779973268509, 0.027364756911993027, -0.1537190079689026, -0.04008519649505615, -0.03581209108233452, -0.09545779973268509, 0.027364756911993027, -0.1537190079689026, -0.04008519649505615, -0.03581209108233452,
# 0.04322449117898941, -0.05686069279909134, -0.11610691249370575, 0.1640746295452118, -0.004643512889742851, 0.04322449117898941, -0.05686069279909134, -0.11610691249370575, 0.1640746295452118, -0.004643512889742851,
# -0.34821364283561707, 0.03711444139480591, -0.0026186704635620117, 0.1917344480752945, 0.14298999309539795, -0.34821364283561707, 0.03711444139480591, -0.0026186704635620117, 0.1917344480752945, 0.14298999309539795,
# 0.04084448516368866, 0.06119539216160774, -0.12611950933933258, 0.10941470414400101, -0.20786598324775696, 0.04084448516368866, 0.06119539216160774, -0.12611950933933258, 0.10941470414400101, -0.20786598324775696,
# 0.03435457497835159, 0.11412393301725388, 0.0602775476872921, 0.054409340023994446, -0.002967053558677435, 0.03435457497835159, 0.11412393301725388, 0.0602775476872921, 0.054409340023994446, -0.002967053558677435,
# -0.12524624168872833, 0.026284342631697655, 0.08236880600452423, -0.10654348134994507, 0.00403654295951128, -0.12524624168872833, 0.026284342631697655, 0.08236880600452423, -0.10654348134994507, 0.00403654295951128,
# 0.10716681182384491, -0.08270247280597687, 0.018992319703102112, -0.11595900356769562, 0.18344789743423462, 0.10716681182384491, -0.08270247280597687, 0.018992319703102112, -0.11595900356769562, 0.18344789743423462,
# 0.0895184576511383, -0.1307670772075653, -0.15750591456890106, 0.11103398352861404, -0.13521818816661835, 0.0895184576511383, -0.1307670772075653, -0.15750591456890106, 0.11103398352861404, -0.13521818816661835,
# -0.03199139982461929, 0.11129119992256165, -0.17407448589801788, -0.20658859610557556, -0.3114454746246338, -0.03199139982461929, 0.11129119992256165, -0.17407448589801788, -0.20658859610557556, -0.3114454746246338,
# 0.01914297416806221, 0.39955294132232666, 0.12365783005952835, -0.14545315504074097, -0.03254598751664162, 0.01914297416806221, 0.39955294132232666, 0.12365783005952835, -0.14545315504074097, -0.03254598751664162,
# -0.10342024266719818, 0.03375910595059395, 0.11272192746400833, 0.21788232028484344, 0.08588762581348419, -0.10342024266719818, 0.03375910595059395, 0.11272192746400833, 0.21788232028484344, 0.08588762581348419,
# 0.012640122324228287, -0.07646650820970535, -0.043292030692100525, 0.21306097507476807, -0.12407292425632477, 0.012640122324228287, -0.07646650820970535, -0.043292030692100525, 0.21306097507476807, -0.12407292425632477,
# -0.025112995877861977, 0.2634827196598053, 0.005047444254159927, 0.06562616676092148, -0.07397496700286865, -0.025112995877861977, 0.2634827196598053, 0.005047444254159927, 0.06562616676092148, -0.07397496700286865,
# 0.06206338107585907, -0.0634055882692337, 0.05882266163825989, -0.05909111723303795, 0.027562778443098068, 0.06206338107585907, -0.0634055882692337, 0.05882266163825989, -0.05909111723303795, 0.027562778443098068,
# 0.043835900723934174, 0.00407575536519289, -0.007656056433916092, 0.1048622876405716, -0.17822585999965668, 0.043835900723934174, 0.00407575536519289, -0.007656056433916092, 0.1048622876405716, -0.17822585999965668,
# 0.1303984671831131, -0.021631652489304543, 0.0836174339056015, 0.11956407874822617, 0.007379574701189995, 0.1303984671831131, -0.021631652489304543, 0.0836174339056015, 0.11956407874822617, 0.007379574701189995,
# -0.07777556777000427, -0.08474794030189514, 0.09585978090763092, -0.21120299398899078, 0.1435444951057434, -0.07777556777000427, -0.08474794030189514, 0.09585978090763092, -0.21120299398899078, 0.1435444951057434,
# 0.19884724915027618, 0.07154559344053268, 0.06259742379188538, 0.10118959099054337, 0.10188969224691391, 0.19884724915027618, 0.07154559344053268, 0.06259742379188538, 0.10118959099054337, 0.10188969224691391,
# -0.015351934358477592, -0.04335442930459976, -0.26258283853530884, -0.021509556099772453, 0.12185295671224594, -0.015351934358477592, -0.04335442930459976, -0.26258283853530884, -0.021509556099772453, 0.12185295671224594,
# -0.011788002215325832, 0.01337978895753622, -0.008025042712688446 -0.011788002215325832, 0.01337978895753622, -0.008025042712688446
# ] ]
# res = get_similar_diary_ids_by_face_features(a, faiss_index, face_to_vec_f) res = get_similar_diary_ids_by_face_features(a, faiss_index, face_to_vec_f)
# print(res) print(res)
# # save_diary_similarity(diary_after_cover_vec_file, faiss_index, face_to_vec_f) # # save_diary_similarity(diary_after_cover_vec_file, faiss_index, face_to_vec_f)
q = {
"query": {
"bool": {
"filter": [{
"term": {
"is_online": True
}
}, {
"term": {
"has_cover": True
}
}, {
"term": {
"is_sink": False
}
}, {
"term": {
"has_before_cover": True
}
}, {
"term": {
"has_after_cover": True
}
}, {
"terms": {
"content_level": [6, 5, 4, 3.5, 3]
}
}, {
"term": {
"content_simi_bol_show": 0
}
}, {
"exists": {
"field": "before_cover_url"
}
}]
}
},
"_source": {
"include": ["id", "before_cover_url", "after_cover_url"]
}
}
results = es_scan("diary", q)
for item in results:
print(item['_id'], item['_source']['name'])
if __name__ == "__main__": if __name__ == "__main__":
begin_time = time.time() begin_time = time.time()
......
...@@ -43,4 +43,4 @@ def es_scan(doc, body, es=None, rw="read"): ...@@ -43,4 +43,4 @@ def es_scan(doc, body, es=None, rw="read"):
if es is None: if es is None:
es = get_es() es = get_es()
index = es_index_adapt(index_prefix="gm-dbmw", doc_type=doc, rw=rw) index = es_index_adapt(index_prefix="gm-dbmw", doc_type=doc, rw=rw)
return helpers.scan(es, index, doc_type=doc) return helpers.scan(es, index=index, query=body)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment