"""Word2Vec / item2vec training helpers and RPC endpoints for similarity lookups."""
import multiprocessing
import os
import sys
import time
import traceback

# Adds the current working directory to sys.path. This must stay ahead of the
# imports below; presumably `utils` and `word_vector` are resolved through it.
sys.path.append(os.path.realpath("."))

from gensim.models import Word2Vec, word2vec
from gm_rpcd.all import bind
from utils.es import es_scan
from utils.files import DATA_PATH, MODEL_PATH
from utils.message import (send_msg_to_dingtalk,
                           send_performance_msg_to_dingtalk)
from word_vector.tractate import TRACTATE_CLICK_IDS, TRACTATE_CLICK_IDS_MODEL

model_output_name = "w2v_model"
model_path = os.path.join(MODEL_PATH, model_output_name)

# Lookups that take longer than this many seconds are reported through
# send_performance_msg_to_dingtalk.
SLOW_CALL_THRESHOLD_SECONDS = 0.04

try:
    WORD2VEC_MODEL = word2vec.Word2Vec.load(model_path)
except Exception as e:
    # Best-effort load: the module must still import when the model file is
    # missing or unreadable. Bind the name explicitly so later code sees a
    # defined "no model" value instead of hitting a NameError.
    print(e)
    WORD2VEC_MODEL = None


class W2vSentences:
    """Re-iterable corpus reader: one whitespace-tokenised sentence per line."""

    def __init__(self, f_name):
        """Remember the corpus path; the file is opened on each iteration."""
        self.f_name = f_name

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens."""
        with open(self.f_name, mode="r", encoding="utf-8", errors="ignore") as f:
            # Stream the file line by line instead of materialising it with
            # readlines(); the object is iterated once per training pass.
            for line in f:
                yield line.split()


def w2v_train(f_name, model_output_name):
    """Train a Word2Vec model on a corpus file and save it.

    Args:
        f_name: corpus file name, resolved relative to DATA_PATH.
        model_output_name: output file name, resolved relative to MODEL_PATH.
    """
    input_file = os.path.join(DATA_PATH, f_name)
    print("input: " + input_file)
    sentences = W2vSentences(input_file)
    # NOTE(review): `size` is the pre-4.0 gensim keyword (renamed
    # `vector_size` in gensim 4) -- confirm the pinned gensim version.
    w2v_model = word2vec.Word2Vec(sentences, min_count=2, workers=2, size=100, window=10)
    output_path = os.path.join(MODEL_PATH, model_output_name)
    print("output: " + output_path)
    w2v_model.save(output_path)


@bind("strategy_embedding/word_vector/word_similarity")
def word_similarity(word):
    """Return the words most similar to *word* according to WORD2VEC_MODEL.

    Any failure (unknown word, model not loaded, ...) is reported through
    send_msg_to_dingtalk and an empty list is returned instead of raising.
    """
    try:
        return WORD2VEC_MODEL.wv.most_similar(word)
    except Exception:
        send_msg_to_dingtalk(traceback.format_exc())
        return []


def get_user_portrait_projects(score_limit=5):
    """Collect, per device, the project names whose score is >= score_limit.

    Scans the "device" index via es_scan. Devices with no qualifying project
    are omitted. Documents without a "device_id" are stored under the key "".

    Example return value (project names are the original author's sample data):
        {
            '6231F098-9E72-448E-B8D2-19FCB9687005': ['鼻综合', '玻尿酸填充面部'],
            '862538030266882': ['吸脂瘦脸', '吸脂瘦全身']
        }
    """
    es_res = es_scan("device", {}, rw=None)
    res = {}
    for count, hit in enumerate(es_res, start=1):
        # Progress output: running count of scanned documents.
        print(count)
        source = hit["_source"]
        device_id = source.get("device_id", "")
        projects = [
            project["name"]
            for project in source.get("projects", [])
            if project["score"] >= score_limit
        ]
        if projects:
            res[device_id] = projects
    return res


def projects_item2vec(score_limit=5):
    """Train an item2vec model over each device's list of project names.

    Args:
        score_limit: minimum project score, forwarded to
            get_user_portrait_projects.

    Returns:
        The trained Word2Vec model.
    """
    user_dict = get_user_portrait_projects(score_limit=score_limit)
    # TODO if not redis.get user_dict:
    projects = list(user_dict.values())
    # NOTE(review): `iter` is the pre-4.0 gensim keyword (renamed `epochs`
    # in gensim 4) -- confirm the pinned gensim version.
    model = Word2Vec(projects, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(projects))
    for word in ["鼻综合", "吸脂瘦脸"]:
        # Diagnostic probe only: a probe word that did not reach min_count is
        # absent from the vocabulary, and that must not discard the model.
        try:
            print(model.wv.most_similar(word, topn=5))
        except KeyError as err:
            print(err)
    return model


def _report_if_slow(elapsed, template, *args):
    """Send a performance message when *elapsed* seconds exceeds the threshold.

    *template* is formatted with *args* followed by the elapsed milliseconds.
    """
    if elapsed > SLOW_CALL_THRESHOLD_SECONDS:
        send_performance_msg_to_dingtalk(template.format(*args, elapsed * 1000))


# item2vec
@bind("strategy_embedding/word_vector/tractate_item2vec")
def clicked_tractate_ids_item2vec_model(id, n=5):
    """Return the *n* entries most similar to *id* in TRACTATE_CLICK_IDS_MODEL.

    An id missing from the model yields [] silently; any other failure is
    reported through send_msg_to_dingtalk and also yields [].
    The parameter name `id` shadows the builtin but is part of the existing
    signature, so it is left unchanged.
    """
    try:
        time_begin = time.time()
        res = TRACTATE_CLICK_IDS_MODEL.wv.most_similar(id, topn=n)
        elapsed = time.time() - time_begin
        _report_if_slow(elapsed, "clicked_tractate_ids_item2vec_model id={} n={} cost {:.3f}ms", id, n)
        return res
    except KeyError:
        # send_msg_to_dingtalk("tractate_item2vec: " + str(e))
        return []
    except Exception:
        send_msg_to_dingtalk(traceback.format_exc())
        return []


@bind("strategy_embedding/word_vector/tractate_item2vec_by_ids")
def clicked_tractate_ids_item2vec_model_by_ids(ids, n=5):
    """Return the concatenated top-*n* similar entries for every known id.

    Ids that are not in TRACTATE_CLICK_IDS are skipped. Any failure is
    reported through send_msg_to_dingtalk and [] is returned.

    ids: ["417522", "179986", "75804", "84682", "400022"]
    return: [('77419', 0.8857042193412781), ('75761', 0.7772561311721802),
             ('84630', 0.9709808826446533), ('84671', 0.9621062278747559)]
    """
    try:
        time_begin = time.time()
        res = []
        for tractate_id in ids:
            if tractate_id in TRACTATE_CLICK_IDS:
                res.extend(TRACTATE_CLICK_IDS_MODEL.wv.most_similar(tractate_id, topn=n))
        elapsed = time.time() - time_begin
        _report_if_slow(elapsed, "clicked_tractate_ids_item2vec_model_by_ids ids={} n={} cost {:.3f}ms", ids, n)
        return res
    except Exception:
        send_msg_to_dingtalk(traceback.format_exc())
        return []


# if __name__ == "__main__":
#     w2v_train("dispose_problem.txt", model_output_name)
#     for i in ["双眼皮", "隆鼻"]:
#         print(word_similarity(i))

#     save_clicked_tractate_ids_item2vec()
#     for id in ["84375", "148764", "368399"]:
#         print(clicked_tractate_ids_item2vec_model(id, n=5))