Commit 3376c972 authored by 赵威

get vector

parent 16ec9503
...@@ -3,38 +3,41 @@ import sys ...@@ -3,38 +3,41 @@ import sys
sys.path.append(os.path.realpath(".")) sys.path.append(os.path.realpath("."))
import multiprocessing
import pandas as pd import pandas as pd
from gensim.models import Word2Vec, word2vec from gensim.models import Word2Vec, word2vec
from utils.defs import nth_element from utils.defs import nth_element
from utils.files import get_df from utils.files import get_df
DEVICE_COLUMNS = [
"device_id",
]
TRACTATE_COLUMNS = [ def device_tractate_fe():
"card_id", device_tags_df = get_df("personas_device_feature.csv")
] device_tags_df = device_tags_df[["cl_id", "business_tags"]]
device_tags_df["business_tags"] = device_tags_df["business_tags"].str.split(",").\
apply(lambda d: d if isinstance(d, list) else [])
print(device_tags_df.head(3))
tractate_tags_df = get_df("personas_tractate_tags.csv", columns=["tractate_id", "business_tags"])
tractate_tags_df["business_tags"] = tractate_tags_df["business_tags"].str.split(",").\
apply(lambda d: d if isinstance(d, list) else [])
print(tractate_tags_df.head(3))
def device_tractae_fe(): return device_tags_df, tractate_tags_df
pass
if __name__ == "__main__": def tractate_business_tags_word2vec(tractate_df):
device_fe_df = get_df("personas_device_feature.csv", data = tractate_tags_df["business_tags"].to_list()
columns=[ print(len(data))
"cl_id", model = Word2Vec(data, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
"first_demands", print(model)
"first_solutions", return model
"first_positions",
"second_demands",
"second_solutions",
"second_positions",
"projects",
"business_tags",
])
print(device_fe_df.head(3))
tractate_tags_df = get_df("personas_tractate_tags.csv", columns=["tractate_id", "business_tags"])
print(tractate_tags_df.head(3)) if __name__ == "__main__":
device_tags_df, tractate_tags_df = device_tractate_fe()
model = tractate_business_tags_word2vec(tractate_tags_df)
for i in ["自体脂肪面部年轻化", "自体脂肪填充面部", "自体脂肪全面部填充", "自体脂肪面部填充", "鼻综合", "鼻部综合"]:
print(model.wv.most_similar(i))
print(model.wv.get_vector(i))
# tractate_tags_df["business_tags"].to_list()
...@@ -34,7 +34,11 @@ def save_dict_to_csv(d, file): ...@@ -34,7 +34,11 @@ def save_dict_to_csv(d, file):
def get_df(file, sep="|", columns=[]): def get_df(file, sep="|", columns=[]):
full_path = os.path.join(DATA_PATH, file) full_path = os.path.join(DATA_PATH, file)
# full_path = os.path.join("/Users/offic/work/GM/strategy_embedding/_data", file) # TODO
print(full_path) print(full_path)
df = pd.read_csv(full_path, sep="|", names=columns) if columns:
df = pd.read_csv(full_path, sep=sep, names=columns)
else:
df = pd.read_csv(full_path, sep=sep)
print(df.shape) print(df.shape)
return df return df
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment