Commit 7f87caf4 authored by 赵威

rename path

parent ada16bb5
......@@ -8,7 +8,7 @@
</config>
<config name="initializer_list">
<element value="face_similarity.diary_cover_similarity"/>
<element value="word_vector.word_to_vec"/>
<element value="word_vector.api"/>
<element value="personas_vector.match_api"/>
</config>
<config name="statuses" value="strategy_embedding.system:statuses"/>
......
......@@ -9,8 +9,10 @@ sys.path.append(os.path.realpath("."))
from gensim.models import Word2Vec, word2vec
from gm_rpcd.all import bind
from utils.es import es_scan
from utils.message import send_msg_to_dingtalk
from utils.file import DATA_PATH, MODEL_PATH
from utils.message import send_msg_to_dingtalk
from word_vector.tractate import tractate_click_ids_model_path
model_output_name = "w2v_model"
model_path = os.path.join(MODEL_PATH, model_output_name)
......@@ -19,8 +21,6 @@ try:
except Exception as e:
print(e)
tracate_click_ids_model_name = "tractate_click_ids_item2vec_model"
tractate_click_ids_model_path = os.path.join(MODEL_PATH, tracate_click_ids_model_name)
try:
TRACTATE_CLICK_IDS_MODEL = word2vec.Word2Vec.load(tractate_click_ids_model_path)
except Exception as e:
......@@ -89,23 +89,7 @@ def projects_item2vec(score_limit=5):
return model
def save_clicked_tractate_ids_item2vec():
    """Train an item2vec model on clicked tractate ids and save it to disk.

    Reads ``click_tractate_ids.csv`` from ``DATA_PATH``.  Each line is
    expected to look like ``<key>|<id>,<id>,...``: the first field is ignored
    and the comma-separated ids in the second field form one training
    sequence.  Lines without a ``|`` separator (for example a trailing blank
    line) are skipped instead of raising ``IndexError``.

    Returns:
        The trained ``Word2Vec`` model, which is also saved to
        ``tractate_click_ids_model_path``.
    """
    click_ids = []
    with open(os.path.join(DATA_PATH, "click_tractate_ids.csv"), "r") as f:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of lines.
        for line in f:
            fields = line.split("|")
            if len(fields) < 2:
                # Malformed or blank line: there is no id list to read.
                continue
            # fields[0] is the grouping key and is not used for training.
            click_ids.append(fields[1].rstrip("\n").split(","))

    # NOTE(review): `iter` is the gensim 3.x spelling; gensim 4 renamed it to
    # `epochs` -- confirm the pinned gensim version before upgrading.
    model = Word2Vec(click_ids, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(click_ids))
    model.save(tractate_click_ids_model_path)
    return model
# item2vec
@bind("strategy_embedding/word_vector/tractate_item2vec")
def clicked_tractate_ids_item2vec_model(id, n=5):
try:
......@@ -115,17 +99,14 @@ def clicked_tractate_ids_item2vec_model(id, n=5):
return []
if __name__ == "__main__":
begin_time = time.time()
# w2v_train("dispose_problem.txt", model_output_name)
# if __name__ == "__main__":
for i in ["双眼皮", "隆鼻"]:
print(word_similarity(i))
# w2v_train("dispose_problem.txt", model_output_name)
# save_clicked_tractate_ids_item2vec()
# for i in ["双眼皮", "隆鼻"]:
# print(word_similarity(i))
for id in ["84375", "148764", "368399"]:
print(clicked_tractate_ids_item2vec_model(id, n=5))
# save_clicked_tractate_ids_item2vec()
print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60))
# for id in ["84375", "148764", "368399"]:
# print(clicked_tractate_ids_item2vec_model(id, n=5))
import multiprocessing
import os
import sys
from collections import defaultdict
sys.path.append(os.path.realpath("."))
import time
from gensim.models import Word2Vec, word2vec
from utils.date import get_ndays_before_no_minus, get_ndays_before_with_format
from utils.files import DATA_PATH
from utils.files import DATA_PATH, MODEL_PATH
from utils.spark import get_spark
from word_vector.api import clicked_tractate_ids_item2vec_model
tractate_click_ids_model_path = os.path.join(MODEL_PATH, "tractate_click_ids_item2vec_model")
def get_tracate_click_data(spark, start, end):
reg = r"""^\\d+$"""
......@@ -123,7 +131,26 @@ def get_device_click_tractate_ids_dict(click_df):
return res
def save_clicked_tractate_ids_item2vec():
    """Train an item2vec model on clicked tractate ids and save it to disk.

    Reads ``click_tractate_ids.csv`` from ``DATA_PATH``.  Each line is
    expected to look like ``<key>|<id>,<id>,...``: the first field is ignored
    and the comma-separated ids in the second field form one training
    sequence.  Lines without a ``|`` separator (for example a trailing blank
    line) are skipped instead of raising ``IndexError``.

    Returns:
        The trained ``Word2Vec`` model, which is also saved to
        ``tractate_click_ids_model_path``.
    """
    click_ids = []
    with open(os.path.join(DATA_PATH, "click_tractate_ids.csv"), "r") as f:
        # Iterate the file object directly so the whole file is never held
        # in memory as a list of lines.
        for line in f:
            fields = line.split("|")
            if len(fields) < 2:
                # Malformed or blank line: there is no id list to read.
                continue
            # fields[0] is the grouping key and is not used for training.
            click_ids.append(fields[1].rstrip("\n").split(","))

    # NOTE(review): `iter` is the gensim 3.x spelling; gensim 4 renamed it to
    # `epochs` -- confirm the pinned gensim version before upgrading.
    model = Word2Vec(click_ids, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(click_ids))
    model.save(tractate_click_ids_model_path)
    return model
if __name__ == "__main__":
begin_time = time.time()
spark = get_spark("tractate_click_ids")
click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1))
click_df.show(5, False)
......@@ -135,4 +162,11 @@ if __name__ == "__main__":
if v:
f.write("{}|{}\n".format(k, ",".join([str(x) for x in v])))
save_clicked_tractate_ids_item2vec()
for id in ["84375", "148764", "368399"]:
print(clicked_tractate_ids_item2vec_model(id, n=5))
print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60))
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment