Commit 7f87caf4 authored by 赵威

rename path

parent ada16bb5
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
</config> </config>
<config name="initializer_list"> <config name="initializer_list">
<element value="face_similarity.diary_cover_similarity"/> <element value="face_similarity.diary_cover_similarity"/>
<element value="word_vector.word_to_vec"/> <element value="word_vector.api"/>
<element value="personas_vector.match_api"/> <element value="personas_vector.match_api"/>
</config> </config>
<config name="statuses" value="strategy_embedding.system:statuses"/> <config name="statuses" value="strategy_embedding.system:statuses"/>
......
...@@ -9,8 +9,10 @@ sys.path.append(os.path.realpath(".")) ...@@ -9,8 +9,10 @@ sys.path.append(os.path.realpath("."))
from gensim.models import Word2Vec, word2vec from gensim.models import Word2Vec, word2vec
from gm_rpcd.all import bind from gm_rpcd.all import bind
from utils.es import es_scan from utils.es import es_scan
from utils.message import send_msg_to_dingtalk
from utils.file import DATA_PATH, MODEL_PATH from utils.file import DATA_PATH, MODEL_PATH
from utils.message import send_msg_to_dingtalk
from word_vector.tractate import tractate_click_ids_model_path
model_output_name = "w2v_model" model_output_name = "w2v_model"
model_path = os.path.join(MODEL_PATH, model_output_name) model_path = os.path.join(MODEL_PATH, model_output_name)
...@@ -19,8 +21,6 @@ try: ...@@ -19,8 +21,6 @@ try:
except Exception as e: except Exception as e:
print(e) print(e)
tracate_click_ids_model_name = "tractate_click_ids_item2vec_model"
tractate_click_ids_model_path = os.path.join(MODEL_PATH, tracate_click_ids_model_name)
try: try:
TRACTATE_CLICK_IDS_MODEL = word2vec.Word2Vec.load(tractate_click_ids_model_path) TRACTATE_CLICK_IDS_MODEL = word2vec.Word2Vec.load(tractate_click_ids_model_path)
except Exception as e: except Exception as e:
...@@ -89,23 +89,7 @@ def projects_item2vec(score_limit=5): ...@@ -89,23 +89,7 @@ def projects_item2vec(score_limit=5):
return model return model
def save_clicked_tractate_ids_item2vec():  # item2vec
    """Train and save an item2vec model from the clicked-tractate id file.

    Reads ``click_tractate_ids.csv`` under ``DATA_PATH``. Each line is
    expected to look like ``<key>|<id>,<id>,...``; the part before the
    separator is ignored and the comma-separated ids form one training
    sequence for ``Word2Vec``.

    Lines with no ``|`` separator, or with no ids after it (for example a
    blank trailing line), are skipped instead of raising ``IndexError``.

    Returns:
        The trained ``Word2Vec`` model, which is also saved to
        ``tractate_click_ids_model_path``.
    """
    click_ids = []
    with open(os.path.join(DATA_PATH, "click_tractate_ids.csv"), "r") as f:
        # Stream the file line by line rather than loading it all at once.
        for line in f:
            # Split on the last "|" so the id list is always the final field,
            # even if the key itself happens to contain the separator.
            _key, sep, raw_ids = line.rstrip("\n").rpartition("|")
            if not sep:
                continue  # malformed or blank line
            ids = [x for x in raw_ids.split(",") if x]
            if ids:
                click_ids.append(ids)
    # NOTE(review): `iter` is the gensim 3.x keyword; gensim 4 renamed it to
    # `epochs` -- confirm against the installed gensim version.
    model = Word2Vec(click_ids, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(click_ids))
    model.save(tractate_click_ids_model_path)
    return model
@bind("strategy_embedding/word_vector/tractate_item2vec") @bind("strategy_embedding/word_vector/tractate_item2vec")
def clicked_tractate_ids_item2vec_model(id, n=5): def clicked_tractate_ids_item2vec_model(id, n=5):
try: try:
...@@ -115,17 +99,14 @@ def clicked_tractate_ids_item2vec_model(id, n=5): ...@@ -115,17 +99,14 @@ def clicked_tractate_ids_item2vec_model(id, n=5):
return [] return []
if __name__ == "__main__": # if __name__ == "__main__":
begin_time = time.time()
# w2v_train("dispose_problem.txt", model_output_name)
for i in ["双眼皮", "隆鼻"]: # w2v_train("dispose_problem.txt", model_output_name)
print(word_similarity(i))
# save_clicked_tractate_ids_item2vec() # for i in ["双眼皮", "隆鼻"]:
# print(word_similarity(i))
for id in ["84375", "148764", "368399"]: # save_clicked_tractate_ids_item2vec()
print(clicked_tractate_ids_item2vec_model(id, n=5))
print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60)) # for id in ["84375", "148764", "368399"]:
# print(clicked_tractate_ids_item2vec_model(id, n=5))
import multiprocessing
import os import os
import sys import sys
from collections import defaultdict from collections import defaultdict
sys.path.append(os.path.realpath(".")) sys.path.append(os.path.realpath("."))
import time
from gensim.models import Word2Vec, word2vec
from utils.date import get_ndays_before_no_minus, get_ndays_before_with_format from utils.date import get_ndays_before_no_minus, get_ndays_before_with_format
from utils.files import DATA_PATH from utils.files import DATA_PATH, MODEL_PATH
from utils.spark import get_spark from utils.spark import get_spark
from word_vector.api import clicked_tractate_ids_item2vec_model
tractate_click_ids_model_path = os.path.join(MODEL_PATH, "tractate_click_ids_item2vec_model")
def get_tracate_click_data(spark, start, end): def get_tracate_click_data(spark, start, end):
reg = r"""^\\d+$""" reg = r"""^\\d+$"""
...@@ -123,7 +131,26 @@ def get_device_click_tractate_ids_dict(click_df): ...@@ -123,7 +131,26 @@ def get_device_click_tractate_ids_dict(click_df):
return res return res
def save_clicked_tractate_ids_item2vec():
    """Train and save an item2vec model from the clicked-tractate id file.

    Reads ``click_tractate_ids.csv`` under ``DATA_PATH``. Each line is
    expected to look like ``<key>|<id>,<id>,...``; the part before the
    separator is ignored and the comma-separated ids form one training
    sequence for ``Word2Vec``.

    Lines with no ``|`` separator, or with no ids after it (for example a
    blank trailing line), are skipped instead of raising ``IndexError``.

    Returns:
        The trained ``Word2Vec`` model, which is also saved to
        ``tractate_click_ids_model_path``.
    """
    click_ids = []
    with open(os.path.join(DATA_PATH, "click_tractate_ids.csv"), "r") as f:
        # Stream the file line by line rather than loading it all at once.
        for line in f:
            # Split on the last "|" so the id list is always the final field,
            # even if the key itself happens to contain the separator.
            _key, sep, raw_ids = line.rstrip("\n").rpartition("|")
            if not sep:
                continue  # malformed or blank line
            ids = [x for x in raw_ids.split(",") if x]
            if ids:
                click_ids.append(ids)
    # NOTE(review): `iter` is the gensim 3.x keyword; gensim 4 renamed it to
    # `epochs` -- confirm against the installed gensim version.
    model = Word2Vec(click_ids, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(click_ids))
    model.save(tractate_click_ids_model_path)
    return model
if __name__ == "__main__": if __name__ == "__main__":
begin_time = time.time()
spark = get_spark("tractate_click_ids") spark = get_spark("tractate_click_ids")
click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1)) click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1))
click_df.show(5, False) click_df.show(5, False)
...@@ -135,4 +162,11 @@ if __name__ == "__main__": ...@@ -135,4 +162,11 @@ if __name__ == "__main__":
if v: if v:
f.write("{}|{}\n".format(k, ",".join([str(x) for x in v]))) f.write("{}|{}\n".format(k, ",".join([str(x) for x in v])))
save_clicked_tractate_ids_item2vec()
for id in ["84375", "148764", "368399"]:
print(clicked_tractate_ids_item2vec_model(id, n=5))
print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60))
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py # spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment