Commit b238da87 authored by 赵威

get data

parent ddb3ecf5
...@@ -21,5 +21,6 @@ django-extensions==1.9.9 ...@@ -21,5 +21,6 @@ django-extensions==1.9.9
mysqlclient==2.0.1 mysqlclient==2.0.1
gunicorn==19.7.1 gunicorn==19.7.1
gevent==1.2.1 gevent==1.2.1
pymysql==0.10.1
gensim==3.8.3 gensim==3.8.3
import traceback
import pymysql
def get_data_from_jerry_test(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    A new connection is opened for every call and is always closed before
    returning, including when the query itself fails.

    Args:
        sql: SQL statement passed to ``cursor.execute`` as-is.

    Returns:
        The result of ``cursor.fetchall()``; rows are dicts because the
        connection uses ``DictCursor``. If anything raises, the formatted
        traceback is printed and returned as a ``str`` instead, so callers
        must tell rows apart from an error string.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration / a secret store.
    try:
        db = pymysql.connect(host="172.16.40.170",
                             port=4000,
                             user="st_user",
                             passwd="aqpuBLYzEV7tML5RPsN1pntUzFy",
                             db="jerry_test",
                             charset="utf8",
                             cursorclass=pymysql.cursors.DictCursor)
        try:
            # The cursor context manager closes the cursor on exit.
            with db.cursor() as cursor:
                cursor.execute(sql)
                return cursor.fetchall()
        finally:
            # Close the connection on both success and failure; previously a
            # failing execute()/fetchall() skipped close() and leaked it.
            db.close()
    except Exception:
        error_text = traceback.format_exc()
        print(error_text)
        return error_text
def get_device_click_tractate_ids():
    """Map each device to the list of tractate ids stored for it.

    Reads the ``device`` and ``tractate_ids`` columns of the
    ``device_click_tractate`` table. ``tractate_ids`` is handled as a
    comma-separated string: trailing newlines are stripped, then it is split
    on ``","``.

    Returns:
        dict: ``{device: [tractate_id, ...]}`` with the ids kept as strings.
        Empty when the query yields no rows or fails.
    """
    sql = "select device, tractate_ids from device_click_tractate"
    data = get_data_from_jerry_test(sql)
    # On failure the query helper returns the formatted traceback as a str
    # (it has already printed it). That string is truthy, and iterating it
    # would raise TypeError on row["device"], so treat it as "no data".
    if not data or isinstance(data, str):
        return {}
    return {
        row["device"]: row["tractate_ids"].rstrip("\n").split(",")
        for row in data
    }
import os import os
import time import time
from gensim.models import word2vec from gensim.models import Word2Vec, word2vec
from gm_rpcd.all import bind from gm_rpcd.all import bind
from utils.es import es_scan
from utils.db import get_device_click_tractate_ids
base_dir = os.getcwd() base_dir = os.getcwd()
print("base_dir: " + base_dir) print("base_dir: " + base_dir)
...@@ -13,6 +15,8 @@ model_output_name = "w2v_model" ...@@ -13,6 +15,8 @@ model_output_name = "w2v_model"
model_path = os.path.join(model_dir, model_output_name) model_path = os.path.join(model_dir, model_output_name)
WORD2VEC_MODEL = word2vec.Word2Vec.load(model_path) WORD2VEC_MODEL = word2vec.Word2Vec.load(model_path)
import multiprocessing
class W2vSentences: class W2vSentences:
def __init__(self, f_name): def __init__(self, f_name):
...@@ -39,6 +43,45 @@ def word_similarity(word): ...@@ -39,6 +43,45 @@ def word_similarity(word):
return WORD2VEC_MODEL.wv.most_similar(word) return WORD2VEC_MODEL.wv.most_similar(word)
def get_user_portrait_projects(score_limit=5):
    """Collect, per device, the names of projects scoring at least ``score_limit``.

    Scans the "device" index through ``es_scan`` and reads each hit's
    ``_source``. Devices with no qualifying project are left out. A running
    hit counter is printed for every document scanned.

    Returns:
        dict: ``{device_id: [project_name, ...]}``, for example
        ``{'862538030266882': [<name>, <name>]}`` where the names are the
        project name strings stored in the documents. A document without a
        ``device_id`` is keyed by the empty string.
    """
    hits = es_scan("device", {}, rw=None)
    projects_by_device = {}
    for seen, hit in enumerate(hits, start=1):
        print(seen)
        doc = hit["_source"]
        device_id = doc.get("device_id", "")
        selected = []
        for project in doc.get("projects", []):
            if project["score"] >= score_limit:
                selected.append(project["name"])
        if selected:
            projects_by_device[device_id] = selected
    return projects_by_device
def projects_item2vec(score_limit=5):
    """Train a Word2Vec model on the per-device project-name lists.

    Each device's project list (from ``get_user_portrait_projects``) is used
    as one training sentence. The model, the number of sentences, and the
    top-5 neighbours of two probe words are printed as a quick check.

    Args:
        score_limit: Forwarded to ``get_user_portrait_projects``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    user_dict = get_user_portrait_projects(score_limit=score_limit)
    # TODO if not redis.get user_dict:
    projects = list(user_dict.values())
    model = Word2Vec(projects, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
    print(model)
    print(len(projects))
    for word in ["鼻综合", "吸脂瘦脸"]:
        # A probe word seen fewer than min_count times is not in the
        # vocabulary and most_similar raises KeyError; this diagnostic
        # output must not stop the trained model from being returned.
        try:
            print(model.wv.most_similar(word, topn=5))
        except KeyError:
            print("{} not in vocabulary".format(word))
    return model
def clicked_tractate_ids_item2vec():
    """Print the per-device lists of clicked tractate ids.

    Fetches the device -> tractate-id mapping and prints its values as a
    list of lists. Nothing is trained or returned here.
    """
    clicks_by_device = get_device_click_tractate_ids()
    id_lists = [*clicks_by_device.values()]
    print(id_lists)
if __name__ == "__main__": if __name__ == "__main__":
begin_time = time.time() begin_time = time.time()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment