get data

b238da87 · 赵威 · ddb3ecf5 · b238da87 · b238da87 · b238da87
Commit b238da87 authored Oct 21, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 77 additions and 1 deletion

requirements.txt requirements.txt +1 -0

db.py utils/db.py +32 -0

word_to_vec.py word_vector/word_to_vec.py +44 -1

No files found.
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,5 +21,6 @@ django-extensions==1.9.9
 mysqlclient==2.0.1
 gunicorn==19.7.1
 gevent==1.2.1
+pymysql==0.10.1
 gensim==3.8.3
--- a/utils/db.py
+++ b/utils/db.py
+import traceback
+import pymysql
+def get_data_from_jerry_test(sql, params=None):
+    """Run a query against the ``jerry_test`` database and return all rows.
+
+    Args:
+        sql: SQL statement to execute.
+        params: Optional sequence or mapping of values that the driver binds
+            to the placeholders in ``sql``. Defaults to None (no binding).
+
+    Returns:
+        The result of ``cursor.fetchall()``; rows are dicts because a
+        ``DictCursor`` is used. If anything raises, the formatted traceback
+        is printed and returned as a string instead, so callers must check
+        the type of the result before iterating over it.
+    """
+    # NOTE(review): host and credentials are hard-coded in source; consider
+    # loading them from configuration or the environment.
+    db = None
+    try:
+        db = pymysql.connect(host="172.16.40.170",
+                             port=4000,
+                             user="st_user",
+                             passwd="aqpuBLYzEV7tML5RPsN1pntUzFy",
+                             db="jerry_test",
+                             charset="utf8",
+                             cursorclass=pymysql.cursors.DictCursor)
+        # The cursor is released by the context manager even if execute fails.
+        with db.cursor() as cursor:
+            cursor.execute(sql, params)
+            return cursor.fetchall()
+    except Exception:
+        error = traceback.format_exc()
+        print(error)
+        return error
+    finally:
+        # Close the connection on both the success and the failure path;
+        # previously a failing query left it open.
+        if db is not None:
+            db.close()
+def get_device_click_tractate_ids():
+    """Map each device to the list of tractate ids read from ``device_click_tractate``.
+
+    Returns:
+        A dict whose keys are the ``device`` column values and whose values
+        are the ``tractate_ids`` column split on commas (after stripping
+        trailing newlines). An empty dict is returned when the query yields
+        no rows or fails.
+    """
+    sql = "select device, tractate_ids from device_click_tractate"
+    data = get_data_from_jerry_test(sql)
+    # On failure the query helper returns a traceback string, which is truthy;
+    # iterating over it and indexing its characters would raise TypeError.
+    if not data or isinstance(data, str):
+        return {}
+    return {
+        row["device"]: row["tractate_ids"].rstrip("\n").split(",")
+        for row in data
+    }
--- a/word_vector/word_to_vec.py
+++ b/word_vector/word_to_vec.py
 import os
 import time
-from gensim.models import word2vec
+from gensim.models import Word2Vec, word2vec
 from gm_rpcd.all import bind
+from utils.es import es_scan
+from utils.db import get_device_click_tractate_ids
 base_dir = os.getcwd()
 print("base_dir: " + base_dir)
@@ -13,6 +15,8 @@ model_output_name = "w2v_model"
 model_path = os.path.join(model_dir, model_output_name)
 WORD2VEC_MODEL = word2vec.Word2Vec.load(model_path)
+import multiprocessing
 class W2vSentences:
    def __init__(self, f_name):
@@ -39,6 +43,45 @@ def word_similarity(word):
    return WORD2VEC_MODEL.wv.most_similar(word)
+def get_user_portrait_projects(score_limit=5):
+    """Collect, per device, the names of projects scoring at least ``score_limit``.
+
+    Iterates over everything returned by ``es_scan("device", {}, rw=None)``
+    and prints a running count of the documents seen.
+
+    Args:
+        score_limit: Minimum ``score`` a project entry needs to be kept.
+
+    Returns:
+        A dict mapping ``device_id`` to a list of project names, for example
+        ``{'862538030266882': ['<project name>', '<project name>']}``.
+        Devices without any qualifying project are left out.
+    """
+    device_projects = {}
+    for seen, hit in enumerate(es_scan("device", {}, rw=None), start=1):
+        print(seen)
+        doc = hit["_source"]
+        names = []
+        for project in doc.get("projects", []):
+            if project["score"] >= score_limit:
+                names.append(project["name"])
+        if not names:
+            continue
+        # Documents without a device_id all land under the empty-string key.
+        device_projects[doc.get("device_id", "")] = names
+    return device_projects
+def projects_item2vec(score_limit=5):
+    """Train a Word2Vec model using each device's project list as one sentence.
+
+    Prints the model, the number of training sentences and, for two probe
+    project names, their five nearest neighbours.
+
+    Args:
+        score_limit: Passed through to ``get_user_portrait_projects``.
+
+    Returns:
+        The trained ``Word2Vec`` model.
+    """
+    user_dict = get_user_portrait_projects(score_limit=score_limit)
+    # TODO if not redis.get user_dict:
+    projects = list(user_dict.values())
+    model = Word2Vec(projects, hs=0, min_count=3, workers=multiprocessing.cpu_count(), iter=10)
+    print(model)
+    print(len(projects))
+    for word in ["鼻综合", "吸脂瘦脸"]:
+        try:
+            print(model.wv.most_similar(word, topn=5))
+        except KeyError:
+            # A probe word missing from the vocabulary (e.g. seen fewer than
+            # min_count times) must not prevent returning the trained model.
+            print("word not in vocabulary: " + word)
+    return model
+def clicked_tractate_ids_item2vec():
+    """Print the clicked tractate id lists, one list per device.
+
+    Only the values of the mapping returned by
+    ``get_device_click_tractate_ids`` are printed; nothing is returned.
+    """
+    ids_by_device = get_device_click_tractate_ids()
+    click_ids = [*ids_by_device.values()]
+    print(click_ids)
 if __name__ == "__main__":
    begin_time = time.time()