Commit c7ed5c29 authored by 赵威's avatar 赵威

get tags from db

parent 5ab7d3ea
......@@ -2,6 +2,7 @@
# source /srv/envs/SyntaxNet/bin/activate
import json
import re
import sys
......@@ -10,6 +11,7 @@ import numpy as np
from gensim import models as word2vec
from utils.cache import redis_client5
from utils.db import get_all_business_tags
from utils.es import es_scan
reload(sys)
......@@ -62,11 +64,42 @@ def get_tractate_vector(sentence_lst, all_keywords_set, model):
pass
if vecs:
n = np.average(vecs, axis=0)
res[count] = n
res[count] = n.tolist()
count += 1
return res
def save_tractate_vector_to_redis(all_keywords_set, model):
    """Compute sentence vectors for each scanned tractate and cache them in Redis.

    Every hit yielded by ``get_new_user_tractate_info()`` has its
    ``keynote_sentence`` field turned into vectors by ``get_tractate_vector``.
    A non-empty result is JSON-encoded and written under the key
    ``rims:tractate:sentense:vector:<id>`` with a three-day expiry; hits that
    produce no vectors are left untouched.

    Args:
        all_keywords_set: keyword collection forwarded to ``get_tractate_vector``.
        model: word-vector model forwarded to ``get_tractate_vector``.
    """
    for position, hit in enumerate(get_new_user_tractate_info(), 1):
        doc = hit["_source"]
        keynote_sentences = doc["keynote_sentence"]
        tractate_id = doc["id"]
        vectors = get_tractate_vector(keynote_sentences, all_keywords_set, model)
        if not vectors:
            continue

        # Progress output: running hit number and how many sentence vectors it got.
        print(position, len(vectors))
        key = "rims:tractate:sentense:vector:" + str(tractate_id)
        redis_client5.set(key, json.dumps(vectors))
        # TODO remove
        redis_client5.expire(key, 60 * 60 * 24 * 3)
def save_tag_vector_to_redis(all_tags_lst, model):
    """Store the word vector of every known tag in a single Redis hash.

    Each tag name that the model has a vector for becomes one field of the
    hash ``rims:tractate:tags:vector``; the field value is the vector as a
    JSON-encoded list of floats. The hash is given a three-day expiry.
    Nothing is written when no tag has a vector.

    Args:
        all_tags_lst: iterable of tag names to look up.
        model: object exposing ``model.wv.get_vector(name)``.
    """
    tag_vectors = {}
    for name in all_tags_lst:
        try:
            vec = model.wv.get_vector(name).tolist()
        except KeyError:
            # The tag is not in the model vocabulary, so it has no vector.
            continue
        # Hash values must be flat strings, so each vector is encoded on its own.
        tag_vectors[name] = json.dumps(vec)

    redis_key = "rims:tractate:tags:vector"
    if tag_vectors:
        # hmset needs a field -> value mapping; passing one JSON string for the
        # whole dict (as before) makes the call fail.
        redis_client5.hmset(redis_key, tag_vectors)
        redis_client5.expire(redis_key, 60 * 60 * 24 * 3)
if __name__ == "__main__":
model_path = "/data/log/word2vec/app/ipynb_garbage_files/test_w2v_model_4"
keyword_txt = "/data/log/word2vec/all_key_word.txt"
......@@ -84,74 +117,9 @@ if __name__ == "__main__":
for word in all_keywords_set:
jieba.add_word(word, freq=1000, tag="user_defined")
tractate_vector_dict = {}
es_result = get_new_user_tractate_info()
# count = 0
for i in es_result:
# count += 1
source = i["_source"]
sentences = source["keynote_sentence"]
id = source["id"]
res = get_tractate_vector(sentences, all_keywords_set, model)
print(res)
# sentences = [
# "是不是肋软骨钙化就只能做异体骨修复鼻子?",
# "",
# "",
# "随着肋骨隆鼻的流行,想做肋骨隆鼻的人越来越多,但是又听说肋骨钙化就不能做这项手术了,因此想要了解清楚这究竟是什么情况?",
# "是不是不能做鼻综合手术了?",
# "",
# "",
# "肋骨包括硬骨和软骨,肋软骨位于肋骨的前端,为透明软骨,具有一定的弹性,参与胸廓的构成。",
# "鼻综合手术时一般选择的是第6、7肋的软骨。",
# "未钙化的肋软骨呈玉白色,有一定的柔软性,易取易雕刻。",
# "随着年龄增长,人的肋软骨会逐渐变成硬骨,这就是“肋软骨钙化”。",
# "",
# "",
# "而未完全钙化的软骨,钙化部分呈现淡黄色,易碎易折;",
# "重度钙化的骨性部分还因为缺乏骨膜等软组织的血液供养,抗感染性变差,加之细节的雕刻和缝合困难,常常导致手术效果不理想。",
# "",
# "",
# "一般肋软骨的钙化从25岁开始,年龄越大,钙化的几率越高。",
# "此外,肋软骨的钙化跟个人饮食、环境的改变可能也有关系。",
# "而且随着现在的环境的变化、人们饮食结构的变化,钙化的情也越来越多了,所以如果想做肋骨隆鼻还是要近早手术。",
# ]
# word_list = []
# for s in sentences:
# tmp_lst = []
# for i in jieba.lcut(s):
# s = i.encode("utf-8")
# if s in all_keywords_set:
# tmp_lst.append(s)
# word_list.append(tmp_lst)
# print("11111111111111")
# print(word_list)
# count = 0
# res = {}
# for lst in word_list:
# vecs = []
# for name in lst:
# try:
# vecs.append(model.wv.get_vector(name))
# except Exception as e:
# # print(e)
# pass
# if vecs:
# n = np.average(vecs, axis=0)
# res[count] = n
# count += 1
# print("222222222222")
# print(res)
# res2 = {}
# for name in ["软骨垫鼻基底", "肋软骨隆鼻", "软骨隆鼻", "鼻综合", "肋软骨", "隆鼻", "肋骨隆鼻"]:
# try:
# vec = model.wv.get_vector(name)
# res2[name] = vec
# except Exception as e:
# print(e)
# print("33333333333")
# print(res2)
# save_tractate_vector_to_redis(all_keywords_set, model)
all_tags_lst = get_all_business_tags()
print("all tags: " + str(len(all_tags_lst)))
print(all_tags_lst[:5])
# save_tag_vector_to_redis(all_tags_lst, model)
......@@ -22,6 +22,25 @@ def get_data_from_jerry_test(sql):
return traceback.format_exc()
def get_data_from_zhengxing(sql):
    """Run ``sql`` against the ``zhengxing`` MySQL database and return all rows.

    The statement is executed verbatim, so callers must only pass trusted,
    fully formed SQL.

    Args:
        sql: the SQL statement to execute.

    Returns:
        The result of ``cursor.fetchall()``; rows are dicts keyed by column
        name because the connection uses ``DictCursor``. If anything fails,
        the traceback is printed and returned as a string instead of rows.
    """
    db = None
    try:
        # NOTE(review): connection credentials are hard-coded here; they should
        # be loaded from configuration or the environment instead.
        db = pymysql.connect(host="172.16.30.141",
                             port=3306,
                             user="zx_str",
                             passwd="ZXueX58pStrage",
                             db="zhengxing",
                             charset="utf8",
                             cursorclass=pymysql.cursors.DictCursor)
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception:
        print(traceback.format_exc())
        return traceback.format_exc()
    finally:
        # Close the connection on every path; previously a failing query
        # skipped the close and leaked the connection.
        if db is not None:
            db.close()
def get_device_click_tractate_ids():
sql = "select device, tractate_ids from device_click_tractate"
data = get_data_from_jerry_test(sql)
......@@ -30,3 +49,9 @@ def get_device_click_tractate_ids():
for i in data:
res[i["device"]] = i["tractate_ids"].rstrip("\n").split(",")
return res
def get_all_business_tags():
    """Return the names of the online tags selected from ``api_tag``.

    The query keeps rows with ``is_online = 1`` whose ``tag_type`` compares
    numerically below 4.

    Returns:
        A list holding the ``name`` column of every matching row, or an empty
        list when the query failed (``get_data_from_zhengxing`` reports a
        failure by returning a traceback string instead of rows).
    """
    sql = "select name from api_tag where tag_type+0<'4'+0 and is_online = 1"
    data = get_data_from_zhengxing(sql)
    print(data)
    if not isinstance(data, (list, tuple)):
        # The query helper handed back its traceback text rather than rows.
        return []
    # Callers take len() of the result and iterate over tag names, so return
    # the names instead of falling off the end with None.
    return [row["name"] for row in data]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment