Commit 6d81b5f7 authored by 郭羽

美购 fine-ranking model

parent 338ffc4e
@@ -12,11 +12,11 @@ one_hot_columns = ["service_type","doctor_type","doctor_famous","hospital_city_t
# history_columns = ["userRatedHistory"]
# Load the data
data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
# data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
# data_path_train = "/data/files/service_feature_train.csv"
# data_path_test = "/data/files/service_feature_test.csv"
data_path_train = "/data/files/service_feature_train.csv"
data_path_test = "/data/files/service_feature_test.csv"
version = "v1"
model_file = "service_mlp_"+version
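# NOTE: with version = "v1" this resolves to "service_mlp_v1"; the trained model
# is presumably saved under this name and passed to predict() below (inferred
# from the code, not shown in the diff).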
@@ -27,16 +27,17 @@ def getDataVocabFromRedis(version):
    dataVocabStr = conn.get(key)
    if dataVocabStr:
        dataVocab = json.loads(dataVocabStr, encoding='utf-8')
+        print("-----data_vocab-----")
+        for k, v in dataVocab.items():
+            print(k, len(v))
    else:
        dataVocab = None
-    print("-----data_vocab-----")
-    for k, v in data_vocab.items():
-        print(k, len(v))
    return dataVocab
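# For reference, dataVocab is assumed to map each discrete feature column to its
# list of category values, e.g. {"service_type": ["-1", "0", "1"], ...}; the
# loop above prints each column name with its vocabulary size.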
# Convert the CSV column types
def csvTypeConvert(df,data_vocab):
-    df = df.fillna("-1")
+    # Fill NAs in the discrete (categorical) columns
+    for k, v in data_vocab.items():
+        df[k] = df[k].fillna("-1")
@@ -47,6 +48,7 @@ def csvTypeConvert(df,data_vocab):
        df[k] = df[k].astype("float")
    df["label"] = df["label"].astype("int")
+    print(df.dtypes)
    return df
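# After conversion the vocab columns presumably stay as strings (categorical
# inputs), the numeric item columns are float, and label is int -- which is
# what the print(df.dtypes) check above should report.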
def loadData(data_path):
@@ -85,9 +87,7 @@ def getTrainColumns(train_columns,data_vocab):
def train(columns,train_dataset):
    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(columns),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
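# The tail of this Sequential is cut off by the diff; a minimal sketch of a
# plausible completion for a binary ranking model (an assumption, not the
# committed code):
#         tf.keras.layers.Dense(1, activation='sigmoid'),
#     ])
#     model.compile(loss='binary_crossentropy', optimizer='adam',
#                   metrics=['accuracy', tf.keras.metrics.AUC()])
#     model.fit(train_dataset, epochs=5)
#     return model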
@@ -118,29 +118,44 @@ def evaluate(model,test_dataset):
print("验证耗时s:", int(round(time.time())) - timestmp1)
def predict(model_path,df):
    print("Loading model...")
    model_new = tf.keras.models.load_model(model_path)  # was the hardcoded "service_fm_v3"; use the path that is passed in
    # model_new.summary()
    print("Model loaded.")
    # model = tf.keras.models.model_from_json(model.to_json)
    n = 1000
    dd = dict(df.sample(n=n))
    for i in range(10):
        timestmp1 = int(round(time.time() * 1000))
        model_new.predict(dd, batch_size=10000)
        print("Test samples: {}, prediction time (ms): {}".format(n, int(round(time.time() * 1000)) - timestmp1))
if __name__ == '__main__':
    # Load the data vocab from redis
    print("Loading the data vocab from redis...")
    data_vocab = getDataVocabFromRedis(version)
-    assert not data_vocab
+    assert data_vocab
print("读取数据...")
-    timestmp1 = int(round(time.time() * 1000))
+    timestmp1 = int(round(time.time()))
    df_train = loadData(data_path_train)
    df_test = loadData(data_path_test)
-    timestmp2 = int(round(time.time() * 1000))
-    print("Data loading time (ms): {}".format(timestmp2 - timestmp1))
+    timestmp2 = int(round(time.time()))
+    print("Data loading time (s): {}".format(timestmp2 - timestmp1))
-    df_train = df_train[list(data_vocab.keys()) + ITEM_NUMBER_COLUMNS + ["label"]]
-    df_test = df_test[list(data_vocab.keys()) + ITEM_NUMBER_COLUMNS + ["label"]]
+    # df_train = df_train[list(data_vocab.keys()) + ITEM_NUMBER_COLUMNS + ["label"]]
+    # df_test = df_test[list(data_vocab.keys()) + ITEM_NUMBER_COLUMNS + ["label"]]
    trainSize = df_train["label"].count()
    testSize = df_test["label"].count()
    print("trainSize:{}, testSize:{}".format(trainSize, testSize))
    # Convert the data types
-    df_train = csvTypeConvert(df_train)
-    df_test = csvTypeConvert(df_test)
+    df_train = csvTypeConvert(df_train, data_vocab)
+    df_test = csvTypeConvert(df_test, data_vocab)
    columns = df_train.columns.tolist()
    # Build the training data
@@ -149,8 +164,12 @@ if __name__ == '__main__':
    # Get the training feature columns
    columns = getTrainColumns(columns, data_vocab)
+    timestmp3 = int(round(time.time()))
    model = train(columns, train_data)
+    timestmp4 = int(round(time.time()))
+    print("Training time (h): {}".format((timestmp4 - timestmp3) / 60 / 60))
    # evaluate(model,test_data)
    predict(model_file, test_data)
    pass
path=/srv/apps/serviceRec
day_count=$1
content_type="service"
-pythonFile=${path}/shell/service_feature_csv_export.py
+pythonFile=${path}/shell/featureEng.py
#log_file=~/${content_type}_feature_csv_export.log
/opt/hadoop/bin/hdfs dfs -rmr /${content_type}_feature_train
/opt/hadoop/bin/hdfs dfs -rmr /${content_type}_feature_test
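# Remove the previous HDFS feature exports so the (truncated) featureEng.py job
# below can rewrite them; note `hdfs dfs -rmr` is deprecated on newer Hadoop in
# favour of `hdfs dfs -rm -r`, though both behave the same here.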
@@ -60,6 +60,8 @@ def addItemFeatures(samples,itemDF):
    # Process the discrete multi-value features
    for c, v in ITEM_MULTI_COLUMN_EXTRA_MAP.items():
+        print("null count:", c, samples.filter(col(c).isNull()).count())
+        samples = samples.withColumn(c, F.when(F.col(c).isNull(), "-1").otherwise(F.col(c)))
        for i in range(1, v + 1):
            new_c = c + "__" + str(i)
            samples = samples.withColumn(new_c, F.split(F.col(c), ",")[i - 1])
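# e.g. with v = 3, a cell "a,b" in column c expands to c__1 = "a", c__2 = "b",
# c__3 = null, since an out-of-range index on F.split's result yields null.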
@@ -575,14 +577,21 @@ if __name__ == '__main__':
    ratingSamplesWithLabel = addSampleLabel(ratingDF)
    print("Processing item features...")
+    timestmp1 = int(round(time.time()))
    samplesWithItemFeatures = addItemFeatures(ratingSamplesWithLabel, itemDF)
+    timestmp2 = int(round(time.time()))
+    print("Item feature processing time (s): {}".format(timestmp2 - timestmp1))
    print("Processing user features...")
    samplesWithUserFeatures = addUserFeatures(samplesWithItemFeatures)
+    timestmp3 = int(round(time.time()))
+    print("User feature processing time (s): {}".format(timestmp3 - timestmp2))
    # Build the vocab for the discrete features
    print("Building the data vocab...")
    dataVocab = getDataVocab(samplesWithUserFeatures)
+    timestmp4 = int(round(time.time()))
+    print("Data vocab build time (s): {}".format(timestmp4 - timestmp3))
    # Serialize the vocab to json and store it in redis
    print("Writing the data vocab to redis...")
    dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
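# The actual write is truncated by the diff; presumably dataVocabStr is stored
# under a versioned key via conn.set(...) so that getDataVocabFromRedis(version)
# in the training script can read it back (the key name is not shown here).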
import redis

def getRedisConn():
-    # pool = redis.ConnectionPool(host="172.16.50.145",password="XfkMCCdWDIU%ls$h",port=6379,db=0)
-    # conn = redis.Redis(connection_pool=pool)
+    pool = redis.ConnectionPool(host="172.16.50.145",password="XfkMCCdWDIU%ls$h",port=6379,db=0)
+    conn = redis.Redis(connection_pool=pool)
    # conn = redis.Redis(host="172.16.50.145", port=6379, password="XfkMCCdWDIU%ls$h",db=0)
-    conn = redis.Redis(host="172.18.51.10", port=6379,db=0) #test
+    # conn = redis.Redis(host="172.18.51.10", port=6379,db=0) #test
    return conn
\ No newline at end of file