Commit e4eb4cc9 authored by 郭羽

美购精排模型

parent d60b9499
...@@ -70,8 +70,9 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128): ...@@ -70,8 +70,9 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
return dataSet return dataSet
def getTrainColumns(train_columns,data_vocab): def getTrainColumns(train_columns,data_vocab):
deep_columns = [] emb_columns = []
wide_columns = [] number_columns = []
oneHot_columns = []
dataColumns = [] dataColumns = []
inputs = {} inputs = {}
# 离散特征 # 离散特征
...@@ -80,20 +81,20 @@ def getTrainColumns(train_columns,data_vocab): ...@@ -80,20 +81,20 @@ def getTrainColumns(train_columns,data_vocab):
if feature.count("__")>0: if feature.count("__")>0:
cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature]) cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
col = tf.feature_column.embedding_column(cat_col, 5) col = tf.feature_column.embedding_column(cat_col, 5)
deep_columns.append(col) emb_columns.append(col)
dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string') inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
elif feature in one_hot_columns or feature.count("Bucket") > 0: elif feature in one_hot_columns or feature.count("Bucket") > 0:
cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature]) cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
# col = tf.feature_column.indicator_column(cat_col) # col = tf.feature_column.indicator_column(cat_col)
col = tf.feature_column.embedding_column(cat_col, 3) col = tf.feature_column.embedding_column(cat_col, 3)
wide_columns.append(col) oneHot_columns.append(col)
dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string') inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
else: else:
cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature]) cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
col = tf.feature_column.embedding_column(cat_col, 10) col = tf.feature_column.embedding_column(cat_col, 10)
deep_columns.append(col) emb_columns.append(col)
dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string') inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
# if feature.startswith("userRatedHistory") or feature.count("__") > 0 or feature in embedding_columns: # if feature.startswith("userRatedHistory") or feature.count("__") > 0 or feature in embedding_columns:
...@@ -113,19 +114,17 @@ def getTrainColumns(train_columns,data_vocab): ...@@ -113,19 +114,17 @@ def getTrainColumns(train_columns,data_vocab):
elif feature in ITEM_NUMBER_COLUMNS: elif feature in ITEM_NUMBER_COLUMNS:
col = tf.feature_column.numeric_column(feature) col = tf.feature_column.numeric_column(feature)
wide_columns.append(col) number_columns.append(col)
dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32') inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
return deep_columns,wide_columns,dataColumns,inputs return emb_columns,number_columns,oneHot_columns,dataColumns,inputs
def train(deep_columns,wide_columns,inputs,train_dataset): def train(emb_columns, number_columns, oneHot_columns, inputs, train_dataset):
wide = tf.keras.layers.DenseFeatures(deep_columns + wide_columns)(inputs) wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns + oneHot_columns)(inputs)
deep = tf.keras.layers.DenseFeatures(deep_columns)(inputs) deep = tf.keras.layers.Dense(64, activation='relu')(wide)
deep = tf.keras.layers.Dense(64, activation='relu')(deep)
deep = tf.keras.layers.Dropout(0.2)(deep) deep = tf.keras.layers.Dropout(0.2)(deep)
concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1) concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)
...@@ -134,15 +133,13 @@ def train(deep_columns,wide_columns,inputs,train_dataset): ...@@ -134,15 +133,13 @@ def train(deep_columns,wide_columns,inputs,train_dataset):
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer) output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer)
# output_layer = FM(1)(deep) # output_layer = FM(1)(deep)
# deep = tf.keras.layers.DenseFeatures(columns)(inputs)
# deep = tf.keras.layers.Dense(64, activation='relu')(deep)
# deep = tf.keras.layers.Dropout(0.2)(deep)
# deep = tf.keras.layers.Dense(64, activation='relu')(deep)
# deep = tf.keras.layers.Dropout(0.2)(deep)
# output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(deep)
model = tf.keras.Model(inputs, output_layer) model = tf.keras.Model(inputs, output_layer)
# model = tf.keras.Sequential([
# tf.keras.layers.DenseFeatures(columns)(inputs),
# tf.keras.layers.Dense(128, activation='relu')(inputs),
# tf.keras.layers.Dense(128, activation='relu')(inputs),
# tf.keras.layers.Dense(1, activation='sigmoid'),
# ])
# compile the model, set loss function, optimizer and evaluation metrics # compile the model, set loss function, optimizer and evaluation metrics
model.compile( model.compile(
...@@ -151,13 +148,13 @@ def train(deep_columns,wide_columns,inputs,train_dataset): ...@@ -151,13 +148,13 @@ def train(deep_columns,wide_columns,inputs,train_dataset):
metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC'), tf.keras.metrics.AUC(curve='PR')]) metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC'), tf.keras.metrics.AUC(curve='PR')])
# train the model # train the model
model.fit(train_dataset, epochs=10) print("train start...")
model.fit(train_dataset, epochs=3)
print("train end...")
print("train save...") print("train save...")
model.save(model_file, include_optimizer=False, save_format='tf') model.save(model_file, include_optimizer=False, save_format='tf')
return model
def evaluate(model,test_dataset): def evaluate(model,test_dataset):
# evaluate the model # evaluate the model
...@@ -199,7 +196,7 @@ if __name__ == '__main__': ...@@ -199,7 +196,7 @@ if __name__ == '__main__':
# 获取训练列 # 获取训练列
columns = df_train.columns.tolist() columns = df_train.columns.tolist()
deep_columns,wide_columns, datasColumns,inputs = getTrainColumns(columns, data_vocab) emb_columns,number_columns,oneHot_columns, datasColumns,inputs = getTrainColumns(columns, data_vocab)
df_train = df_train[datasColumns + ["label"]] df_train = df_train[datasColumns + ["label"]]
df_test = df_test[datasColumns + ["label"]] df_test = df_test[datasColumns + ["label"]]
...@@ -219,7 +216,7 @@ if __name__ == '__main__': ...@@ -219,7 +216,7 @@ if __name__ == '__main__':
print("train start...") print("train start...")
timestmp3 = int(round(time.time())) timestmp3 = int(round(time.time()))
model = train(deep_columns,wide_columns,inputs,train_data) model = train(emb_columns,number_columns,oneHot_columns,inputs,train_data)
timestmp4 = int(round(time.time())) timestmp4 = int(round(time.time()))
print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60)) print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment