Commit 7fc7ec68 authored by 郭羽

美购精排模型

parent 5493b3f5
...@@ -72,40 +72,45 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128): ...@@ -72,40 +72,45 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
def getTrainColumns(train_columns,data_vocab): def getTrainColumns(train_columns,data_vocab):
columns = [] columns = []
dataColumns = [] dataColumns = []
inputs = {}
# 离散特征 # 离散特征
for feature in train_columns: for feature in train_columns:
if data_vocab.get(feature): # if data_vocab.get(feature):
cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature]) # cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
col = tf.feature_column.embedding_column(cat_col, 10)
columns.append(col)
dataColumns.append(feature)
# if feature.startswith("userRatedHistory") or feature.count("__") > 0 or feature in embedding_columns:
# cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
# col = tf.feature_column.embedding_column(cat_col, 10) # col = tf.feature_column.embedding_column(cat_col, 10)
# columns.append(col) # columns.append(col)
# dataColumns.append(feature) # dataColumns.append(feature)
# # inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
# elif feature in one_hot_columns or feature.count("Bucket") > 0: if feature.startswith("userRatedHistory") or feature.count("__") > 0 or feature in embedding_columns:
# cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature]) cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
# col = tf.feature_column.indicator_column(cat_col) col = tf.feature_column.embedding_column(cat_col, 10)
# columns.append(col) columns.append(col)
# dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
elif feature in one_hot_columns or feature.count("Bucket") > 0:
cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
col = tf.feature_column.indicator_column(cat_col)
columns.append(col)
dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
elif feature in ITEM_NUMBER_COLUMNS: elif feature in ITEM_NUMBER_COLUMNS:
col = tf.feature_column.numeric_column(feature) col = tf.feature_column.numeric_column(feature)
columns.append(col) columns.append(col)
dataColumns.append(feature) dataColumns.append(feature)
inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
return columns,dataColumns return columns,dataColumns,inputs
def train(columns,train_dataset): def train(columns,inputs,train_dataset):
model = tf.keras.Sequential([ deep = tf.keras.layers.DenseFeatures(columns)(inputs)
tf.keras.layers.DenseFeatures(columns), deep = tf.keras.layers.Dense(64, activation='relu')(deep)
tf.keras.layers.Dense(128, activation='relu'), deep = tf.keras.layers.Dense(64, activation='relu')(deep)
tf.keras.layers.Dense(128, activation='relu'), output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(deep)
tf.keras.layers.Dense(1, activation='sigmoid'), model = tf.keras.Model(inputs, output_layer)
])
# compile the model, set loss function, optimizer and evaluation metrics # compile the model, set loss function, optimizer and evaluation metrics
model.compile( model.compile(
...@@ -121,6 +126,7 @@ def train(columns,train_dataset): ...@@ -121,6 +126,7 @@ def train(columns,train_dataset):
model.save(model_file, include_optimizer=False, save_format='tf') model.save(model_file, include_optimizer=False, save_format='tf')
def evaluate(model,test_dataset): def evaluate(model,test_dataset):
# evaluate the model # evaluate the model
timestmp1 = int(round(time.time())) timestmp1 = int(round(time.time()))
...@@ -161,7 +167,7 @@ if __name__ == '__main__': ...@@ -161,7 +167,7 @@ if __name__ == '__main__':
# 获取训练列 # 获取训练列
columns = df_train.columns.tolist() columns = df_train.columns.tolist()
trainColumns, datasColumns = getTrainColumns(columns, data_vocab) trainColumns, datasColumns,inputs = getTrainColumns(columns, data_vocab)
df_train = df_train[datasColumns + ["label"]] df_train = df_train[datasColumns + ["label"]]
df_test = df_test[datasColumns + ["label"]] df_test = df_test[datasColumns + ["label"]]
...@@ -180,7 +186,7 @@ if __name__ == '__main__': ...@@ -180,7 +186,7 @@ if __name__ == '__main__':
timestmp3 = int(round(time.time())) timestmp3 = int(round(time.time()))
model = train(trainColumns,train_data) model = train(trainColumns,inputs,train_data)
timestmp4 = int(round(time.time())) timestmp4 = int(round(time.time()))
print("读取数据耗时h:{}".format((timestmp4 - timestmp3)/60/60)) print("读取数据耗时h:{}".format((timestmp4 - timestmp3)/60/60))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment