Commit 5e1859bf authored by 郭羽

service model optimization

parent 9dcf2b68
@@ -10,16 +10,6 @@ from datetime import date, timedelta
 sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
 import utils.configUtils as configUtils
-ITEM_NUMBER_COLUMNS = ["item_"+c for c in ["smart_rank2"]]
-embedding_columns = ["itemid","userid"] + ["item_"+c for c in ["doctor_id","hospital_id","merchant_id"]]
-multi_columns = ["tags_v3","first_demands","second_demands","first_solutions","second_solutions","first_positions","second_positions"]
-one_hot_columns = ["user_os"] + ["item_"+c for c in ["service_type","doctor_type","doctor_famous","hospital_city_tag_id","hospital_type","hospital_is_high_quality"]]
-# history_columns = ["userRatedHistory"]
-# data loading
-# data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
-# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
 VERSION = configUtils.SERVICE_VERSION
 trainDay = time.strftime("%Y%m%d%H", time.localtime())
 data_path_train = "/data/files/service_feature_{}_train.csv".format(VERSION)
@@ -84,43 +74,25 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
 def getTrainColumns(train_columns,data_vocab):
     emb_columns = []
     number_columns = []
-    oneHot_columns = []
-    dataColumns = []
     inputs = {}
     # discrete features
     for feature in train_columns:
         if data_vocab.get(feature):
-            if feature.count("__")>0:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
-                col = tf.feature_column.embedding_column(cat_col, 5)
-                emb_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-            elif feature in one_hot_columns or feature.count("Bucket") > 0:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
-                # col = tf.feature_column.indicator_column(cat_col)
-                col = tf.feature_column.embedding_column(cat_col, 3)
-                oneHot_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-            else:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
-                col = tf.feature_column.embedding_column(cat_col, 10)
-                emb_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-        elif feature in ITEM_NUMBER_COLUMNS:
+            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
+            col = tf.feature_column.embedding_column(cat_col, 10)
+            emb_columns.append(col)
+            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+        elif feature.endswith("_number"):
             col = tf.feature_column.numeric_column(feature)
             number_columns.append(col)
-            dataColumns.append(feature)
             inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
-    return emb_columns,number_columns,oneHot_columns,dataColumns,inputs
+    return emb_columns,number_columns,inputs
-def train(emb_columns, number_columns, oneHot_columns, inputs, train_dataset):
-    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns + oneHot_columns)(inputs)
+def train(emb_columns, number_columns, inputs, train_dataset):
+    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns)(inputs)
     deep = tf.keras.layers.Dense(64, activation='relu')(wide)
     deep = tf.keras.layers.Dropout(0.2)(deep)
     concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)
@@ -193,6 +165,7 @@ if __name__ == '__main__':
     timestmp1 = int(round(time.time()))
     df_train = loadData(data_path_train)
     print(df_train.dtypes)
+    print("训练数据列:",df_train.columns)
     df_test = df_train.loc[df_train['timestamp']>=splitTimestamp]
     df_train = df_train.loc[df_train['timestamp'] < splitTimestamp]
@@ -204,8 +177,9 @@ if __name__ == '__main__':
     columns = df_train.columns.tolist()
     print("原始数据列:")
     print(columns)
-    emb_columns,number_columns,oneHot_columns, datasColumns,inputs = getTrainColumns(columns, data_vocab)
+    emb_columns,number_columns,inputs = getTrainColumns(columns, data_vocab)
     print("训练列:")
+    datasColumns = list(inputs.keys())
     print(datasColumns)
     df_train = df_train[datasColumns + ["label"]]
@@ -226,7 +200,7 @@ if __name__ == '__main__':
     print("train start...")
     timestmp3 = int(round(time.time()))
-    model = train(emb_columns,number_columns,oneHot_columns,inputs,train_data)
+    model = train(emb_columns,number_columns,inputs,train_data)
     timestmp4 = int(round(time.time()))
     print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60))
...
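For reference, a minimal self-contained sketch of the simplified pipeline this diff lands on: every vocabulary feature gets a 10-dim embedding, a "_number" suffix marks numeric features, and the resulting wide layer feeds a small deep tower. The toy vocabulary, feature names, and the sigmoid output head are illustrative assumptions (the diff is truncated after the concat layer); the feature-column calls are the TF 2.x API used in the file.

import tensorflow as tf

# Toy stand-ins for the real data_vocab / column list (assumptions, not repo values).
data_vocab = {"user_os": ["ios", "android"]}
train_columns = ["user_os", "smart_rank_number"]

emb_columns, number_columns, inputs = [], [], {}
for feature in train_columns:
    if data_vocab.get(feature):
        # every vocab feature becomes a 10-dim embedding, as in the new getTrainColumns
        cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=data_vocab[feature])
        emb_columns.append(tf.feature_column.embedding_column(cat_col, 10))
        inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
    elif feature.endswith("_number"):
        # the "_number" suffix now marks numeric features
        number_columns.append(tf.feature_column.numeric_column(feature))
        inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')

# wide features feed a small deep tower, mirroring train() above
wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns)(inputs)
deep = tf.keras.layers.Dense(64, activation='relu')(wide)
deep = tf.keras.layers.Dropout(0.2)(deep)
concat = tf.keras.layers.concatenate([wide, deep], axis=1)
output = tf.keras.layers.Dense(1, activation='sigmoid')(concat)  # head is an assumption
model = tf.keras.Model(inputs, output)
model.summary()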
@@ -10,6 +10,16 @@ from datetime import date, timedelta
 sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
 import utils.configUtils as configUtils
+ITEM_NUMBER_COLUMNS = ["item_"+c for c in ["smart_rank2"]]
+embedding_columns = ["itemid","userid"] + ["item_"+c for c in ["doctor_id","hospital_id","merchant_id"]]
+multi_columns = ["tags_v3","first_demands","second_demands","first_solutions","second_solutions","first_positions","second_positions"]
+one_hot_columns = ["user_os"] + ["item_"+c for c in ["service_type","doctor_type","doctor_famous","hospital_city_tag_id","hospital_type","hospital_is_high_quality"]]
+# history_columns = ["userRatedHistory"]
+# data loading
+# data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
+# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
 VERSION = configUtils.SERVICE_VERSION
 trainDay = time.strftime("%Y%m%d%H", time.localtime())
 data_path_train = "/data/files/service_feature_{}_train.csv".format(VERSION)
@@ -74,25 +84,43 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
 def getTrainColumns(train_columns,data_vocab):
     emb_columns = []
     number_columns = []
+    oneHot_columns = []
+    dataColumns = []
     inputs = {}
     # discrete features
     for feature in train_columns:
         if data_vocab.get(feature):
-            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
-            col = tf.feature_column.embedding_column(cat_col, 10)
-            emb_columns.append(col)
-            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-        elif feature.endswith("_number"):
+            if feature.count("__")>0:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
+                col = tf.feature_column.embedding_column(cat_col, 5)
+                emb_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            elif feature in one_hot_columns or feature.count("Bucket") > 0:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
+                # col = tf.feature_column.indicator_column(cat_col)
+                col = tf.feature_column.embedding_column(cat_col, 3)
+                oneHot_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            else:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature,vocabulary_list=data_vocab[feature])
+                col = tf.feature_column.embedding_column(cat_col, 10)
+                emb_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+        elif feature in ITEM_NUMBER_COLUMNS:
             col = tf.feature_column.numeric_column(feature)
             number_columns.append(col)
+            dataColumns.append(feature)
             inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
-    return emb_columns,number_columns,inputs
+    return emb_columns,number_columns,oneHot_columns,dataColumns,inputs
-def train(emb_columns, number_columns, inputs, train_dataset):
-    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns)(inputs)
+def train(emb_columns, number_columns, oneHot_columns, inputs, train_dataset):
+    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns + oneHot_columns)(inputs)
     deep = tf.keras.layers.Dense(64, activation='relu')(wide)
     deep = tf.keras.layers.Dropout(0.2)(deep)
     concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)
@@ -165,7 +193,6 @@ if __name__ == '__main__':
     timestmp1 = int(round(time.time()))
     df_train = loadData(data_path_train)
     print(df_train.dtypes)
-    print("训练数据列:",df_train.columns)
     df_test = df_train.loc[df_train['timestamp']>=splitTimestamp]
     df_train = df_train.loc[df_train['timestamp'] < splitTimestamp]
@@ -177,9 +204,8 @@ if __name__ == '__main__':
     columns = df_train.columns.tolist()
     print("原始数据列:")
     print(columns)
-    emb_columns,number_columns,inputs = getTrainColumns(columns, data_vocab)
+    emb_columns,number_columns,oneHot_columns, datasColumns,inputs = getTrainColumns(columns, data_vocab)
     print("训练列:")
-    datasColumns = list(inputs.keys())
     print(datasColumns)
     df_train = df_train[datasColumns + ["label"]]
@@ -200,7 +226,7 @@ if __name__ == '__main__':
     print("train start...")
     timestmp3 = int(round(time.time()))
-    model = train(emb_columns,number_columns,inputs,train_data)
+    model = train(emb_columns,number_columns,oneHot_columns,inputs,train_data)
     timestmp4 = int(round(time.time()))
     print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60))
...
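And a sketch of the branching variant this second file re-adds, which sizes embeddings by feature type: crossed features (names containing "__") get 5 dims, one-hot/bucketized features get 3 dims, and everything else gets 10. The vocabulary below is made up for illustration; only the branching logic mirrors the diff.

import tensorflow as tf

# Illustrative vocab: one crossed feature, one one-hot feature, one id feature (assumptions).
one_hot_columns = ["user_os"]
data_vocab = {
    "tags__os": ["a__ios", "b__android"],  # crossed feature name is hypothetical
    "user_os": ["ios", "android"],
    "itemid": ["101", "102"],
}

emb_columns, oneHot_columns = [], []
for feature, vocab in data_vocab.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    if feature.count("__") > 0:
        # crossed features get small 5-dim embeddings
        emb_columns.append(tf.feature_column.embedding_column(cat_col, 5))
    elif feature in one_hot_columns or feature.count("Bucket") > 0:
        # low-cardinality features get 3-dim embeddings
        # (an indicator/one-hot column was tried and left commented out in the diff)
        oneHot_columns.append(tf.feature_column.embedding_column(cat_col, 3))
    else:
        # id-like features get 10-dim embeddings
        emb_columns.append(tf.feature_column.embedding_column(cat_col, 10))

print(len(emb_columns), len(oneHot_columns))  # -> 2 1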