Commit 26cdbadd authored by 郭羽

特征工程优化

parent 044f71fc
...@@ -177,6 +177,10 @@ def addDays(n, format="%Y%m%d"): ...@@ -177,6 +177,10 @@ def addDays(n, format="%Y%m%d"):
if __name__ == '__main__': if __name__ == '__main__':
trainDays = int(sys.argv[1])
isTest = False
if (len(sys.argv) >= 3 and "test" == sys.argv[2]):
isTest = True
curTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) curTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print("train_service执行时间:{}".format(curTime)) print("train_service执行时间:{}".format(curTime))
splitTimestamp = int(time.mktime(time.strptime(addDays(-1), "%Y%m%d"))) splitTimestamp = int(time.mktime(time.strptime(addDays(-1), "%Y%m%d")))
...@@ -189,6 +193,11 @@ if __name__ == '__main__': ...@@ -189,6 +193,11 @@ if __name__ == '__main__':
print("读取数据...") print("读取数据...")
timestmp1 = int(round(time.time())) timestmp1 = int(round(time.time()))
df_train = loadData(data_path_train) df_train = loadData(data_path_train)
print(df_train.dtypes)
df_test = df_train.loc[df_train['timestamp']>=splitTimestamp]
if isTest:
df_train = df_train.loc[df_train['timestamp'] < splitTimestamp]
# df_test = loadData(data_path_test) # df_test = loadData(data_path_test)
timestmp2 = int(round(time.time())) timestmp2 = int(round(time.time()))
print("读取数据耗时s:{}".format(timestmp2 - timestmp1)) print("读取数据耗时s:{}".format(timestmp2 - timestmp1))
...@@ -202,20 +211,20 @@ if __name__ == '__main__': ...@@ -202,20 +211,20 @@ if __name__ == '__main__':
print(datasColumns) print(datasColumns)
df_train = df_train[datasColumns + ["label"]] df_train = df_train[datasColumns + ["label"]]
# df_test = df_test[datasColumns + ["label"]] df_test = df_test[datasColumns + ["label"]]
trainSize = df_train["label"].count() trainSize = df_train["label"].count()
print("trainSize:{}".format(trainSize)) print("trainSize:{}".format(trainSize))
# testSize = df_test["label"].count() testSize = df_test["label"].count()
# print("trainSize:{},testSize{}".format(trainSize,testSize)) print("trainSize:{},testSize{}".format(trainSize,testSize))
# 数据类型转换 # 数据类型转换
df_train = csvTypeConvert(datasColumns,df_train,data_vocab) df_train = csvTypeConvert(datasColumns,df_train,data_vocab)
# df_test = csvTypeConvert(datasColumns,df_test,data_vocab) df_test = csvTypeConvert(datasColumns,df_test,data_vocab)
# 获取训练数据 # 获取训练数据
train_data = getDataSet(df_train,shuffleSize=trainSize,) train_data = getDataSet(df_train,shuffleSize=trainSize,)
# test_data = getDataSet(df_test,shuffleSize=testSize) test_data = getDataSet(df_test,shuffleSize=testSize)
print("train start...") print("train start...")
...@@ -224,7 +233,7 @@ if __name__ == '__main__': ...@@ -224,7 +233,7 @@ if __name__ == '__main__':
timestmp4 = int(round(time.time())) timestmp4 = int(round(time.time()))
print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60)) print("train end...耗时h:{}".format((timestmp4 - timestmp3)/60/60))
# evaluate(model,test_data) evaluate(model,test_data)
# predict(model_file,test_data) # predict(model_file,test_data)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment