Commit c51d4fe8 authored by 张彦钊's avatar 张彦钊

refactor the process data file

parent 61438b13
...@@ -2,9 +2,8 @@ from eda.ml_tools.rocCurve import get_roc_curve ...@@ -2,9 +2,8 @@ from eda.ml_tools.rocCurve import get_roc_curve
import pandas as pd import pandas as pd
from config import * from config import *
if __name__ == "__main__": if __name__ == "__main__":
test = pd.read_csv(DIRECTORY_PATH + "test.csv",header = None) test = pd.read_csv(DIRECTORY_PATH + "test.csv", header=None)
test_label = test[0].apply(lambda x: x[0]).values test_label = test[0].apply(lambda x: x[0]).values
predict = pd.read_csv(DIRECTORY_PATH + "output.txt",header = None)[0].values predict = pd.read_csv(DIRECTORY_PATH + "output.txt", header=None)[0].values
get_roc_curve(test_label,predict) get_roc_curve(test_label, predict, "1")
DIRECTORY_PATH = '/home/zhangyanzhao/' DIRECTORY_PATH = '/home/zhangyanzhao/'
VALIDATION_DATE = '2018-08-05'
TEST_DATE = '2018-08-06'
DATA_START_DATE = '2018-07-06'
DATA_END_DATE = '2018-08-06'
# processData.py # processData.py
......
import xlearn as xl import xlearn as xl
from config import * from config import *
print("Start training") print("Start training")
ffm_model = xl.create_ffm() ffm_model = xl.create_ffm()
ffm_model.setTrain(DIRECTORY_PATH + "data.csv") ffm_model.setTrain(DIRECTORY_PATH + "train.csv")
ffm_model.setValidate(DIRECTORY_PATH + "validation.csv") ffm_model.setValidate(DIRECTORY_PATH + "validation.csv")
param = {'task': 'binary', 'lr': 0.03, 'lambda': 0.002, 'metric': 'auc'}
param = {'task':'binary', 'lr':0.03,
'lambda':0.002, 'metric':'auc'}
ffm_model.fit(param, DIRECTORY_PATH + "model.out") ffm_model.fit(param, DIRECTORY_PATH + "model.out")
...@@ -19,8 +15,3 @@ ffm_model.setTest(DIRECTORY_PATH + "test.csv") ...@@ -19,8 +15,3 @@ ffm_model.setTest(DIRECTORY_PATH + "test.csv")
ffm_model.setSigmoid() ffm_model.setSigmoid()
ffm_model.predict(DIRECTORY_PATH + "model.out", ffm_model.predict(DIRECTORY_PATH + "model.out",
DIRECTORY_PATH + "output.txt") DIRECTORY_PATH + "output.txt")
import pandas as pd import pandas as pd
from sklearn import metrics from sklearn import metrics
# import matplotlib.pyplot as plt
from sklearn.metrics import auc from sklearn.metrics import auc
# import argparse # import argparse
def get_roc_curve(y,pred,pos_label):
# parser = argparse.ArgumentParser()
# parser.add_argument('test_label',help='The filename of the test_label')
# parser.add_argument('test_pred',help='The filename of the test_pred')
# # parser.add_argument('output_photo',help='The filename of the output_photo')
# args = parser.parse_args()
def get_roc_curve(label,pred):
""" """
计算二分类问题的roc和auc 计算二分类问题的roc和auc
""" """
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label)
test_label = pd.read_table(label) AUC = metrics.auc(fpr, tpr)
pred_label = pd.read_table(pred)
y = test_label.values
p = pred_label.values
fpr, tpr, thresholds = metrics.roc_curve(y, p)
# plt.plot(fpr,tpr,marker = 'o')
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.title("roc_curev")
AUC = auc(fpr, tpr)
AUC = "auc={}".format(AUC)
# plt.text(0.5,0.8,AUC,color='blue',ha='center')
# # plt.savefig(output)
print(AUC) print(AUC)
#
# if __name__ == "__main__":
# get_roc_curve(args.test_label,args.test_pred)
...@@ -5,7 +5,7 @@ import pandas as pd ...@@ -5,7 +5,7 @@ import pandas as pd
from config import * from config import *
exposure, click, click_device_id = fetch_data( exposure, click, click_device_id = fetch_data(
start_date='2018-08-03', end_date='2018-08-06') start_date=DATA_START_DATE, end_date=DATA_END_DATE)
# 求曝光表和点击表的差集合 # 求曝光表和点击表的差集合
print("曝光表处理前的样本个数") print("曝光表处理前的样本个数")
...@@ -14,7 +14,7 @@ print(exposure.shape) ...@@ -14,7 +14,7 @@ print(exposure.shape)
exposure = exposure.append(click) exposure = exposure.append(click)
exposure = exposure.append(click) exposure = exposure.append(click)
subset = click.columns.tolist() subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset,keep=False) exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数") print("差集后曝光表个数")
print(exposure.shape) print(exposure.shape)
...@@ -33,23 +33,22 @@ print(exposure.shape[0]) ...@@ -33,23 +33,22 @@ print(exposure.shape[0])
# 合并点击表和曝光表 # 合并点击表和曝光表
data = click.append(exposure) data = click.append(exposure)
data = data.sort_values(by="stat_date",ascending=False) data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据") print("前两行数据")
print(data.head(2)) print(data.head(2))
print("后两行数据") print("后两行数据")
print(data.tail(2)) print(data.tail(2))
test_number = data[data["stat_date"]=='2018-08-06'].shape[0] test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"]=='2018-08-05'].shape[0] validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
data = data.drop("stat_date",axis=1) data = data.drop("stat_date", axis=1)
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征 # 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data.loc[data["hour"]==0,["hour"]] = 24 data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"]==0,["minute"]] = 60 data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category") data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category") data["minute"] = data["minute"].astype("category")
print(data.head(2)) print(data.head(2))
print("Start ffm transform") print("Start ffm transform")
start = time.time() start = time.time()
ffm_train = FFMFormatPandas() ffm_train = FFMFormatPandas()
...@@ -57,9 +56,10 @@ data = ffm_train.fit_transform(data, y='y') ...@@ -57,9 +56,10 @@ data = ffm_train.fit_transform(data, y='y')
print("done transform ffm") print("done transform ffm")
end = time.time() end = time.time()
print("ffm转化数据耗时:") print("ffm转化数据耗时:")
print(end-start) print(end - start)
data.to_csv(DIRECTORY_PATH + "data.csv",index=False)
data = pd.read_csv(DIRECTORY_PATH + "data.csv",header=None) data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None)
print("数据集大小") print("数据集大小")
print(data.shape) print(data.shape)
print(data.head(2)) print(data.head(2))
...@@ -67,13 +67,14 @@ print(data.head(2)) ...@@ -67,13 +67,14 @@ print(data.head(2))
test = data.loc[:test_number] test = data.loc[:test_number]
print("测试集大小") print("测试集大小")
print(test.shape[0]) print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test.csv",index = False,header = None) test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
validation = data.loc[(test_number+1):(test_number+validation_number)] validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小") print("验证集大小")
print(validation.shape[0]) print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation.csv",index = False,header = None) validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
train = data.loc[(test_number+validation_number+1):] train = data.loc[(test_number + validation_number + 1):]
print("训练集大小") print("训练集大小")
print(train.shape[0]) print(train.shape[0])
train.to_csv(DIRECTORY_PATH + "train.csv",index = False,header = None) # TODO validation date is not the end of train date
\ No newline at end of file train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment