Commit c51d4fe8 authored by 张彦钊's avatar 张彦钊

refactor the process data file

parent 61438b13
......@@ -2,9 +2,8 @@ from eda.ml_tools.rocCurve import get_roc_curve
import pandas as pd
from config import *
if __name__ == "__main__":
test = pd.read_csv(DIRECTORY_PATH + "test.csv",header = None)
test = pd.read_csv(DIRECTORY_PATH + "test.csv", header=None)
test_label = test[0].apply(lambda x: x[0]).values
predict = pd.read_csv(DIRECTORY_PATH + "output.txt",header = None)[0].values
get_roc_curve(test_label,predict)
predict = pd.read_csv(DIRECTORY_PATH + "output.txt", header=None)[0].values
get_roc_curve(test_label, predict, "1")
DIRECTORY_PATH = '/home/zhangyanzhao/'
VALIDATION_DATE = '2018-08-05'
TEST_DATE = '2018-08-06'
DATA_START_DATE = '2018-07-06'
DATA_END_DATE = '2018-08-06'
# processData.py
......
import xlearn as xl
from config import *
print("Start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain(DIRECTORY_PATH + "data.csv")
ffm_model.setTrain(DIRECTORY_PATH + "train.csv")
ffm_model.setValidate(DIRECTORY_PATH + "validation.csv")
param = {'task':'binary', 'lr':0.03,
'lambda':0.002, 'metric':'auc'}
param = {'task': 'binary', 'lr': 0.03, 'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, DIRECTORY_PATH + "model.out")
......@@ -19,8 +15,3 @@ ffm_model.setTest(DIRECTORY_PATH + "test.csv")
ffm_model.setSigmoid()
ffm_model.predict(DIRECTORY_PATH + "model.out",
DIRECTORY_PATH + "output.txt")
import pandas as pd
from sklearn import metrics
# import matplotlib.pyplot as plt
from sklearn.metrics import auc
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('test_label',help='The filename of the test_label')
# parser.add_argument('test_pred',help='The filename of the test_pred')
# # parser.add_argument('output_photo',help='The filename of the output_photo')
# args = parser.parse_args()
def get_roc_curve(y, pred, pos_label=None):
    """Compute the ROC curve and AUC for a binary classification problem.

    The AUC is printed (as before) and also returned so callers can use it
    programmatically.

    Args:
        y: True binary labels, one per sample (array-like).
        pred: Predicted scores for the positive class, aligned with ``y``.
        pos_label: The label in ``y`` that denotes the positive class. When
            ``None``, scikit-learn's default handling applies.

    Returns:
        The area under the ROC curve as a float.
    """
    # pos_label must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, and a positional third argument raises TypeError.
    fpr, tpr, _thresholds = metrics.roc_curve(y, pred, pos_label=pos_label)
    auc_value = metrics.auc(fpr, tpr)
    print(auc_value)
    return auc_value
#
# if __name__ == "__main__":
# get_roc_curve(args.test_label,args.test_pred)
......@@ -5,7 +5,7 @@ import pandas as pd
from config import *
exposure, click, click_device_id = fetch_data(
start_date='2018-08-03', end_date='2018-08-06')
start_date=DATA_START_DATE, end_date=DATA_END_DATE)
# 求曝光表和点击表的差集合
print("曝光表处理前的样本个数")
......@@ -14,7 +14,7 @@ print(exposure.shape)
exposure = exposure.append(click)
exposure = exposure.append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset,keep=False)
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数")
print(exposure.shape)
......@@ -33,23 +33,22 @@ print(exposure.shape[0])
# 合并点击表和曝光表
data = click.append(exposure)
data = data.sort_values(by="stat_date",ascending=False)
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))
test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
data = data.drop("stat_date",axis=1)
test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
data = data.drop("stat_date", axis=1)
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data.loc[data["hour"]==0,["hour"]] = 24
data.loc[data["minute"]==0,["minute"]] = 60
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))
print("Start ffm transform")
start = time.time()
ffm_train = FFMFormatPandas()
......@@ -57,9 +56,10 @@ data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时:")
print(end-start)
data.to_csv(DIRECTORY_PATH + "data.csv",index=False)
data = pd.read_csv(DIRECTORY_PATH + "data.csv",header=None)
print(end - start)
data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None)
print("数据集大小")
print(data.shape)
print(data.head(2))
......@@ -67,13 +67,14 @@ print(data.head(2))
test = data.loc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test.csv",index = False,header = None)
test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
validation = data.loc[(test_number+1):(test_number+validation_number)]
validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小")
print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation.csv",index = False,header = None)
train = data.loc[(test_number+validation_number+1):]
validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
train = data.loc[(test_number + validation_number + 1):]
print("训练集大小")
print(train.shape[0])
train.to_csv(DIRECTORY_PATH + "train.csv",index = False,header = None)
\ No newline at end of file
# TODO validation date is not the end of train date
train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment