Commit e83cb697 authored by 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

add dea/ml_tools/roc_curve.py
parents ef5008d2 0aa8923e
......@@ -4,89 +4,89 @@ import xlearn as xl
import time
from prepareData import fetch_data
# Load the three objects returned by fetch_data(): the exposure table, the
# click table and click_device_id (used further down in an isin() filter on
# the "device_id" column).
exposure, click, click_device_id = fetch_data()
# Everything below, up to the next live statement, is commented-out code.
# exposure, click, click_device_id = fetch_data()
#
# # Set difference: exposure rows minus click rows
# print("曝光表处理前的样本个数")
# print(exposure.shape)
# exposure = exposure.append(click)
# exposure = exposure.append(click)
# subset = click.columns.tolist()
# exposure = exposure.drop_duplicates(subset=subset,keep=False)
# print("差集后曝光表个数")
# print(exposure.shape)
# exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
# print("去除未点击用户后曝光表个数")
# print(exposure.shape)
# # Assign labels
# click["y"] = 1
# exposure["y"] = 0
#
# print("正样本个数")
# print(click.shape[0])
# print("负样本个数")
# print(exposure.shape[0])
#
# # Merge the click table and the exposure table
# data = click.append(exposure)
# data = data.sort_values(by="stat_date",ascending=False)
# print("前两行数据")
# print(data.head(2))
# print("后两行数据")
# print(data.tail(2))
# test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
# validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
# data = data.drop("stat_date",axis=1)
#
# # Features whose value is 0 are dropped by the FFM format; after the
# # remapping below no feature has the value 0
# data.loc[data["hour"]==0,["hour"]] = 24
# data.loc[data["minute"]==0,["minute"]] = 60
# data["hour"] = data["hour"].astype("category")
# data["minute"] = data["minute"].astype("category")
# print(data.head(2))
#
#
# print("start ffm transform")
# start = time.time()
# ffm_train = FFMFormatPandas()
# data = ffm_train.fit_transform(data, y='y')
# print("done transform ffm")
# end = time.time()
# print("ffm转化数据耗时:")
# print(end-start)
# data.to_csv("/home/zhangyanzhao/data.csv",index=False)
# data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
# print("数据集大小")
# print(data.shape)
# print(data.head(2))
# Build the labelled data set and convert it to FFM format.
#
# Steps: (1) remove from the exposure table every row that also appears in
# the click table, (2) keep only exposure rows whose device_id is in
# click_device_id, (3) label clicks as 1 and remaining exposures as 0,
# (4) merge, sort by stat_date (newest first) and count the rows of the two
# hard-coded dates, (5) remap zero-valued hour/minute, (6) run the FFM
# transform and round-trip the result through a CSV file.

# Set difference: exposure rows minus click rows.
print("曝光表处理前的样本个数")
print(exposure.shape)
subset = click.columns.tolist()
# The click rows are concatenated TWICE on purpose: every click row then has
# at least one duplicate, so drop_duplicates(keep=False) removes all of them
# together with any exposure row that matches a click row on `subset`.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
exposure = pd.concat([exposure, click, click])
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数")
print(exposure.shape)
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
print(exposure.shape)

# Assign labels: clicks are positives, remaining exposures are negatives.
click["y"] = 1
exposure["y"] = 0
print("正样本个数")
print(click.shape[0])
print("负样本个数")
print(exposure.shape[0])

# Merge the click table and the exposure table, newest stat_date first.
data = pd.concat([click, exposure])
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))
# Row counts of the two hard-coded dates; stat_date itself is then dropped
# so it is not fed to the model as a feature.
test_number = data[data["stat_date"] == '2018-08-06'].shape[0]
validation_number = data[data["stat_date"] == '2018-08-05'].shape[0]
data = data.drop("stat_date", axis=1)

# Features whose value is 0 are dropped by the FFM format; after the
# remapping below no hour/minute feature has the value 0.
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))

print("start ffm transform")
start = time.time()
ffm_train = FFMFormatPandas()
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时:")
print(end - start)
# Persist the transformed rows, then reload them without a header row.
data.to_csv("/home/zhangyanzhao/data.csv", index=False)
data = pd.read_csv("/home/zhangyanzhao/data.csv", header=None)
print("数据集大小")
print(data.shape)
print(data.head(2))
# Split `data` positionally into test / validation / train and write each
# part to its own CSV file (no index column, no header row).
#
# .iloc with half-open ranges is used instead of .loc: .loc slices by label
# and INCLUDES the end label, so `data.loc[:test_number]` returned
# test_number + 1 rows and shifted every later boundary by one row.
# Assumes the rows are still ordered newest stat_date first, so the first
# `test_number` rows are the test date and the next `validation_number`
# rows the validation date -- TODO confirm the CSV round trip keeps order.
validation_end = test_number + validation_number

test = data.iloc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv("/home/zhangyanzhao/test.csv", index=False, header=None)

validation = data.iloc[test_number:validation_end]
print("验证集大小")
print(validation.shape[0])
validation.to_csv("/home/zhangyanzhao/validation.csv", index=False, header=None)

train = data.iloc[validation_end:]
print("训练集大小")
print(train.shape[0])
train.to_csv("/home/zhangyanzhao/train.csv", index=False, header=None)
# Commented-out test / validation / train split and CSV export.
# NOTE(review): .loc slicing is label-based and includes the end label, so
# with a default integer index `data.loc[:test_number]` would select
# test_number + 1 rows if this code is re-enabled.
# test = data.loc[:test_number]
# print("测试集大小")
# print(test.shape[0])
# test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
# validation = data.loc[(test_number+1):(test_number+validation_number)]
# print("验证集大小")
# print(validation.shape[0])
# validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
# train = data.loc[(test_number+validation_number+1):]
# print("训练集大小")
# print(train.shape[0])
# train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
# Train an xlearn FFM model, then write predictions to output.txt.
print("start training")
ffm_model = xl.create_ffm()
# NOTE(review): setTrain/setValidate are each called twice with different
# files (train.csv/validation.csv, then data.csv). Presumably the later call
# replaces the earlier one, so data.csv would be used for both training and
# validation -- confirm against the xlearn API and decide which pair is
# intended.
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
ffm_model.setTrain("/home/zhangyanzhao/data.csv")
ffm_model.setValidate("/home/zhangyanzhao/data.csv")
# Hyper-parameters passed to fit(): binary task, learning rate 0.03,
# lambda 0.002, AUC as the metric.
param = {'task':'binary', 'lr':0.03,
'lambda':0.002, 'metric':'auc'}
ffm_model.fit(param,'/home/zhangyanzhao/model.out')
# NOTE(review): setTest is also called twice (test.csv, then data.csv);
# presumably only the last path takes effect -- confirm.
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setTest("/home/zhangyanzhao/data.csv")
ffm_model.setSigmoid()
# NOTE(review): predict is invoked twice with identical arguments; the
# second call looks redundant and would rewrite the same output file.
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
ffm_model.predict("/home/zhangyanzhao/model.out",
"/home/zhangyanzhao/output.txt")
print("end")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment