Commit 4f096b7d authored by 张彦钊

Split the test set and validation set by date

parent 2b944151
import datetime
import pymysql
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import xlearn as xl
@@ -23,24 +22,41 @@ click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id")
# Fetch data from the click table
sql = "select cid,device_id,time from data_feed_click where stat_date >= '2018-07-25'"
sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= '2018-08-03'"
click = con_sql(sql)
click = click.rename(columns={0:"cid",1:"device_id",2:"time"})
click = click.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取点击表里的数据")
# 从time特征中抽取hour
click["hour"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
click["minute"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
click = click.drop("time",axis=1)
print("点击表数据预览")
print(click.head(2))
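# The apply() calls above convert the Unix timestamp row by row; a vectorized
# equivalent, assuming `time` is in seconds, would be the following (note that
# pd.to_datetime works in UTC while datetime.fromtimestamp uses the local timezone,
# so the extracted hour can differ between the two approaches):
# ts = pd.to_datetime(click["time"], unit="s")
# click["hour"], click["minute"] = ts.dt.hour, ts.dt.minute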
# Fetch data from the exposure table
sql = "select cid,device_id,time from data_feed_exposure where stat_date >= '2018-07-25'"
sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= '2018-08-03'"
exposure = con_sql(sql)
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time"})
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取曝光表里的数据")
# 从time特征中抽取hour
exposure["hour"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
exposure["minute"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
exposure = exposure.drop("time",axis=1)
print("曝光表数据预览")
print(exposure.head(2))
# Take the set difference between the exposure table and the click table
print("Number of exposure samples before processing")
print(exposure.shape)
exposure = exposure.append(click)
exposure = exposure.append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset,keep=False)
print("成功完成曝光表和点击表的差集合")
print("差集后曝光表个数")
print(exposure.shape)
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
print(exposure.shape)
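# The block above computes "exposures that were never clicked": after the click rows
# are appended, drop_duplicates(keep=False) removes every row that occurs more than
# once, i.e. the click rows themselves and any exposure row that matches a click.
# Appending `click` twice makes sure click rows are duplicated even when they have no
# matching exposure. A self-contained toy illustration of the same trick (hypothetical
# helper, never called by this script):
def _demo_set_difference():
    a = pd.DataFrame({"cid": [1, 2, 3], "device_id": ["d1", "d2", "d3"]})
    b = pd.DataFrame({"cid": [2], "device_id": ["d2"]})
    # rows of `a` that do not appear in `b` -> cid 1 and 3
    return a.append(b).append(b).drop_duplicates(keep=False)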
# Assign labels
click["y"] = 1
exposure["y"] = 0
@@ -52,21 +68,22 @@ print(exposure.shape[0])
# Merge the click table and the exposure table
data = click.append(exposure)
print("done merging the click table and the exposure table")
data = data.sort_values(by="stat_date",ascending=False)
print("前两行数据")
print(data.head(2))
# Extract hour and weekday from the time feature
data["hour"] = data["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
data["weekday"] = data["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).weekday())
print("后两行数据")
print(data.tail(2))
test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
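# Row counts per day drive the positional split further below: '2018-08-06' becomes
# the test set and '2018-08-05' the validation set. An optional sanity check, run
# before stat_date is dropped on the next line, could look like this:
# day_counts = data["stat_date"].value_counts().sort_index()
# print(day_counts)
# assert day_counts.get('2018-08-06', 0) == test_number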
data = data.drop("stat_date",axis=1)
# Features whose value is 0 are dropped by the ffm format; after the processing below, no feature value is 0
data.loc[data["hour"]==0,["hour"]] = 24
data.loc[data["weekday"]==0,["weekday"]] = 7
data.loc[data["minute"]==0,["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["weekday"] = data["weekday"].astype("category")
data = data.drop("time",axis=1)
print("成功从time特征中抽取hour、weekday")
data["minute"] = data["minute"].astype("category")
print(data.head(2))
data = shuffle(data)
print("start ffm transform")
@@ -138,42 +155,33 @@ print("Dataset size")
print(data.shape)
print(data.head(2))
'''
n = np.rint(data.shape[0]/8)
m = np.rint(data.shape[0]*(3/8))
# 1/8 of the data is used as the test set
test = data.loc[:n]
test = data.loc[:test_number]
print("测试集大小")
print(test.shape)
print(test.shape[0])
test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
# 1/4 of the data is used as the validation set
validation = data.loc[n+1:m]
validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
validation = data.loc[(test_number+1):(test_number+validation_number)]
print("验证集大小")
print(validation.shape)
# The remaining data is used as the training set
train = data.loc[m+1:]
print(validation.shape[0])
validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
train = data.loc[(test_number+validation_number+1):]
print("训练集大小")
print(train.shape)
print(train.shape[0])
train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
'''
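# The positional split above relies on the transformed rows still being ordered by
# stat_date (latest first), so that the first test_number rows are '2018-08-06' and
# the next validation_number rows are '2018-08-05'. If the stat_date column were kept
# next to the transformed features, an order-independent alternative would be boolean
# masks (a sketch under that assumption; `data_with_date` is hypothetical):
# test = data_with_date[data_with_date["stat_date"] == "2018-08-06"]
# validation = data_with_date[data_with_date["stat_date"] == "2018-08-05"]
# train = data_with_date[~data_with_date["stat_date"].isin(["2018-08-06", "2018-08-05"])]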
print("start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain("/home/zhangyanzhao/data.csv")
# ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
param = {'task':'binary', 'lr':0.03,
'lambda':0.002, 'metric':'auc'}
param = {'task':'binary', 'lr':0.05,
'lambda':0.002, 'metric':'auc', 'fold':3}
ffm_model.fit(param,'/home/zhangyanzhao/model.out')
ffm_model.cv(param)
'''
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setSigmoid()
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
'''
print("end")