Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
4f096b7d
Commit
4f096b7d
authored
Aug 07, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
按照日期划分测试集、验证集
parent
2b944151
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
38 deletions
+46
-38
diary-training.py
diary-training.py
+46
-38
No files found.
diary-training.py
View file @
4f096b7d
# Dependencies for the diary FFM training script.
# NOTE: `shuffle` appears unused in this revision (rows are kept in
# stat_date order for the date-based split) — retained for compatibility.
import datetime

import pymysql
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import xlearn as xl
# ---------------------------------------------------------------------------
# Click table: load rows since 2018-08-03 and derive time features.
# NOTE(review): reconstructed from a scraped diff (new side of hunk
# @@ -23,24 +22,41 @@). `con_sql` is defined earlier in the file (not shown
# here); it appears to return a positionally-indexed DataFrame — confirm.
# ---------------------------------------------------------------------------
print("成功获取点击表里的device_id")
# Fetch the click-table rows (stat_date included for the date-based split).
sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= '2018-08-03'"
click = con_sql(sql)
# Columns come back as integer positions; give them names.
click = click.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
print("成功获取点击表里的数据")
# Derive hour/minute from the `time` column (assumed epoch seconds — TODO
# confirm against the data_feed_click schema), then drop the raw timestamp.
click["hour"] = click["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
click["minute"] = click["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
click = click.drop("time", axis=1)
print("点击表数据预览")
print(click.head(2))
# ---------------------------------------------------------------------------
# Exposure table: same pipeline as the click table — load rows since
# 2018-08-03, name the columns, derive hour/minute, drop the raw timestamp.
# NOTE(review): reconstructed from a scraped diff (new side of the hunk).
# ---------------------------------------------------------------------------
sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= '2018-08-03'"
exposure = con_sql(sql)
# Columns come back as integer positions; give them names.
exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
print("成功获取曝光表里的数据")
# `time` assumed epoch seconds, matching the click table — TODO confirm.
exposure["hour"] = exposure["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
exposure["minute"] = exposure["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
exposure = exposure.drop("time", axis=1)
print("曝光表数据预览")
print(exposure.head(2))
# ---------------------------------------------------------------------------
# Set difference: keep only exposure rows that were NOT clicked, then label
# positives (click) and negatives (exposure-without-click).
# ---------------------------------------------------------------------------
print("曝光表处理前的样本个数")
print(exposure.shape)
# Append `click` twice so every clicked row occurs at least twice in the
# combined frame; drop_duplicates(keep=False) then removes ALL copies,
# including click rows that never appeared in `exposure`.
# NOTE(review): keep=False also removes rows duplicated within `exposure`
# itself — presumably acceptable here; confirm.
exposure = exposure.append(click)
exposure = exposure.append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("成功完成曝光表和点击表的差集合")
print("差集后曝光表个数")
print(exposure.shape)
# Keep only devices that have at least one recorded click
# (`click_device_id` is built earlier in the file, not shown here).
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
print(exposure.shape)
# Labels: clicked impressions are positives, unclicked exposures negatives.
click["y"] = 1
exposure["y"] = 0
# ---------------------------------------------------------------------------
# Merge positives and negatives, order by date, and record how many rows
# belong to the test day (2018-08-06) and the validation day (2018-08-05)
# so the head of the date-sorted frame can be split off later.
# NOTE(review): reconstructed from a scraped diff; the old post-merge
# hour/weekday extraction and the shuffle() call appear only on the deleted
# side of the hunk, so they are omitted here — confirm against the repo.
# ---------------------------------------------------------------------------
data = click.append(exposure)
print("done 合并点击表和曝光表")
# Newest stat_date first: the first `test_number` rows are the test day and
# the following `validation_number` rows are the validation day.
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))
test_number = data[data["stat_date"] == '2018-08-06'].shape[0]
validation_number = data[data["stat_date"] == '2018-08-05'].shape[0]
data = data.drop("stat_date", axis=1)
# Zero-valued features are dropped by the libffm text format, so remap
# 0 -> one-past-max (hour 0 -> 24, minute 0 -> 60) before the categorical
# cast; no feature value is 0 afterwards.
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))
print("start ffm transform")
# ---------------------------------------------------------------------------
# Training. A collapsed hunk (@@ -138,42 +155,33 @@) above this point
# converts `data` to libffm format on disk (not visible in this view).
# The date-based train/validation/test split is currently disabled (kept in
# the triple-quoted string) in favour of xLearn's built-in cross-validation.
# ---------------------------------------------------------------------------
print(data.shape)
print(data.head(2))
'''
test = data.loc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv("/home/zhangyanzhao/test.csv", index=False, header=None)
validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小")
print(validation.shape[0])
validation.to_csv("/home/zhangyanzhao/validation.csv", index=False, header=None)
train = data.loc[(test_number + validation_number + 1):]
print("训练集大小")
print(train.shape[0])
train.to_csv("/home/zhangyanzhao/train.csv", index=False, header=None)
'''
print("start training")
ffm_model = xl.create_ffm()
# NOTE(review): train.csv / validation.csv are only written by the block
# commented out above — they must already exist on disk from a previous run;
# confirm before running.
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
param = {'task': 'binary', 'lr': 0.05, 'lambda': 0.002, 'metric': 'auc', 'fold': 3}
# 3-fold cross-validation replaces the previous single fit/predict run.
ffm_model.cv(param)
'''
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setSigmoid()
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
'''
print("end")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment