refactor

1af713e4 · 张彦钊 · 0aa8923e · 1af713e4 · 1af713e4 · 1af713e4
Commit 1af713e4 authored Aug 07, 2018 by 张彦钊
7 changed files
--- a/config.py
+++ b/config.py
+
+# Base directory for the generated data, model and prediction files.
+# The scripts build paths with plain string concatenation
+# (e.g. DIRECTORY_PATH + "data.csv"), so the trailing slash is required.
+DIRECTORY_PATH = '/home/zhangyanzhao/'
+
+
+# processData.py
+# diaryTraining.py
--- a/test/diaryTestSet.py
+++ b/test/diaryTestSet.py
--- a/diaryTraining.py
+++ b/diaryTraining.py
-import pandas as pd
-from utils import FFMFormatPandas
-import xlearn as xl
-import time
-from prepareData import fetch_data

-# exposure, click, click_device_id = fetch_data()
-#
-# # 求曝光表和点击表的差集合
-# print("曝光表处理前的样本个数")
-# print(exposure.shape)
-# exposure = exposure.append(click)
-# exposure = exposure.append(click)
-# subset = click.columns.tolist()
-# exposure = exposure.drop_duplicates(subset=subset,keep=False)
-# print("差集后曝光表个数")
-# print(exposure.shape)
-# exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
-# print("去除未点击用户后曝光表个数")
-# print(exposure.shape)
-# # 打标签
-# click["y"] = 1
-# exposure["y"] = 0
-#
-# print("正样本个数")
-# print(click.shape[0])
-# print("负样本个数")
-# print(exposure.shape[0])
-#
-# # 合并点击表和曝光表
-# data = click.append(exposure)
-# data = data.sort_values(by="stat_date",ascending=False)
-# print("前两行数据")
-# print(data.head(2))
-# print("后两行数据")
-# print(data.tail(2))
-# test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
-# validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
-# data = data.drop("stat_date",axis=1)
-#
-# # 数值是0的特征会被ffm格式删除，经过下面的处理后，没有数值是0的特征
-# data.loc[data["hour"]==0,["hour"]] = 24
-# data.loc[data["minute"]==0,["minute"]] = 60
-# data["hour"] = data["hour"].astype("category")
-# data["minute"] = data["minute"].astype("category")
-# print(data.head(2))
-#
-#
-# print("start ffm transform")
-# start = time.time()
-# ffm_train = FFMFormatPandas()
-# data = ffm_train.fit_transform(data, y='y')
-# print("done transform ffm")
-# end = time.time()
-# print("ffm转化数据耗时：")
-# print(end-start)
-# data.to_csv("/home/zhangyanzhao/data.csv",index=False)
-# data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
-# print("数据集大小")
-# print(data.shape)
-# print(data.head(2))
+import xlearn as xl
+from config import *

-# test = data.loc[:test_number]
-# print("测试集大小")
-# print(test.shape[0])
-# test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
-# validation = data.loc[(test_number+1):(test_number+validation_number)]
-# print("验证集大小")
-# print(validation.shape[0])
-# validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
-# train = data.loc[(test_number+validation_number+1):]
-# print("训练集大小")
-# print(train.shape[0])
-# train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)

 print("start training")
+# Train an xlearn field-aware factorization machine on the file under DIRECTORY_PATH.
+# NOTE(review): training, validation and test below all read the same data.csv,
+# so the reported metric is measured on data the model was fitted on.
+# processData.py also writes train.csv / validation.csv / test.csv -- confirm
+# whether those splits were meant to be used here.
 ffm_model = xl.create_ffm()
-ffm_model.setTrain("/home/zhangyanzhao/data.csv")
-ffm_model.setValidate("/home/zhangyanzhao/data.csv")
+ffm_model.setTrain(DIRECTORY_PATH + "data.csv")
+ffm_model.setValidate(DIRECTORY_PATH + "data.csv")


+# Hyper-parameters handed to xlearn as-is: binary task, AUC as the metric.
 param = {'task':'binary', 'lr':0.03,
         'lambda':0.002, 'metric':'auc'}

-ffm_model.fit(param,'/home/zhangyanzhao/model.out')
+# Fit the model and store it as model.out.
+ffm_model.fit(param, DIRECTORY_PATH + "model.out")

-ffm_model.setTest("/home/zhangyanzhao/data.csv")
+ffm_model.setTest(DIRECTORY_PATH + "data.csv")
+# presumably maps raw scores into (0, 1) before prediction -- confirm against the xlearn docs
 ffm_model.setSigmoid()
-ffm_model.predict("/home/zhangyanzhao/model.out",
-                  "/home/zhangyanzhao/output.txt")
+# Load the stored model and write one prediction per test row to output.txt.
+ffm_model.predict(DIRECTORY_PATH + "model.out",
+                  DIRECTORY_PATH + "output.txt")

 print("end")
--- a/test/prepareTestData.py
+++ b/test/prepareTestData.py
--- a/test/testCases.py
+++ b/test/testCases.py
--- a/prepareData.py
+++ b/prepareData.py
@@ -2,7 +2,7 @@ from utils import con_sql
 import datetime


-def fetch_data(start_date='2018-08-03'):
+def fetch_data(start_date, end_date):

    # 获取点击表里的device_id
    sql = "select distinct device_id from data_feed_click"
@@ -10,7 +10,8 @@ def fetch_data(start_date='2018-08-03'):
    print("成功获取点击表里的device_id")

    # 获取点击表里的数据
-    sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= {0}".format(start_date)
+    sql = "select cid,device_id,time,stat_date from data_feed_click " \
+          "where stat_date >= {0} and stat_date <= {1}".format(start_date, end_date)
    click = con_sql(sql)
    click = click.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
    print("成功获取点击表里的数据")
@@ -22,7 +23,8 @@ def fetch_data(start_date='2018-08-03'):
    print(click.head(2))

    # 获取曝光表里的数据
-    sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= {0}".format(start_date)
+    sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
+          "where stat_date >= {0} and stat_date <= {1}".format(start_date, end_date)
    exposure = con_sql(sql)
    exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
    print("成功获取曝光表里的数据")

--- a/processData.py
+++ b/processData.py
+import time
+from prepareData import fetch_data
+from utils import FFMFormatPandas
+import pandas as pd
+from config import *
+
+# Build the FFM-format data set from the click/exposure tables and write it,
+# plus test / validation / train splits, under DIRECTORY_PATH.
+exposure, click, click_device_id = fetch_data(
+    start_date='2018-08-03', end_date='2018-08-06')
+
+# Set difference "exposure minus click": the click rows are added twice so each
+# of them is guaranteed to be a duplicate, then keep=False drops every
+# duplicated row (this also drops rows repeated inside exposure itself).
+print("曝光表处理前的样本个数")
+print(exposure.shape)
+# pd.concat is the equivalent of the two chained DataFrame.append calls;
+# DataFrame.append was removed in pandas 2.0.
+exposure = pd.concat([exposure, click, click])
+subset = click.columns.tolist()
+exposure = exposure.drop_duplicates(subset=subset, keep=False)
+print("差集后曝光表个数")
+print(exposure.shape)
+# Keep only exposures of devices that appear in click_device_id.
+exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
+print("去除未点击用户后曝光表个数")
+print(exposure.shape)
+# Label the samples: clicks are positives, remaining exposures are negatives.
+click["y"] = 1
+exposure["y"] = 0
+
+print("正样本个数")
+print(click.shape[0])
+print("负样本个数")
+print(exposure.shape[0])
+
+# Merge clicks and exposures, newest stat_date first, so that the test day
+# and the validation day end up at the top of the table.
+data = pd.concat([click, exposure])
+data = data.sort_values(by="stat_date", ascending=False)
+print("前两行数据")
+print(data.head(2))
+print("后两行数据")
+print(data.tail(2))
+# Row counts of the two most recent days; used below as split boundaries.
+test_number = data[data["stat_date"] == '2018-08-06'].shape[0]
+validation_number = data[data["stat_date"] == '2018-08-05'].shape[0]
+data = data.drop("stat_date", axis=1)
+
+# Features whose value is 0 are dropped by the FFM format; after the
+# remapping below no hour/minute feature has the value 0.
+data.loc[data["hour"] == 0, ["hour"]] = 24
+data.loc[data["minute"] == 0, ["minute"]] = 60
+data["hour"] = data["hour"].astype("category")
+data["minute"] = data["minute"].astype("category")
+print(data.head(2))
+
+
+print("Start ffm transform")
+start = time.time()
+ffm_train = FFMFormatPandas()
+data = ffm_train.fit_transform(data, y='y')
+print("done transform ffm")
+end = time.time()
+print("ffm转化数据耗时：")
+print(end - start)
+# The file is read back with header=None, so it is written without a header
+# row explicitly instead of relying on the version-dependent default.
+data.to_csv(DIRECTORY_PATH + "data.csv", index=False, header=False)
+data = pd.read_csv(DIRECTORY_PATH + "data.csv", header=None)
+print("数据集大小")
+print(data.shape)
+print(data.head(2))
+
+# Positional (iloc) slices: .loc slicing is label-based and includes the end
+# label, which put test_number + 1 rows into the test split and shifted every
+# later boundary by one row.
+validation_end = test_number + validation_number
+test = data.iloc[:test_number]
+print("测试集大小")
+print(test.shape[0])
+test.to_csv(DIRECTORY_PATH + "test.csv", index=False, header=None)
+validation = data.iloc[test_number:validation_end]
+print("验证集大小")
+print(validation.shape[0])
+validation.to_csv(DIRECTORY_PATH + "validation.csv", index=False, header=None)
+train = data.iloc[validation_end:]
+print("训练集大小")
+print(train.shape[0])
+train.to_csv(DIRECTORY_PATH + "train.csv", index=False, header=None)
\ No newline at end of file