Commit 1af713e4 authored by 张彦钊

refactor

parent 0aa8923e
# Base directory (trailing slash included) that the scripts prefix to the
# names of the data, model and prediction files they read and write.
DIRECTORY_PATH = '/home/zhangyanzhao/'
# processData.py
# diaryTraining.py
import pandas as pd
from utils import FFMFormatPandas
import xlearn as xl
import time
from prepareData import fetch_data
# exposure, click, click_device_id = fetch_data()
#
# # 求曝光表和点击表的差集合
# print("曝光表处理前的样本个数")
# print(exposure.shape)
# exposure = exposure.append(click)
# exposure = exposure.append(click)
# subset = click.columns.tolist()
# exposure = exposure.drop_duplicates(subset=subset,keep=False)
# print("差集后曝光表个数")
# print(exposure.shape)
# exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
# print("去除未点击用户后曝光表个数")
# print(exposure.shape)
# # 打标签
# click["y"] = 1
# exposure["y"] = 0
#
# print("正样本个数")
# print(click.shape[0])
# print("负样本个数")
# print(exposure.shape[0])
#
# # 合并点击表和曝光表
# data = click.append(exposure)
# data = data.sort_values(by="stat_date",ascending=False)
# print("前两行数据")
# print(data.head(2))
# print("后两行数据")
# print(data.tail(2))
# test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
# validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
# data = data.drop("stat_date",axis=1)
#
# # 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
# data.loc[data["hour"]==0,["hour"]] = 24
# data.loc[data["minute"]==0,["minute"]] = 60
# data["hour"] = data["hour"].astype("category")
# data["minute"] = data["minute"].astype("category")
# print(data.head(2))
#
#
# print("start ffm transform")
# start = time.time()
# ffm_train = FFMFormatPandas()
# data = ffm_train.fit_transform(data, y='y')
# print("done transform ffm")
# end = time.time()
# print("ffm转化数据耗时:")
# print(end-start)
# data.to_csv("/home/zhangyanzhao/data.csv",index=False)
# data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
# print("数据集大小")
# print(data.shape)
# print(data.head(2))
import xlearn as xl
from config import *
# test = data.loc[:test_number]
# print("测试集大小")
# print(test.shape[0])
# test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
# validation = data.loc[(test_number+1):(test_number+validation_number)]
# print("验证集大小")
# print(validation.shape[0])
# validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
# train = data.loc[(test_number+validation_number+1):]
# print("训练集大小")
# print(train.shape[0])
# train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
# Train an xLearn FFM model on the FFM-formatted data set, save the model and
# write predictions. Every path is built from DIRECTORY_PATH; the former
# hard-coded "/home/zhangyanzhao/..." calls pointed at the same files, so
# keeping both made the model train and predict twice.
print("start training")
ffm_model = xl.create_ffm()
# NOTE(review): training, validation and test all read the same data.csv, so
# the reported AUC is measured on rows the model was fitted on. processData
# writes train.csv / validation.csv / test.csv -- confirm whether those were
# meant to be used here.
ffm_model.setTrain(DIRECTORY_PATH + "data.csv")
ffm_model.setValidate(DIRECTORY_PATH + "data.csv")
param = {'task': 'binary', 'lr': 0.03,
         'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, DIRECTORY_PATH + "model.out")
ffm_model.setTest(DIRECTORY_PATH + "data.csv")
# Ask xLearn to pass prediction scores through a sigmoid before writing them.
ffm_model.setSigmoid()
ffm_model.predict(DIRECTORY_PATH + "model.out",
                  DIRECTORY_PATH + "output.txt")
print("end")
......@@ -2,7 +2,7 @@ from utils import con_sql
import datetime
def fetch_data(start_date='2018-08-03'):
def fetch_data(start_date, end_date):
# 获取点击表里的device_id
sql = "select distinct device_id from data_feed_click"
......@@ -10,7 +10,8 @@ def fetch_data(start_date='2018-08-03'):
print("成功获取点击表里的device_id")
# 获取点击表里的数据
# Select the click rows whose stat_date lies in [start_date, end_date].
# The dates are wrapped in single quotes: callers pass plain strings such as
# '2018-08-03', and an unquoted 2018-08-03 is parsed by SQL as the arithmetic
# expression 2018 - 8 - 3 instead of a date literal.
sql = "select cid,device_id,time,stat_date from data_feed_click " \
      "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
click = con_sql(sql)
click = click.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取点击表里的数据")
......@@ -22,7 +23,8 @@ def fetch_data(start_date='2018-08-03'):
print(click.head(2))
# 获取曝光表里的数据
# Select the exposure rows whose stat_date lies in [start_date, end_date].
# The dates are wrapped in single quotes: callers pass plain strings such as
# '2018-08-03', and an unquoted 2018-08-03 is parsed by SQL as the arithmetic
# expression 2018 - 8 - 3 instead of a date literal.
sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
      "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
exposure = con_sql(sql)
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取曝光表里的数据")
......
import time
from prepareData import fetch_data
from utils import FFMFormatPandas
import pandas as pd
from config import *
# Build the FFM data set: label clicks (y=1) against non-clicked exposures
# (y=0), convert the rows to FFM format and split them by date into
# test / validation / train files under DIRECTORY_PATH.
START_DATE = '2018-08-03'
TEST_DATE = '2018-08-06'        # newest day: becomes the test set
VALIDATION_DATE = '2018-08-05'  # day before: becomes the validation set

exposure, click, click_device_id = fetch_data(
    start_date=START_DATE, end_date=TEST_DATE)

# Set difference "exposure minus click": adding the click rows twice makes
# every click row occur at least twice, so keep=False removes all of them
# together with any exposure row equal to a click.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
print("曝光表处理前的样本个数")
print(exposure.shape)
exposure = pd.concat([exposure, click, click])
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数")
print(exposure.shape)

# Keep only exposures of devices that appear in the click table; copy so the
# label assignment below writes to an independent frame.
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)].copy()
print("去除未点击用户后曝光表个数")
print(exposure.shape)

# Label the samples: clicks are positives, remaining exposures are negatives.
click["y"] = 1
exposure["y"] = 0

print("正样本个数")
print(click.shape[0])
print("负样本个数")
print(exposure.shape[0])

# Merge clicks and exposures, newest date first, so that the test-day rows
# come first and the validation-day rows directly after them.
data = pd.concat([click, exposure])
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))

# Row counts of the two hold-out days, used for the positional split below.
test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
data = data.drop("stat_date", axis=1)

# The FFM format drops features whose value is 0; after this remapping no
# hour/minute feature is 0 any more.
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))

print("Start ffm transform")
start = time.time()
ffm_train = FFMFormatPandas()
# NOTE(review): the split below assumes fit_transform keeps the row order of
# its input -- confirm against utils.FFMFormatPandas.
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时:")
print(end - start)

# Write without a header row: the file is read back with header=None, so a
# header line would otherwise come back as a bogus first sample.
data.to_csv(DIRECTORY_PATH + "data.csv", index=False, header=False)
data = pd.read_csv(DIRECTORY_PATH + "data.csv", header=None)
print("数据集大小")
print(data.shape)
print(data.head(2))

# Split by position. The previous .loc slices were label-based and inclusive
# at both ends, which put test_number + 1 rows into the test set.
validation_end = test_number + validation_number

test = data.iloc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test.csv", index=False, header=False)

validation = data.iloc[test_number:validation_end]
print("验证集大小")
print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation.csv", index=False, header=False)

train = data.iloc[validation_end:]
print("训练集大小")
print(train.shape[0])
train.to_csv(DIRECTORY_PATH + "train.csv", index=False, header=False)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment