Commit 7bdb5d50 authored by 张彦钊

change start_date

parent ea80570f
@@ -2,10 +2,10 @@
DIRECTORY_PATH = '/data2/models/'
# The test date must be later than the validation date, because that is how the dataset-splitting code is set up
-VALIDATION_DATE = '2018-08-05'
-TEST_DATE = '2018-08-06'
-DATA_START_DATE = '2018-07-05'
-DATA_END_DATE = '2018-08-06'
+# VALIDATION_DATE = '2018-08-05'
+# TEST_DATE = '2018-08-06'
+# DATA_START_DATE = '2018-07-05'
+# DATA_END_DATE = '2018-08-06'
MODEL_VERSION = ''
......
import xlearn as xl
from config import *
@@ -7,21 +5,16 @@ from config import *
def train():
print("Start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE))
ffm_model.setValidate(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE))
ffm_model.setTrain(DIRECTORY_PATH + "train_ffm_data.csv")
ffm_model.setValidate(DIRECTORY_PATH + "validation_ffm_data.csv")
# Log save path; without this parameter, logs are written under /temp by default, which does not meet our conventions
param = {'task': 'binary', 'lr': lr, 'lambda': l2_lambda, 'metric': 'auc',"log":"/data2/models/result"}
ffm_model.fit(param, DIRECTORY_PATH + "model_lr{}_lambda{}.out".format(lr, l2_lambda))
print("predicting")
ffm_model.setTest(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE))
ffm_model.setTest(DIRECTORY_PATH + "test_ffm_data.csv")
ffm_model.setSigmoid()
ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE,
DATA_END_DATE, lr, l2_lambda),
DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(TEST_DATE,
DATA_START_DATE,
DATA_END_DATE, lr,
l2_lambda))
ffm_model.predict(DIRECTORY_PATH + "model.out",DIRECTORY_PATH + "test_set_predict_output.txt")
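For orientation, the simplified train-and-predict flow now reads roughly as below. A minimal sketch, assuming lr and l2_lambda are defined in config.py (they are referenced but not shown in this diff); note the diff still has fit() writing model_lr{}_lambda{}.out while predict() loads model.out, so the sketch uses a single path for both.

import xlearn as xl
from config import DIRECTORY_PATH, lr, l2_lambda  # lr/l2_lambda assumed to live in config.py

def train():
    ffm_model = xl.create_ffm()
    ffm_model.setTrain(DIRECTORY_PATH + "train_ffm_data.csv")
    ffm_model.setValidate(DIRECTORY_PATH + "validation_ffm_data.csv")
    # Keep xlearn logs under /data2/models/result instead of the default temp dir
    param = {'task': 'binary', 'lr': lr, 'lambda': l2_lambda,
             'metric': 'auc', 'log': '/data2/models/result'}
    ffm_model.fit(param, DIRECTORY_PATH + "model.out")
    ffm_model.setTest(DIRECTORY_PATH + "test_ffm_data.csv")
    ffm_model.setSigmoid()  # squash raw scores into [0, 1]
    ffm_model.predict(DIRECTORY_PATH + "model.out",
                      DIRECTORY_PATH + "test_set_predict_output.txt")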
@@ -32,15 +32,13 @@ def feature_en(user_profile):
# Load ffm.pkl and convert the table above into FFM format
def transform_ffm_format(df, device_id):
-file_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
-with open(file_path, "rb") as f:
+with open(DIRECTORY_PATH+"ffm.pkl","rb") as f:
ffm_format_pandas = pickle.load(f)
data = ffm_format_pandas.transform(df)
now = datetime.now().strftime("%Y-%m-%d-%H-%M")
print("ffm格式转化结束")
predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop3000.csv".format(device_id, now)
data.to_csv(predict_file_name, index=False,header=None)
print("ffm写到服务器")
print("成功将ffm预测文件写到服务器")
return predict_file_name
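The fixed ffm.pkl name makes the training/serving handoff a simple pickle round trip: training fits the encoder once and dumps it, and serving reloads it so new rows are encoded with the exact field and feature indices learned at training time. A minimal sketch with a toy DataFrame (multiFFMFormatPandas is the class extended further down in this diff):

import pickle
import pandas as pd

# Offline: fit the FFM encoder on training data and pickle it.
train_df = pd.DataFrame({"city": ["beijing", "shanghai"], "y": [1, 0]})
encoder = multiFFMFormatPandas()
encoder.fit_transform(train_df, y='y', n=50000, processes=1)
with open(DIRECTORY_PATH + "ffm.pkl", "wb") as f:
    pickle.dump(encoder, f)

# Online: reload the same encoder and transform fresh rows.
with open(DIRECTORY_PATH + "ffm.pkl", "rb") as f:
    encoder = pickle.load(f)
predict_df = pd.DataFrame({"city": ["beijing"], "y": [0]})
ffm_rows = encoder.transform(predict_df)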
......
from utils import con_sql
import datetime
......
import time
from prepareData import fetch_data
from utils import *
@@ -8,9 +6,9 @@ from config import *
import pickle
-def feature_en():
-exposure, click, click_device_id = fetch_data(start_date=DATA_START_DATE, end_date=DATA_END_DATE)
+def feature_en(data_start_date, data_end_date, validation_date, test_date):
+exposure, click, click_device_id = fetch_data(data_start_date, data_end_date)
# Compute the set difference between the exposure table and the click table
print("Number of samples in the exposure table before processing")
@@ -43,8 +41,8 @@ def feature_en():
print(data.head(2))
print("后两行数据")
print(data.tail(2))
test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
test_number = data[data["stat_date"] == test_date].shape[0]
validation_number = data[data["stat_date"] == validation_date].shape[0]
data = data.drop("stat_date", axis=1)
# Zero-valued features are dropped by the FFM format; after the processing below, no feature has a value of 0
@@ -78,16 +76,16 @@ def ffm_transform(data, test_number, validation_number):
start = time.time()
ffm_train = multiFFMFormatPandas()
data = ffm_train.fit_transform(data, y='y',n=50000,processes=5)
with open(DIRECTORY_PATH+"ffm_{0}_{1}.pkl".format(DATA_START_DATE,DATA_END_DATE), "wb") as f:
with open(DIRECTORY_PATH+"ffm.pkl", "wb") as f:
pickle.dump(ffm_train, f)
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时():")
print(end - start)
print("ffm转化数据耗时():")
print((end - start)/60)
data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None)
data.to_csv(DIRECTORY_PATH + "total_ffm_data.csv", index=False)
data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
print("数据集大小")
print(data.shape)
print(data.head(2))
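One subtlety in the save/reload above: to_csv(index=False) still writes a header row by default, and reading the file back with header=None turns that header into a data row, so the reloaded frame is one row longer than the frame that was written. A tiny self-contained demonstration:

import io
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
csv_text = df.to_csv(index=False)                    # header line "a" is included
back = pd.read_csv(io.StringIO(csv_text), header=None)
print(df.shape, back.shape)  # (2, 1) (3, 1): the header became row 0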
@@ -95,17 +93,17 @@ def ffm_transform(data, test_number, validation_number):
test = data.loc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
test.to_csv(DIRECTORY_PATH + "test_ffm_data.csv", index=False, header=None)
# Note: the test-set date must be later than the validation-set date, otherwise the data split may go wrong
validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小")
print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
validation.to_csv(DIRECTORY_PATH + "validation_ffm_data.csv", index=False, header=None)
train = data.loc[(test_number + validation_number + 1):]
print("训练集大小")
print(train.shape[0])
# TODO: the validation date is not the same as the end date of the training data
train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
train.to_csv(DIRECTORY_PATH + "train_ffm_data.csv", index=False, header=None)
......
from processData import *
from diaryTraining import *
from diaryCandidateSet import get_eachCityDiaryTop3000
from datetime import datetime
from datetime import timedelta
from utils import get_date
# Chain the data-fetching, feature-transformation, and model-training stages together
if __name__ == "__main__":
-data, test_number, validation_number = feature_en()
# while True:
# now = datetime.now()
# if (now.hour == 23) and (now.minute == 30):
+start = time.time()
+data_start_date, data_end_date, validation_date, test_date = get_date()
+data, test_number, validation_number = feature_en(data_start_date, data_end_date, validation_date, test_date)
ffm_transform(data, test_number, validation_number)
train()
+end = time.time()
+print("Model training took {} minutes".format((end - start)/60))
print('---------------prepare candidates--------------')
get_eachCityDiaryTop3000()
print("end")
# encoding = "utf-8"
from datetime import datetime
from datetime import timedelta
import pymysql
import numpy as np
import redis
@@ -9,6 +10,21 @@ from sklearn.metrics import auc
from multiprocessing import Pool
def get_date():
now = datetime.now()
year = now.year
month = now.month
day = now.day
date = datetime(year,month,day)
data_start_date = (date - timedelta(days=38)).strftime("%Y-%m-%d")
data_end_date = (date - timedelta(days=2)).strftime("%Y-%m-%d")
validation_date = (date - timedelta(days=3)).strftime("%Y-%m-%d")
test_date = data_end_date
print("data_start_date,data_end_date,validation_date,test_date:")
print(data_start_date,data_end_date,validation_date,test_date)
return data_start_date,data_end_date,validation_date,test_date
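Concretely, get_date() anchors everything to the run date: a 36-day data window ending two days back, with the last day doubling as the test date and the day before it as the validation date. For a hypothetical run on 2018-08-08, the end/validation/test dates match the old hard-coded config, while the start date moves back from 2018-07-05 to 2018-07-01 (the window widens from 32 to 36 days):

from datetime import datetime, timedelta

date = datetime(2018, 8, 8)  # hypothetical run date
print((date - timedelta(days=38)).strftime("%Y-%m-%d"))  # 2018-07-01  data_start_date
print((date - timedelta(days=2)).strftime("%Y-%m-%d"))   # 2018-08-06  data_end_date == test_date
print((date - timedelta(days=3)).strftime("%Y-%m-%d"))   # 2018-08-05  validation_date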
def get_roc_curve(y, pred, pos_label):
"""
Compute the ROC curve and AUC for a binary classification problem
@@ -99,7 +115,7 @@ class multiFFMFormatPandas:
# t = df.dtypes.to_dict()
# return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
-def transform(self, df,n=10000,processes=5):
+def transform(self, df,n=10000,processes=1):
# n is the maximum number of rows each worker handles; processes is the number of worker processes
t = df.dtypes.to_dict()
data_list = self.data_split_line(df,n)
@@ -166,65 +182,65 @@ class multiFFMFormatPandas:
return False
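Most of multiFFMFormatPandas is elided in this diff, but the visible pieces (the Pool import, data_split_line, and the n/processes parameters) suggest a chunk-and-pool pattern: split the frame into blocks of at most n rows, encode blocks in parallel worker processes, and reassemble them in order. A hedged sketch of that pattern, not the actual implementation:

from multiprocessing import Pool
import pandas as pd

def chunked_parallel_transform(df, encode_chunk, n=10000, processes=1):
    # encode_chunk must be a picklable top-level function so worker
    # processes can receive it.
    chunks = [df.iloc[i:i + n] for i in range(0, len(df), n)]
    with Pool(processes) as pool:
        parts = pool.map(encode_chunk, chunks)
    return pd.concat(parts)  # original row order is preserved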
# FFM format conversion functions and classes
class FFMFormatPandas:
def __init__(self):
self.field_index_ = None
self.feature_index_ = None
self.y = None
def fit(self, df, y=None):
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
if self.feature_index_ is not None:
last_idx = max(list(self.feature_index_.values()))
if self.feature_index_ is None:
self.feature_index_ = dict()
last_idx = 0
for col in df.columns:
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
self.feature_index_[col] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None):
self.fit(df, y)
return self.transform(df)
def transform_row_(self, row, t):
ffm = []
if self.y is not None:
ffm.append(str(row.loc[row.index == self.y][0]))
if self.y is None:
ffm.append(str(0))
for col, val in row.loc[row.index != self.y].to_dict().items():
col_type = t[col]
name = '{}_{}'.format(col, val)
if col_type.kind == 'O':
ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
elif col_type.kind == 'i':
ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
return ' '.join(ffm)
def transform(self, df):
t = df.dtypes.to_dict()
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
# The method below is not part of the original class; it was added to check whether a given user exists in the training dataset
def is_feature_index_exist(self, name):
if name in self.feature_index_:
return True
else:
return False
# class FFMFormatPandas:
# def __init__(self):
# self.field_index_ = None
# self.feature_index_ = None
# self.y = None
#
# def fit(self, df, y=None):
# self.y = y
# df_ffm = df[df.columns.difference([self.y])]
# if self.field_index_ is None:
# self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
#
# if self.feature_index_ is not None:
# last_idx = max(list(self.feature_index_.values()))
#
# if self.feature_index_ is None:
# self.feature_index_ = dict()
# last_idx = 0
#
# for col in df.columns:
# vals = df[col].unique()
# for val in vals:
# if pd.isnull(val):
# continue
# name = '{}_{}'.format(col, val)
# if name not in self.feature_index_:
# self.feature_index_[name] = last_idx
# last_idx += 1
# self.feature_index_[col] = last_idx
# last_idx += 1
# return self
#
# def fit_transform(self, df, y=None):
# self.fit(df, y)
# return self.transform(df)
#
# def transform_row_(self, row, t):
# ffm = []
# if self.y is not None:
# ffm.append(str(row.loc[row.index == self.y][0]))
# if self.y is None:
# ffm.append(str(0))
#
# for col, val in row.loc[row.index != self.y].to_dict().items():
# col_type = t[col]
# name = '{}_{}'.format(col, val)
# if col_type.kind == 'O':
# ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
# elif col_type.kind == 'i':
# ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
# return ' '.join(ffm)
#
# def transform(self, df):
# t = df.dtypes.to_dict()
# return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
#
# # The method below is not part of the original class; it was added to check whether a given user exists in the training dataset
# def is_feature_index_exist(self, name):
# if name in self.feature_index_:
# return True
# else:
# return False
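For reference, both classes emit the standard libffm text format, one 'label field:feature:value' line per row: categorical (object-dtype) columns get a constant value of 1, while integer columns keep their raw value. Tracing the single-process FFMFormatPandas above on a toy frame (with a pandas version contemporary to this code):

import pandas as pd

df = pd.DataFrame({"city": ["beijing", "shanghai"], "age": [20, 30], "y": [1, 0]})
encoder = FFMFormatPandas()
print(encoder.fit_transform(df, y='y').tolist())
# With this fit order: ['1 1:0:1 0:5:20', '0 1:1:1 0:5:30']
# (field indices come from the sorted non-label columns; feature indices
#  from first-seen order during fit)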