Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
7bdb5d50
Commit
7bdb5d50
authored
Aug 15, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
change start_date
parent
ea80570f
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
116 additions
and
104 deletions
+116
-104
config.py
config.py
+4
-4
diaryTraining.py
diaryTraining.py
+4
-11
predictDiary.py
predictDiary.py
+2
-4
prepareData.py
prepareData.py
+0
-2
processData.py
processData.py
+13
-15
train.py
train.py
+13
-4
utils.py
utils.py
+80
-64
No files found.
config.py
View file @
7bdb5d50
...
...
@@ -2,10 +2,10 @@
DIRECTORY_PATH
=
'/data2/models/'
# 测试日期一定要大于验证日期,因为切割数据集的代码是这样设置的
VALIDATION_DATE
=
'2018-08-05'
TEST_DATE
=
'2018-08-06'
DATA_START_DATE
=
'2018-07-05'
DATA_END_DATE
=
'2018-08-06'
#
VALIDATION_DATE = '2018-08-05'
#
TEST_DATE = '2018-08-06'
#
DATA_START_DATE = '2018-07-05'
#
DATA_END_DATE = '2018-08-06'
MODEL_VERSION
=
''
...
...
diaryTraining.py
View file @
7bdb5d50
import
xlearn
as
xl
from
config
import
*
...
...
@@ -7,21 +5,16 @@ from config import *
def
train
():
print
(
"Start training"
)
ffm_model
=
xl
.
create_ffm
()
ffm_model
.
setTrain
(
DIRECTORY_PATH
+
"train
{0}-{1}.csv"
.
format
(
DATA_START_DATE
,
VALIDATION_DATE
)
)
ffm_model
.
setValidate
(
DIRECTORY_PATH
+
"validation
{0}.csv"
.
format
(
VALIDATION_DATE
)
)
ffm_model
.
setTrain
(
DIRECTORY_PATH
+
"train
_ffm_data.csv"
)
ffm_model
.
setValidate
(
DIRECTORY_PATH
+
"validation
_ffm_data.csv"
)
# log保存路径,如果不加这个参数,日志默认保存在/temp路径下,不符合规范
param
=
{
'task'
:
'binary'
,
'lr'
:
lr
,
'lambda'
:
l2_lambda
,
'metric'
:
'auc'
,
"log"
:
"/data2/models/result"
}
ffm_model
.
fit
(
param
,
DIRECTORY_PATH
+
"model_lr{}_lambda{}.out"
.
format
(
lr
,
l2_lambda
))
print
(
"predicting"
)
ffm_model
.
setTest
(
DIRECTORY_PATH
+
"test
{0}.csv"
.
format
(
TEST_DATE
)
)
ffm_model
.
setTest
(
DIRECTORY_PATH
+
"test
_ffm_data.csv"
)
ffm_model
.
setSigmoid
()
ffm_model
.
predict
(
DIRECTORY_PATH
+
"model_{0}-{1}_lr{2}_lambda{3}.out"
.
format
(
DATA_START_DATE
,
DATA_END_DATE
,
lr
,
l2_lambda
),
DIRECTORY_PATH
+
"testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt"
.
format
(
TEST_DATE
,
DATA_START_DATE
,
DATA_END_DATE
,
lr
,
l2_lambda
))
ffm_model
.
predict
(
DIRECTORY_PATH
+
"model.out"
,
DIRECTORY_PATH
+
"test_set_predict_output.txt"
)
predictDiary.py
View file @
7bdb5d50
...
...
@@ -32,15 +32,13 @@ def feature_en(user_profile):
# 把ffm.pkl load进来,将上面的表转化为ffm格式
def
transform_ffm_format
(
df
,
device_id
):
file_path
=
DIRECTORY_PATH
+
"ffm_{0}_{1}.pkl"
.
format
(
DATA_START_DATE
,
DATA_END_DATE
)
with
open
(
file_path
,
"rb"
)
as
f
:
with
open
(
DIRECTORY_PATH
+
"ffm.pkl"
,
"rb"
)
as
f
:
ffm_format_pandas
=
pickle
.
load
(
f
)
data
=
ffm_format_pandas
.
transform
(
df
)
now
=
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d-
%
H-
%
M"
)
print
(
"ffm格式转化结束"
)
predict_file_name
=
DIRECTORY_PATH
+
"result/{0}_{1}DiaryTop3000.csv"
.
format
(
device_id
,
now
)
data
.
to_csv
(
predict_file_name
,
index
=
False
,
header
=
None
)
print
(
"
ffm
写到服务器"
)
print
(
"
成功将ffm预测文件
写到服务器"
)
return
predict_file_name
...
...
prepareData.py
View file @
7bdb5d50
from
utils
import
con_sql
import
datetime
...
...
processData.py
View file @
7bdb5d50
import
time
from
prepareData
import
fetch_data
from
utils
import
*
...
...
@@ -8,9 +6,9 @@ from config import *
import
pickle
def
feature_en
():
exposure
,
click
,
click_device_id
=
fetch_data
(
start_date
=
DATA_START_DATE
,
end_date
=
DATA_END_DATE
)
def
feature_en
(
data_start_date
,
data_end_date
,
validation_date
,
test_date
):
exposure
,
click
,
click_device_id
=
fetch_data
(
data_start_date
,
data_end_date
)
# 求曝光表和点击表的差集合
print
(
"曝光表处理前的样本个数"
)
...
...
@@ -43,8 +41,8 @@ def feature_en():
print
(
data
.
head
(
2
))
print
(
"后两行数据"
)
print
(
data
.
tail
(
2
))
test_number
=
data
[
data
[
"stat_date"
]
==
TEST_DATE
]
.
shape
[
0
]
validation_number
=
data
[
data
[
"stat_date"
]
==
VALIDATION_DATE
]
.
shape
[
0
]
test_number
=
data
[
data
[
"stat_date"
]
==
test_date
]
.
shape
[
0
]
validation_number
=
data
[
data
[
"stat_date"
]
==
validation_date
]
.
shape
[
0
]
data
=
data
.
drop
(
"stat_date"
,
axis
=
1
)
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
...
...
@@ -78,16 +76,16 @@ def ffm_transform(data, test_number, validation_number):
start
=
time
.
time
()
ffm_train
=
multiFFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
50000
,
processes
=
5
)
with
open
(
DIRECTORY_PATH
+
"ffm
_{0}_{1}.pkl"
.
format
(
DATA_START_DATE
,
DATA_END_DATE
)
,
"wb"
)
as
f
:
with
open
(
DIRECTORY_PATH
+
"ffm
.pkl"
,
"wb"
)
as
f
:
pickle
.
dump
(
ffm_train
,
f
)
print
(
"done transform ffm"
)
end
=
time
.
time
()
print
(
"ffm转化数据耗时(
秒
):"
)
print
(
end
-
start
)
print
(
"ffm转化数据耗时(
分
):"
)
print
(
(
end
-
start
)
/
60
)
data
.
to_csv
(
DIRECTORY_PATH
+
"
data{0}-{1}.csv"
.
format
(
DATA_START_DATE
,
DATA_END_DATE
)
,
index
=
False
)
data
=
pd
.
read_csv
(
DIRECTORY_PATH
+
"
data{0}-{1}.csv"
.
format
(
DATA_START_DATE
,
DATA_END_DATE
)
,
header
=
None
)
data
.
to_csv
(
DIRECTORY_PATH
+
"
total_ffm_data.csv"
,
index
=
False
)
data
=
pd
.
read_csv
(
DIRECTORY_PATH
+
"
total_ffm_data.csv"
,
header
=
None
)
print
(
"数据集大小"
)
print
(
data
.
shape
)
print
(
data
.
head
(
2
))
...
...
@@ -95,17 +93,17 @@ def ffm_transform(data, test_number, validation_number):
test
=
data
.
loc
[:
test_number
]
print
(
"测试集大小"
)
print
(
test
.
shape
[
0
])
test
.
to_csv
(
DIRECTORY_PATH
+
"test
{0}.csv"
.
format
(
TEST_DATE
)
,
index
=
False
,
header
=
None
)
test
.
to_csv
(
DIRECTORY_PATH
+
"test
_ffm_data.csv"
,
index
=
False
,
header
=
None
)
# 注意:测试集的日期一定要大于验证集,否则数据切割可能会出现错误
validation
=
data
.
loc
[(
test_number
+
1
):(
test_number
+
validation_number
)]
print
(
"验证集大小"
)
print
(
validation
.
shape
[
0
])
validation
.
to_csv
(
DIRECTORY_PATH
+
"validation
{0}.csv"
.
format
(
VALIDATION_DATE
)
,
index
=
False
,
header
=
None
)
validation
.
to_csv
(
DIRECTORY_PATH
+
"validation
_ffm_data.csv"
,
index
=
False
,
header
=
None
)
train
=
data
.
loc
[(
test_number
+
validation_number
+
1
):]
print
(
"训练集大小"
)
print
(
train
.
shape
[
0
])
# TODO validation date is not the end of train date
train
.
to_csv
(
DIRECTORY_PATH
+
"train
{0}-{1}.csv"
.
format
(
DATA_START_DATE
,
VALIDATION_DATE
)
,
index
=
False
,
header
=
None
)
train
.
to_csv
(
DIRECTORY_PATH
+
"train
_ffm_data.csv"
,
index
=
False
,
header
=
None
)
...
...
train.py
View file @
7bdb5d50
from
processData
import
*
from
diaryTraining
import
*
from
diaryCandidateSet
import
get_eachCityDiaryTop3000
from
datetime
import
datetim
e
from
datetime
import
timedelta
from
utils
import
get_dat
e
# 把数据获取、特征转换、模型训练的模型串联在一起
if
__name__
==
"__main__"
:
data
,
test_number
,
validation_number
=
feature_en
()
# while True:
# now = datetime.now()
# if (now.hour == 23) and (now.minute == 30):
start
=
time
.
time
()
data_start_date
,
data_end_date
,
validation_date
,
test_date
=
get_date
()
data
,
test_number
,
validation_number
=
feature_en
(
data_start_date
,
data_end_date
,
validation_date
,
test_date
)
ffm_transform
(
data
,
test_number
,
validation_number
)
train
()
end
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end
-
start
)
/
60
))
print
(
'---------------prepare candidates--------------'
)
get_eachCityDiaryTop3000
()
print
(
"end"
)
utils.py
View file @
7bdb5d50
# encoding = "utf-8"
from
datetime
import
datetime
from
datetime
import
timedelta
import
pymysql
import
numpy
as
np
import
redis
...
...
@@ -9,6 +10,21 @@ from sklearn.metrics import auc
from
multiprocessing
import
Pool
def
get_date
():
now
=
datetime
.
now
()
year
=
now
.
year
month
=
now
.
month
day
=
now
.
day
date
=
datetime
(
year
,
month
,
day
)
data_start_date
=
(
date
-
timedelta
(
days
=
38
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
data_end_date
=
(
date
-
timedelta
(
days
=
2
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
validation_date
=
(
date
-
timedelta
(
days
=
3
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
test_date
=
data_end_date
print
(
"data_start_date,data_end_date,validation_date,test_date:"
)
print
(
data_start_date
,
data_end_date
,
validation_date
,
test_date
)
return
data_start_date
,
data_end_date
,
validation_date
,
test_date
def
get_roc_curve
(
y
,
pred
,
pos_label
):
"""
计算二分类问题的roc和auc
...
...
@@ -99,7 +115,7 @@ class multiFFMFormatPandas:
# t = df.dtypes.to_dict()
# return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
def
transform
(
self
,
df
,
n
=
10000
,
processes
=
5
):
def
transform
(
self
,
df
,
n
=
10000
,
processes
=
1
):
# n是每个线程运行最大的数据条数,processes是线程数
t
=
df
.
dtypes
.
to_dict
()
data_list
=
self
.
data_split_line
(
df
,
n
)
...
...
@@ -166,65 +182,65 @@ class multiFFMFormatPandas:
return
False
# ffm 格式转换函数、类
class
FFMFormatPandas
:
def
__init__
(
self
):
self
.
field_index_
=
None
self
.
feature_index_
=
None
self
.
y
=
None
def
fit
(
self
,
df
,
y
=
None
):
self
.
y
=
y
df_ffm
=
df
[
df
.
columns
.
difference
([
self
.
y
])]
if
self
.
field_index_
is
None
:
self
.
field_index_
=
{
col
:
i
for
i
,
col
in
enumerate
(
df_ffm
)}
if
self
.
feature_index_
is
not
None
:
last_idx
=
max
(
list
(
self
.
feature_index_
.
values
()))
if
self
.
feature_index_
is
None
:
self
.
feature_index_
=
dict
()
last_idx
=
0
for
col
in
df
.
columns
:
vals
=
df
[
col
]
.
unique
()
for
val
in
vals
:
if
pd
.
isnull
(
val
):
continue
name
=
'{}_{}'
.
format
(
col
,
val
)
if
name
not
in
self
.
feature_index_
:
self
.
feature_index_
[
name
]
=
last_idx
last_idx
+=
1
self
.
feature_index_
[
col
]
=
last_idx
last_idx
+=
1
return
self
def
fit_transform
(
self
,
df
,
y
=
None
):
self
.
fit
(
df
,
y
)
return
self
.
transform
(
df
)
def
transform_row_
(
self
,
row
,
t
):
ffm
=
[]
if
self
.
y
is
not
None
:
ffm
.
append
(
str
(
row
.
loc
[
row
.
index
==
self
.
y
][
0
]))
if
self
.
y
is
None
:
ffm
.
append
(
str
(
0
))
for
col
,
val
in
row
.
loc
[
row
.
index
!=
self
.
y
]
.
to_dict
()
.
items
():
col_type
=
t
[
col
]
name
=
'{}_{}'
.
format
(
col
,
val
)
if
col_type
.
kind
==
'O'
:
ffm
.
append
(
'{}:{}:1'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
name
]))
elif
col_type
.
kind
==
'i'
:
ffm
.
append
(
'{}:{}:{}'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
col
],
val
))
return
' '
.
join
(
ffm
)
def
transform
(
self
,
df
):
t
=
df
.
dtypes
.
to_dict
()
return
pd
.
Series
({
idx
:
self
.
transform_row_
(
row
,
t
)
for
idx
,
row
in
df
.
iterrows
()})
# 下面这个方法不是这个类原有的方法,是新增的。目的是用来判断这个用户是不是在训练数据集中存在
def
is_feature_index_exist
(
self
,
name
):
if
name
in
self
.
feature_index_
:
return
True
else
:
return
False
#
class FFMFormatPandas:
#
def __init__(self):
#
self.field_index_ = None
#
self.feature_index_ = None
#
self.y = None
#
#
def fit(self, df, y=None):
#
self.y = y
#
df_ffm = df[df.columns.difference([self.y])]
#
if self.field_index_ is None:
#
self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
#
#
if self.feature_index_ is not None:
#
last_idx = max(list(self.feature_index_.values()))
#
#
if self.feature_index_ is None:
#
self.feature_index_ = dict()
#
last_idx = 0
#
#
for col in df.columns:
#
vals = df[col].unique()
#
for val in vals:
#
if pd.isnull(val):
#
continue
#
name = '{}_{}'.format(col, val)
#
if name not in self.feature_index_:
#
self.feature_index_[name] = last_idx
#
last_idx += 1
#
self.feature_index_[col] = last_idx
#
last_idx += 1
#
return self
#
#
def fit_transform(self, df, y=None):
#
self.fit(df, y)
#
return self.transform(df)
#
#
def transform_row_(self, row, t):
#
ffm = []
#
if self.y is not None:
#
ffm.append(str(row.loc[row.index == self.y][0]))
#
if self.y is None:
#
ffm.append(str(0))
#
#
for col, val in row.loc[row.index != self.y].to_dict().items():
#
col_type = t[col]
#
name = '{}_{}'.format(col, val)
#
if col_type.kind == 'O':
#
ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
#
elif col_type.kind == 'i':
#
ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
#
return ' '.join(ffm)
#
#
def transform(self, df):
#
t = df.dtypes.to_dict()
#
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
#
#
# 下面这个方法不是这个类原有的方法,是新增的。目的是用来判断这个用户是不是在训练数据集中存在
#
def is_feature_index_exist(self, name):
#
if name in self.feature_index_:
#
return True
#
else:
#
return False
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment