Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
e83cb697
Commit
e83cb697
authored
Aug 07, 2018
by
高雅喆
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
add dea/ml_tools/roc_curve.py
parents
ef5008d2
0aa8923e
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
73 additions
and
73 deletions
+73
-73
diaryTraining.py
diaryTraining.py
+73
-73
No files found.
diaryTraining.py
View file @
e83cb697
...
@@ -4,89 +4,89 @@ import xlearn as xl
...
@@ -4,89 +4,89 @@ import xlearn as xl
import
time
import
time
from
prepareData
import
fetch_data
from
prepareData
import
fetch_data
exposure
,
click
,
click_device_id
=
fetch_data
()
# exposure, click, click_device_id = fetch_data()
#
# # 求曝光表和点击表的差集合
# print("曝光表处理前的样本个数")
# print(exposure.shape)
# exposure = exposure.append(click)
# exposure = exposure.append(click)
# subset = click.columns.tolist()
# exposure = exposure.drop_duplicates(subset=subset,keep=False)
# print("差集后曝光表个数")
# print(exposure.shape)
# exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
# print("去除未点击用户后曝光表个数")
# print(exposure.shape)
# # 打标签
# click["y"] = 1
# exposure["y"] = 0
#
# print("正样本个数")
# print(click.shape[0])
# print("负样本个数")
# print(exposure.shape[0])
#
# # 合并点击表和曝光表
# data = click.append(exposure)
# data = data.sort_values(by="stat_date",ascending=False)
# print("前两行数据")
# print(data.head(2))
# print("后两行数据")
# print(data.tail(2))
# test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
# validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
# data = data.drop("stat_date",axis=1)
#
# # 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
# data.loc[data["hour"]==0,["hour"]] = 24
# data.loc[data["minute"]==0,["minute"]] = 60
# data["hour"] = data["hour"].astype("category")
# data["minute"] = data["minute"].astype("category")
# print(data.head(2))
#
#
# print("start ffm transform")
# start = time.time()
# ffm_train = FFMFormatPandas()
# data = ffm_train.fit_transform(data, y='y')
# print("done transform ffm")
# end = time.time()
# print("ffm转化数据耗时:")
# print(end-start)
# data.to_csv("/home/zhangyanzhao/data.csv",index=False)
# data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
# print("数据集大小")
# print(data.shape)
# print(data.head(2))
# 求曝光表和点击表的差集合
# test = data.loc[:test_number]
print
(
"曝光表处理前的样本个数"
)
# print("测试集大小")
print
(
exposure
.
shape
)
# print(test.shape[0])
exposure
=
exposure
.
append
(
click
)
# test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
exposure
=
exposure
.
append
(
click
)
# validation = data.loc[(test_number+1):(test_number+validation_number)]
subset
=
click
.
columns
.
tolist
()
# print("验证集大小")
exposure
=
exposure
.
drop_duplicates
(
subset
=
subset
,
keep
=
False
)
# print(validation.shape[0])
print
(
"差集后曝光表个数"
)
# validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
print
(
exposure
.
shape
)
# train = data.loc[(test_number+validation_number+1):]
exposure
=
exposure
.
loc
[
exposure
[
"device_id"
]
.
isin
(
click_device_id
)]
# print("训练集大小")
print
(
"去除未点击用户后曝光表个数"
)
# print(train.shape[0])
print
(
exposure
.
shape
)
# train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
# 打标签
click
[
"y"
]
=
1
exposure
[
"y"
]
=
0
print
(
"正样本个数"
)
print
(
click
.
shape
[
0
])
print
(
"负样本个数"
)
print
(
exposure
.
shape
[
0
])
# 合并点击表和曝光表
data
=
click
.
append
(
exposure
)
data
=
data
.
sort_values
(
by
=
"stat_date"
,
ascending
=
False
)
print
(
"前两行数据"
)
print
(
data
.
head
(
2
))
print
(
"后两行数据"
)
print
(
data
.
tail
(
2
))
test_number
=
data
[
data
[
"stat_date"
]
==
'2018-08-06'
]
.
shape
[
0
]
validation_number
=
data
[
data
[
"stat_date"
]
==
'2018-08-05'
]
.
shape
[
0
]
data
=
data
.
drop
(
"stat_date"
,
axis
=
1
)
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data
.
loc
[
data
[
"hour"
]
==
0
,[
"hour"
]]
=
24
data
.
loc
[
data
[
"minute"
]
==
0
,[
"minute"
]]
=
60
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"minute"
]
=
data
[
"minute"
]
.
astype
(
"category"
)
print
(
data
.
head
(
2
))
print
(
"start ffm transform"
)
start
=
time
.
time
()
ffm_train
=
FFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
)
print
(
"done transform ffm"
)
end
=
time
.
time
()
print
(
"ffm转化数据耗时:"
)
print
(
end
-
start
)
data
.
to_csv
(
"/home/zhangyanzhao/data.csv"
,
index
=
False
)
data
=
pd
.
read_csv
(
"/home/zhangyanzhao/data.csv"
,
header
=
None
)
print
(
"数据集大小"
)
print
(
data
.
shape
)
print
(
data
.
head
(
2
))
test
=
data
.
loc
[:
test_number
]
print
(
"测试集大小"
)
print
(
test
.
shape
[
0
])
test
.
to_csv
(
"/home/zhangyanzhao/test.csv"
,
index
=
False
,
header
=
None
)
validation
=
data
.
loc
[(
test_number
+
1
):(
test_number
+
validation_number
)]
print
(
"验证集大小"
)
print
(
validation
.
shape
[
0
])
validation
.
to_csv
(
"/home/zhangyanzhao/validation.csv"
,
index
=
False
,
header
=
None
)
train
=
data
.
loc
[(
test_number
+
validation_number
+
1
):]
print
(
"训练集大小"
)
print
(
train
.
shape
[
0
])
train
.
to_csv
(
"/home/zhangyanzhao/train.csv"
,
index
=
False
,
header
=
None
)
print
(
"start training"
)
print
(
"start training"
)
ffm_model
=
xl
.
create_ffm
()
ffm_model
=
xl
.
create_ffm
()
ffm_model
.
setTrain
(
"/home/zhangyanzhao/train.csv"
)
ffm_model
.
setTrain
(
"/home/zhangyanzhao/data.csv"
)
ffm_model
.
setValidate
(
"/home/zhangyanzhao/validation.csv"
)
ffm_model
.
setValidate
(
"/home/zhangyanzhao/data.csv"
)
param
=
{
'task'
:
'binary'
,
'lr'
:
0.03
,
param
=
{
'task'
:
'binary'
,
'lr'
:
0.03
,
'lambda'
:
0.002
,
'metric'
:
'auc'
}
'lambda'
:
0.002
,
'metric'
:
'auc'
}
ffm_model
.
fit
(
param
,
'/home/zhangyanzhao/model.out'
)
ffm_model
.
fit
(
param
,
'/home/zhangyanzhao/model.out'
)
ffm_model
.
setTest
(
"/home/zhangyanzhao/data.csv"
)
ffm_model
.
setTest
(
"/home/zhangyanzhao/test.csv"
)
ffm_model
.
setSigmoid
()
ffm_model
.
setSigmoid
()
ffm_model
.
predict
(
"/home/zhangyanzhao/model.out"
,
ffm_model
.
predict
(
"/home/zhangyanzhao/model.out"
,
"/home/zhangyanzhao/output.txt"
)
"/home/zhangyanzhao/output.txt"
)
print
(
"end"
)
print
(
"end"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment