Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
4f096b7d
Commit
4f096b7d
authored
6 years ago
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
按照日期划分测试集、验证集
parent
2b944151
master
gyz
mr/beta/bug22
offic
rtt
test
updatedb
wzw
zhao
zhao22
No related merge requests found
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
38 deletions
+46
-38
diary-training.py
diary-training.py
+46
-38
No files found.
diary-training.py
View file @
4f096b7d
import
datetime
import
pymysql
import
pandas
as
pd
from
sklearn.utils
import
shuffle
import
numpy
as
np
import
xlearn
as
xl
...
...
@@ -23,24 +22,41 @@ click_device_id = con_sql(sql)[0].values.tolist()
print
(
"成功获取点击表里的device_id"
)
# 获取点击表里的数据
sql
=
"select cid,device_id,time
from data_feed_click where stat_date >= '2018-07-25
'"
sql
=
"select cid,device_id,time
,stat_date from data_feed_click where stat_date >= '2018-08-03
'"
click
=
con_sql
(
sql
)
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
})
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
,
3
:
"stat_date"
})
print
(
"成功获取点击表里的数据"
)
# 从time特征中抽取hour
click
[
"hour"
]
=
click
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
click
[
"minute"
]
=
click
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
minute
)
click
=
click
.
drop
(
"time"
,
axis
=
1
)
print
(
"点击表数据预览"
)
print
(
click
.
head
(
2
))
# 获取曝光表里的数据
sql
=
"select cid,device_id,time
from data_feed_exposure where stat_date >= '2018-07-25
'"
sql
=
"select cid,device_id,time
,stat_date from data_feed_exposure where stat_date >= '2018-08-03
'"
exposure
=
con_sql
(
sql
)
exposure
=
exposure
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
})
exposure
=
exposure
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
,
3
:
"stat_date"
})
print
(
"成功获取曝光表里的数据"
)
# 从time特征中抽取hour
exposure
[
"hour"
]
=
exposure
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
exposure
[
"minute"
]
=
exposure
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
minute
)
exposure
=
exposure
.
drop
(
"time"
,
axis
=
1
)
print
(
"曝光表数据预览"
)
print
(
exposure
.
head
(
2
))
# 求曝光表和点击表的差集合
print
(
"曝光表处理前的样本个数"
)
print
(
exposure
.
shape
)
exposure
=
exposure
.
append
(
click
)
exposure
=
exposure
.
append
(
click
)
subset
=
click
.
columns
.
tolist
()
exposure
=
exposure
.
drop_duplicates
(
subset
=
subset
,
keep
=
False
)
print
(
"成功完成曝光表和点击表的差集合"
)
print
(
"差集后曝光表个数"
)
print
(
exposure
.
shape
)
exposure
=
exposure
.
loc
[
exposure
[
"device_id"
]
.
isin
(
click_device_id
)]
print
(
"去除未点击用户后曝光表个数"
)
print
(
exposure
.
shape
)
# 打标签
click
[
"y"
]
=
1
exposure
[
"y"
]
=
0
...
...
@@ -52,21 +68,22 @@ print(exposure.shape[0])
# 合并点击表和曝光表
data
=
click
.
append
(
exposure
)
print
(
"done 合并点击表和曝光表"
)
data
=
data
.
sort_values
(
by
=
"stat_date"
,
ascending
=
False
)
print
(
"前两行数据"
)
print
(
data
.
head
(
2
))
# 从time特征中抽取hour、weekday
data
[
"hour"
]
=
data
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
data
[
"weekday"
]
=
data
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
weekday
())
print
(
"后两行数据"
)
print
(
data
.
tail
(
2
))
test_number
=
data
[
data
[
"stat_date"
]
==
'2018-08-06'
]
.
shape
[
0
]
validation_number
=
data
[
data
[
"stat_date"
]
==
'2018-08-05'
]
.
shape
[
0
]
data
=
data
.
drop
(
"stat_date"
,
axis
=
1
)
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data
.
loc
[
data
[
"hour"
]
==
0
,[
"hour"
]]
=
24
data
.
loc
[
data
[
"
weekday"
]
==
0
,[
"weekday"
]]
=
7
data
.
loc
[
data
[
"
minute"
]
==
0
,[
"minute"
]]
=
60
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"weekday"
]
=
data
[
"weekday"
]
.
astype
(
"category"
)
data
=
data
.
drop
(
"time"
,
axis
=
1
)
print
(
"成功从time特征中抽取hour、weekday"
)
data
[
"minute"
]
=
data
[
"minute"
]
.
astype
(
"category"
)
print
(
data
.
head
(
2
))
data
=
shuffle
(
data
)
print
(
"start ffm transform"
)
...
...
@@ -138,42 +155,33 @@ print("数据集大小")
print
(
data
.
shape
)
print
(
data
.
head
(
2
))
'''
n = np.rint(data.shape[0]/8)
m = np.rint(data.shape[0]*(3/8))
# 1/8的数据集用来做测试集
test = data.loc[:n]
test
=
data
.
loc
[:
test_number
]
print
(
"测试集大小"
)
print(test.shape)
print
(
test
.
shape
[
0
]
)
test
.
to_csv
(
"/home/zhangyanzhao/test.csv"
,
index
=
False
,
header
=
None
)
# 1/4的数据集用来做验证集
validation = data.loc[n+1:m]
validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
validation
=
data
.
loc
[(
test_number
+
1
):(
test_number
+
validation_number
)]
print
(
"验证集大小"
)
print(validation.shape)
# 剩余的数据集用来做训练集
train = data.loc[
m+1
:]
print
(
validation
.
shape
[
0
]
)
validation
.
to_csv
(
"/home/zhangyanzhao/validation.csv"
,
index
=
False
,
header
=
None
)
train
=
data
.
loc
[
(
test_number
+
validation_number
+
1
)
:]
print
(
"训练集大小"
)
print(train.shape)
print
(
train
.
shape
[
0
]
)
train
.
to_csv
(
"/home/zhangyanzhao/train.csv"
,
index
=
False
,
header
=
None
)
'''
print
(
"start training"
)
ffm_model
=
xl
.
create_ffm
()
ffm_model
.
setTrain
(
"/home/zhangyanzhao/data.csv"
)
# ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
ffm_model
.
setTrain
(
"/home/zhangyanzhao/train.csv"
)
ffm_model
.
setValidate
(
"/home/zhangyanzhao/validation.csv"
)
param
=
{
'task'
:
'binary'
,
'lr'
:
0.03
,
'lambda'
:
0.002
,
'metric'
:
'auc'
}
param
=
{
'task'
:
'binary'
,
'lr'
:
0.05
,
'lambda'
:
0.002
,
'metric'
:
'auc'
,
'fold'
:
3
}
ffm_model
.
fit
(
param
,
'/home/zhangyanzhao/model.out'
)
ffm_model
.
cv
(
param
)
'''
ffm_model
.
setTest
(
"/home/zhangyanzhao/test.csv"
)
ffm_model
.
setSigmoid
()
ffm_model
.
predict
(
"/home/zhangyanzhao/model.out"
,
"/home/zhangyanzhao/output.txt"
)
'''
print
(
"end"
)
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment