Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
4f096b7d
Commit
4f096b7d
authored
Aug 07, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
按照日期划分测试集、验证集
parent
2b944151
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
38 deletions
+46
-38
diary-training.py
diary-training.py
+46
-38
No files found.
diary-training.py
View file @
4f096b7d
# Dependencies for the diary FFM training script.
# NOTE: `shuffle` appears unused in this revision (rows are kept in
# stat_date order for the date-based split) — retained for compatibility.
import datetime

import pymysql
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import xlearn as xl
# ---------------------------------------------------------------------------
# Click table: load rows since 2018-08-03 and derive time features.
# NOTE(review): reconstructed from a scraped diff (new side of hunk
# @@ -23,24 +22,41 @@). `con_sql` is defined earlier in the file (not shown
# here); it appears to return a positionally-indexed DataFrame — confirm.
# ---------------------------------------------------------------------------
print("成功获取点击表里的device_id")
# Fetch the click-table rows (stat_date included for the date-based split).
sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= '2018-08-03'"
click = con_sql(sql)
# Columns come back as integer positions; give them names.
click = click.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
print("成功获取点击表里的数据")
# Derive hour/minute from the `time` column (assumed epoch seconds — TODO
# confirm against the data_feed_click schema), then drop the raw timestamp.
click["hour"] = click["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
click["minute"] = click["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
click = click.drop("time", axis=1)
print("点击表数据预览")
print(click.head(2))
# ---------------------------------------------------------------------------
# Exposure table: same pipeline as the click table — load rows since
# 2018-08-03, name the columns, derive hour/minute, drop the raw timestamp.
# NOTE(review): reconstructed from a scraped diff (new side of the hunk).
# ---------------------------------------------------------------------------
sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= '2018-08-03'"
exposure = con_sql(sql)
# Columns come back as integer positions; give them names.
exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
print("成功获取曝光表里的数据")
# `time` assumed epoch seconds, matching the click table — TODO confirm.
exposure["hour"] = exposure["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
exposure["minute"] = exposure["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
exposure = exposure.drop("time", axis=1)
print("曝光表数据预览")
print(exposure.head(2))
# ---------------------------------------------------------------------------
# Set difference: keep only exposure rows that were NOT clicked, then label
# positives (click) and negatives (exposure-without-click).
# ---------------------------------------------------------------------------
print("曝光表处理前的样本个数")
print(exposure.shape)
# Append `click` twice so every clicked row occurs at least twice in the
# combined frame; drop_duplicates(keep=False) then removes ALL copies,
# including click rows that never appeared in `exposure`.
# NOTE(review): keep=False also removes rows duplicated within `exposure`
# itself — presumably acceptable here; confirm.
exposure = exposure.append(click)
exposure = exposure.append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("成功完成曝光表和点击表的差集合")
print("差集后曝光表个数")
print(exposure.shape)
# Keep only devices that have at least one recorded click
# (`click_device_id` is built earlier in the file, not shown here).
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
print(exposure.shape)
# Labels: clicked impressions are positives, unclicked exposures negatives.
click["y"] = 1
exposure["y"] = 0
# ---------------------------------------------------------------------------
# Merge positives and negatives, order by date, and record how many rows
# belong to the test day (2018-08-06) and the validation day (2018-08-05)
# so the head of the date-sorted frame can be split off later.
# NOTE(review): reconstructed from a scraped diff; the old post-merge
# hour/weekday extraction and the shuffle() call appear only on the deleted
# side of the hunk, so they are omitted here — confirm against the repo.
# ---------------------------------------------------------------------------
data = click.append(exposure)
print("done 合并点击表和曝光表")
# Newest stat_date first: the first `test_number` rows are the test day and
# the following `validation_number` rows are the validation day.
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))
test_number = data[data["stat_date"] == '2018-08-06'].shape[0]
validation_number = data[data["stat_date"] == '2018-08-05'].shape[0]
data = data.drop("stat_date", axis=1)
# Zero-valued features are dropped by the libffm text format, so remap
# 0 -> one-past-max (hour 0 -> 24, minute 0 -> 60) before the categorical
# cast; no feature value is 0 afterwards.
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))
print("start ffm transform")
# ---------------------------------------------------------------------------
# Training. A collapsed hunk (@@ -138,42 +155,33 @@) above this point
# converts `data` to libffm format on disk (not visible in this view).
# The date-based train/validation/test split is currently disabled (kept in
# the triple-quoted string) in favour of xLearn's built-in cross-validation.
# ---------------------------------------------------------------------------
print(data.shape)
print(data.head(2))
'''
test = data.loc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv("/home/zhangyanzhao/test.csv", index=False, header=None)
validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小")
print(validation.shape[0])
validation.to_csv("/home/zhangyanzhao/validation.csv", index=False, header=None)
train = data.loc[(test_number + validation_number + 1):]
print("训练集大小")
print(train.shape[0])
train.to_csv("/home/zhangyanzhao/train.csv", index=False, header=None)
'''
print("start training")
ffm_model = xl.create_ffm()
# NOTE(review): train.csv / validation.csv are only written by the block
# commented out above — they must already exist on disk from a previous run;
# confirm before running.
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
param = {'task': 'binary', 'lr': 0.05, 'lambda': 0.002, 'metric': 'auc', 'fold': 3}
# 3-fold cross-validation replaces the previous single fit/predict run.
ffm_model.cv(param)
'''
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setSigmoid()
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
'''
print("end")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment