Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
55f77c05
Commit
55f77c05
authored
Aug 06, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
first time commit
parent
879adfa0
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
163 additions
and
0 deletions
+163
-0
diary-training.py
diary-training.py
+163
-0
No files found.
diary-training.py
0 → 100644
View file @
55f77c05
import
datetime
import
pymysql
import
pandas
as
pd
from
sklearn.utils
import
shuffle
import
numpy
as
np
import
xlearn
as
xl
# 从数据库的表里获取数据,并转化成df格式
def
con_sql
(
sql
):
db
=
pymysql
.
connect
(
host
=
'10.66.157.22'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
result
=
cursor
.
fetchall
()
df
=
pd
.
DataFrame
(
list
(
result
))
.
dropna
()
db
.
close
()
return
df
# 获取点击表里的device_id
sql
=
"select distinct device_id from data_feed_click where cid_type = 'diary'"
click_device_id
=
con_sql
(
sql
)[
0
]
.
values
.
tolist
()
print
(
"成功获取点击表里的device_id"
)
# 获取点击表里的数据
sql
=
"select cid,device_id,time from data_feed_click where cid_type = 'diary'"
click
=
con_sql
(
sql
)
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
})
print
(
"成功获取点击表里的数据"
)
# 获取曝光表里的数据
sql
=
"select cid,device_id,time from data_feed_exposure where cid_type = 'diary'"
exposure
=
con_sql
(
sql
)
exposure
=
exposure
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
})
print
(
"成功获取曝光表里的数据"
)
# 求曝光表和点击表的差集合
exposure
.
append
(
click
)
exposure
.
append
(
click
)
subset
=
click
.
columns
.
tolist
()
exposure
=
exposure
.
drop_duplicates
(
subset
=
subset
,
keep
=
False
)
print
(
"成功完成曝光表和点击表的差集合"
)
exposure
=
exposure
.
loc
[
exposure
[
"device_id"
]
.
isin
(
click_device_id
)]
# 打标签
click
[
"y"
]
=
1
exposure
[
"y"
]
=
0
print
(
"成功获取正负样本"
)
# 合并点击表和曝光表
data
=
click
.
append
(
exposure
)
print
(
"done 合并点击表和曝光表"
)
print
(
data
.
head
(
2
))
# 从time特征中抽取hour、weekday
data
[
"hour"
]
=
data
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
data
[
"weekday"
]
=
data
[
"time"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
weekday
())
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data
.
loc
[
data
[
"hour"
]
==
0
]
=
24
data
.
loc
[
data
[
"weekday"
]
==
0
]
=
7
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"weekday"
]
=
data
[
"weekday"
]
.
astype
(
"category"
)
data
=
data
.
drop
(
"time"
,
axis
=
1
)
print
(
"成功从time特征中抽取hour、weekday"
)
print
(
data
.
head
(
2
))
data
=
shuffle
(
data
)
print
(
"start ffm transform"
)
# ffm 格式转换函数、类
class
FFMFormatPandas
:
def
__init__
(
self
):
self
.
field_index_
=
None
self
.
feature_index_
=
None
self
.
y
=
None
def
fit
(
self
,
df
,
y
=
None
):
self
.
y
=
y
df_ffm
=
df
[
df
.
columns
.
difference
([
self
.
y
])]
if
self
.
field_index_
is
None
:
self
.
field_index_
=
{
col
:
i
for
i
,
col
in
enumerate
(
df_ffm
)}
if
self
.
feature_index_
is
not
None
:
last_idx
=
max
(
list
(
self
.
feature_index_
.
values
()))
if
self
.
feature_index_
is
None
:
self
.
feature_index_
=
dict
()
last_idx
=
0
for
col
in
df
.
columns
:
vals
=
df
[
col
]
.
unique
()
for
val
in
vals
:
if
pd
.
isnull
(
val
):
continue
name
=
'{}_{}'
.
format
(
col
,
val
)
if
name
not
in
self
.
feature_index_
:
self
.
feature_index_
[
name
]
=
last_idx
last_idx
+=
1
self
.
feature_index_
[
col
]
=
last_idx
last_idx
+=
1
return
self
def
fit_transform
(
self
,
df
,
y
=
None
):
self
.
fit
(
df
,
y
)
return
self
.
transform
(
df
)
def
transform_row_
(
self
,
row
,
t
):
ffm
=
[]
if
self
.
y
!=
None
:
ffm
.
append
(
str
(
row
.
loc
[
row
.
index
==
self
.
y
][
0
]))
if
self
.
y
is
None
:
ffm
.
append
(
str
(
0
))
for
col
,
val
in
row
.
loc
[
row
.
index
!=
self
.
y
]
.
to_dict
()
.
items
():
col_type
=
t
[
col
]
name
=
'{}_{}'
.
format
(
col
,
val
)
if
col_type
.
kind
==
'O'
:
ffm
.
append
(
'{}:{}:1'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
name
]))
elif
col_type
.
kind
==
'i'
:
ffm
.
append
(
'{}:{}:{}'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
col
],
val
))
return
' '
.
join
(
ffm
)
def
transform
(
self
,
df
):
t
=
df
.
dtypes
.
to_dict
()
return
pd
.
Series
({
idx
:
self
.
transform_row_
(
row
,
t
)
for
idx
,
row
in
df
.
iterrows
()})
ffm_train
=
FFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
)
print
(
"done transform ffm"
)
n
=
np
.
rint
(
data
.
shape
[
0
]
/
8
)
m
=
np
.
rint
(
data
.
shape
[
0
]
*
(
3
/
8
))
# 1/8的数据集用来做测试集
data
.
loc
[:
n
]
.
to_csv
(
"/home/zhangyanzhao/test.csv"
,
index
=
False
,
header
=
None
)
# 1/4的数据集用来做验证集
data
.
loc
[
n
+
1
:
m
]
.
to_csv
(
"/home/zhangyanzhao/validation.csv"
,
index
=
False
,
header
=
None
)
# 剩余的数据集用来做验证集
data
.
loc
[
m
+
1
:]
.
to_csv
(
"/home/zhangyanzhao/train.csv"
,
index
=
False
,
header
=
None
)
# 销毁data,目的是为了节省内存
data
=
data
.
drop
(
data
.
index
.
tolist
())
print
(
"start training"
)
ffm_model
=
xl
.
create_ffm
()
ffm_model
.
setTrain
(
"/home/zhangyanzhao/train.csv"
)
ffm_model
.
setValidate
(
"/home/zhangyanzhao/validation.csv"
)
param
=
{
'task'
:
'binary'
,
'lr'
:
0.2
,
'lambda'
:
0.002
,
'metric'
:
'auc'
}
ffm_model
.
fit
(
param
,
'/home/zhangyanzhao/model.out'
)
ffm_model
.
setTest
(
"/home/zhangyanzhao/test.csv"
)
ffm_model
.
setSigmoid
()
ffm_model
.
predict
(
"/home/zhangyanzhao/model.out"
,
"/home/zhangyanzhao/output.txt"
)
print
(
"end"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment