Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
1af713e4
Commit
1af713e4
authored
Aug 07, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor
parent
0aa8923e
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
95 additions
and
81 deletions
+95
-81
config.py
config.py
+6
-0
diaryCandidateSet.py
diaryCandidateSet.py
+0
-0
diaryTraining.py
diaryTraining.py
+8
-78
prepareTestData.py
local/prepareTestData.py
+0
-0
testCases.py
local/testCases.py
+0
-0
prepareData.py
prepareData.py
+5
-3
processData.py
processData.py
+76
-0
No files found.
config.py
0 → 100644
View file @
1af713e4
# Base directory for every artifact the pipeline reads or writes (data.csv,
# the train/validation/test splits, model.out, output.txt).
# Keep the trailing slash: callers build paths by plain string concatenation
# (DIRECTORY_PATH + "data.csv").
# Used by processData.py and diaryTraining.py.
DIRECTORY_PATH = '/home/zhangyanzhao/'
test/diaryTest
Set.py
→
diaryCandidate
Set.py
View file @
1af713e4
File moved
diaryTraining.py
View file @
1af713e4
import
pandas
as
pd
from
utils
import
FFMFormatPandas
import
xlearn
as
xl
import
time
from
prepareData
import
fetch_data
# exposure, click, click_device_id = fetch_data()
#
# # 求曝光表和点击表的差集合
# print("曝光表处理前的样本个数")
# print(exposure.shape)
# exposure = exposure.append(click)
# exposure = exposure.append(click)
# subset = click.columns.tolist()
# exposure = exposure.drop_duplicates(subset=subset,keep=False)
# print("差集后曝光表个数")
# print(exposure.shape)
# exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
# print("去除未点击用户后曝光表个数")
# print(exposure.shape)
# # 打标签
# click["y"] = 1
# exposure["y"] = 0
#
# print("正样本个数")
# print(click.shape[0])
# print("负样本个数")
# print(exposure.shape[0])
#
# # 合并点击表和曝光表
# data = click.append(exposure)
# data = data.sort_values(by="stat_date",ascending=False)
# print("前两行数据")
# print(data.head(2))
# print("后两行数据")
# print(data.tail(2))
# test_number = data[data["stat_date"]=='2018-08-06'].shape[0]
# validation_number = data[data["stat_date"]=='2018-08-05'].shape[0]
# data = data.drop("stat_date",axis=1)
#
# # 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
# data.loc[data["hour"]==0,["hour"]] = 24
# data.loc[data["minute"]==0,["minute"]] = 60
# data["hour"] = data["hour"].astype("category")
# data["minute"] = data["minute"].astype("category")
# print(data.head(2))
#
#
# print("start ffm transform")
# start = time.time()
# ffm_train = FFMFormatPandas()
# data = ffm_train.fit_transform(data, y='y')
# print("done transform ffm")
# end = time.time()
# print("ffm转化数据耗时:")
# print(end-start)
# data.to_csv("/home/zhangyanzhao/data.csv",index=False)
# data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
# print("数据集大小")
# print(data.shape)
# print(data.head(2))
import
xlearn
as
xl
from
config
import
*
# test = data.loc[:test_number]
# print("测试集大小")
# print(test.shape[0])
# test.to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
# validation = data.loc[(test_number+1):(test_number+validation_number)]
# print("验证集大小")
# print(validation.shape[0])
# validation.to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
# train = data.loc[(test_number+validation_number+1):]
# print("训练集大小")
# print(train.shape[0])
# train.to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
# Train an xlearn FFM model on the prepared dataset and write predictions.
# All paths are built from DIRECTORY_PATH (config.py) so the location is
# configured in one place instead of being hard-coded per call.
print("start training")
ffm_model = xl.create_ffm()

# NOTE(review): training, validation and testing all point at the same
# data.csv, although processData.py also writes train.csv, validation.csv and
# test.csv. Confirm whether the split files were meant to be used here.
ffm_model.setTrain(DIRECTORY_PATH + "data.csv")
ffm_model.setValidate(DIRECTORY_PATH + "data.csv")

# Binary classification, evaluated with AUC.
param = {'task': 'binary', 'lr': 0.03, 'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, DIRECTORY_PATH + "model.out")

ffm_model.setTest(DIRECTORY_PATH + "data.csv")
# Enable sigmoid on the prediction output before predicting.
ffm_model.setSigmoid()
ffm_model.predict(DIRECTORY_PATH + "model.out", DIRECTORY_PATH + "output.txt")
print("end")
test
/prepareTestData.py
→
local
/prepareTestData.py
View file @
1af713e4
File moved
test
/testCases.py
→
local
/testCases.py
View file @
1af713e4
File moved
prepareData.py
View file @
1af713e4
...
...
@@ -2,7 +2,7 @@ from utils import con_sql
import
datetime
def
fetch_data
(
start_date
=
'2018-08-03'
):
def
fetch_data
(
start_date
,
end_date
):
# 获取点击表里的device_id
sql
=
"select distinct device_id from data_feed_click"
...
...
@@ -10,7 +10,8 @@ def fetch_data(start_date='2018-08-03'):
print
(
"成功获取点击表里的device_id"
)
# 获取点击表里的数据
sql
=
"select cid,device_id,time,stat_date from data_feed_click where stat_date >= {0}"
.
format
(
start_date
)
sql
=
"select cid,device_id,time,stat_date from data_feed_click "
\
"where stat_date >= {0} and stat_date <= {1}"
.
format
(
start_date
,
end_date
)
click
=
con_sql
(
sql
)
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
,
3
:
"stat_date"
})
print
(
"成功获取点击表里的数据"
)
...
...
@@ -22,7 +23,8 @@ def fetch_data(start_date='2018-08-03'):
print
(
click
.
head
(
2
))
# 获取曝光表里的数据
sql
=
"select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= {0}"
.
format
(
start_date
)
sql
=
"select cid,device_id,time,stat_date from data_feed_exposure "
\
"where stat_date >= {0} and stat_date <= {1}"
.
format
(
start_date
,
end_date
)
exposure
=
con_sql
(
sql
)
exposure
=
exposure
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time"
,
3
:
"stat_date"
})
print
(
"成功获取曝光表里的数据"
)
...
...
processData.py
0 → 100644
View file @
1af713e4
import
time
from
prepareData
import
fetch_data
from
utils
import
FFMFormatPandas
import
pandas
as
pd
from
config
import
*
# Build the FFM-formatted click/exposure dataset and split it by date into
# test / validation / training files under DIRECTORY_PATH.

# Date window requested from fetch_data.
START_DATE = '2018-08-03'
END_DATE = '2018-08-06'
# Rows dated TEST_DATE form the test set, rows dated VALIDATION_DATE the
# validation set, and all remaining (older) rows the training set.
TEST_DATE = '2018-08-06'
VALIDATION_DATE = '2018-08-05'

exposure, click, click_device_id = fetch_data(
    start_date=START_DATE, end_date=END_DATE)

# Set difference: exposure rows minus click rows.
print("曝光表处理前的样本个数")
print(exposure.shape)
# Adding click twice guarantees every click row occurs at least twice, so
# drop_duplicates(keep=False) removes all of them together with any exposure
# row identical to a click. (Exposure rows duplicated among themselves are
# removed as well.)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
subset = click.columns.tolist()
exposure = pd.concat([exposure, click, click])
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数")
print(exposure.shape)

# Keep only exposures from devices that appear in click_device_id.
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
print(exposure.shape)

# Label the samples: clicks are positives, remaining exposures negatives.
click["y"] = 1
exposure["y"] = 0

print("正样本个数")
print(click.shape[0])
print("负样本个数")
print(exposure.shape[0])

# Merge clicks and exposures, newest date first, so that the test-date rows
# come first, followed by the validation-date rows, then the rest.
# NOTE(review): index labels are not reset after the concat; confirm that
# FFMFormatPandas does not key rows by index label.
data = pd.concat([click, exposure])
data = data.sort_values(by="stat_date", ascending=False)
print("前两行数据")
print(data.head(2))
print("后两行数据")
print(data.tail(2))

# Row counts per hold-out date; used below as positional split boundaries.
test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
data = data.drop("stat_date", axis=1)

# Features whose value is 0 are dropped by the FFM format, so remap hour 0 to
# 24 and minute 0 to 60; afterwards neither column contains a 0.
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data.head(2))

print("Start ffm transform")
start = time.time()
ffm_train = FFMFormatPandas()
# assumes fit_transform keeps the row order of its input - TODO confirm;
# the positional split below depends on it.
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时:")
print(end - start)

# Round-trip through CSV. The file is written without a header row so that
# the header=None read below sees data rows only, regardless of the pandas
# version's default for `header`. The re-read frame has a fresh 0..n-1 index.
data.to_csv(DIRECTORY_PATH + "data.csv", index=False, header=False)
data = pd.read_csv(DIRECTORY_PATH + "data.csv", header=None)
print("数据集大小")
print(data.shape)
print(data.head(2))

# Positional, half-open slices: exactly test_number rows, then exactly
# validation_number rows, then the remainder. (.loc slices include their end
# label, which put one extra row in the test set and shifted the validation
# window by one.)
validation_end = test_number + validation_number

test = data.iloc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test.csv", index=False, header=False)

validation = data.iloc[test_number:validation_end]
print("验证集大小")
print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation.csv", index=False, header=False)

train = data.iloc[validation_end:]
print("训练集大小")
print(train.shape[0])
train.to_csv(DIRECTORY_PATH + "train.csv", index=False, header=False)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment