Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
65dcce4a
Commit
65dcce4a
authored
Aug 17, 2018
by
高雅喆
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
add the count in result format
parents
8cf15903
e521d2ea
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
5 additions
and
17 deletions
+5
-17
prepareData.py
prepareData.py
+0
-5
processData.py
processData.py
+4
-11
utils.py
utils.py
+1
-1
No files found.
prepareData.py
View file @
65dcce4a
...
...
@@ -15,14 +15,11 @@ def fetch_data(start_date, end_date):
"where stat_date >= '{0}' and stat_date <= '{1}'"
.
format
(
start_date
,
end_date
)
click
=
con_sql
(
sql
)
click
=
click
.
rename
(
columns
=
{
0
:
"cid"
,
1
:
"device_id"
,
2
:
"time_date"
,
3
:
"stat_date"
})
print
(
click
.
head
(
5
))
print
(
"成功获取点击表里的数据"
)
# 从time特征中抽取hour
click
[
"hour"
]
=
click
[
"time_date"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
click
[
"minute"
]
=
click
[
"time_date"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
minute
)
click
=
click
.
drop
(
"time_date"
,
axis
=
1
)
print
(
"点击表数据预览"
)
print
(
click
.
head
(
2
))
# 获取曝光表里的数据
sql
=
"select cid,device_id,time,stat_date from data_feed_exposure "
\
...
...
@@ -37,7 +34,5 @@ def fetch_data(start_date, end_date):
exposure
[
"hour"
]
=
exposure
[
"time_date"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
hour
)
exposure
[
"minute"
]
=
exposure
[
"time_date"
]
.
apply
(
lambda
x
:
datetime
.
datetime
.
fromtimestamp
(
x
)
.
minute
)
exposure
=
exposure
.
drop
(
"time_date"
,
axis
=
1
)
print
(
"曝光表数据预览"
)
print
(
exposure
.
head
(
2
))
return
exposure
,
click
,
click_device_id
processData.py
View file @
65dcce4a
...
...
@@ -36,11 +36,8 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
# 合并点击表和曝光表
data
=
click
.
append
(
exposure
)
print
(
"点击表和曝光表合并成功"
)
data
=
data
.
sort_values
(
by
=
"stat_date"
,
ascending
=
False
)
print
(
"前两行数据"
)
print
(
data
.
head
(
2
))
print
(
"后两行数据"
)
print
(
data
.
tail
(
2
))
test_number
=
data
[
data
[
"stat_date"
]
==
test_date
]
.
shape
[
0
]
validation_number
=
data
[
data
[
"stat_date"
]
==
validation_date
]
.
shape
[
0
]
data
=
data
.
drop
(
"stat_date"
,
axis
=
1
)
...
...
@@ -50,23 +47,20 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
data
.
loc
[
data
[
"minute"
]
==
0
,
[
"minute"
]]
=
60
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"minute"
]
=
data
[
"minute"
]
.
astype
(
"category"
)
print
(
data
.
head
(
2
))
# 持久化候选cid
data_set_cid
=
data
[
"cid"
]
.
unique
()
cid_df
=
pd
.
DataFrame
()
cid_df
[
'cid'
]
=
data_set_cid
print
(
"data_set_cid :"
)
print
(
cid_df
.
head
(
2
))
cid_df
.
to_csv
(
DIRECTORY_PATH
+
"data_set_cid.csv"
,
index
=
False
)
print
(
"成功保存data_set_cid"
)
# 将device_id 保存,目的是为了判断预测的device_id是否在这个集合里,如果不在,不需要预测
data_set_device_id
=
data
[
"device_id"
]
.
unique
()
device_id_df
=
pd
.
DataFrame
()
device_id_df
[
'device_id'
]
=
data_set_device_id
print
(
"data_set_device_id :"
)
print
(
device_id_df
.
head
(
2
))
device_id_df
.
to_csv
(
DIRECTORY_PATH
+
"data_set_device_id.csv"
,
index
=
False
)
print
(
"成功保存data_set_device_id"
)
return
data
,
test_number
,
validation_number
...
...
@@ -75,7 +69,7 @@ def ffm_transform(data, test_number, validation_number):
print
(
"Start ffm transform"
)
start
=
time
.
time
()
ffm_train
=
multiFFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
100000
,
processes
=
6
)
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
50000
,
processes
=
4
)
with
open
(
DIRECTORY_PATH
+
"ffm.pkl"
,
"wb"
)
as
f
:
pickle
.
dump
(
ffm_train
,
f
)
...
...
@@ -88,7 +82,6 @@ def ffm_transform(data, test_number, validation_number):
data
=
pd
.
read_csv
(
DIRECTORY_PATH
+
"total_ffm_data.csv"
,
header
=
None
)
print
(
"数据集大小"
)
print
(
data
.
shape
)
print
(
data
.
head
(
2
))
test
=
data
.
loc
[:
test_number
]
print
(
"测试集大小"
)
...
...
utils.py
View file @
65dcce4a
...
...
@@ -89,7 +89,7 @@ class multiFFMFormatPandas:
return
self
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
200000
,
processes
=
8
):
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
50000
,
processes
=
4
):
# n是每个线程运行最大的数据条数,processes是线程数
self
.
fit
(
df
,
y
)
n
=
n
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment