Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
83a7534f
Commit
83a7534f
authored
6 years ago
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add predict, user profile, candidates set, filter cids
parent
724824ce
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
207 additions
and
96 deletions
+207
-96
.DS_Store
.DS_Store
+0
-0
.gitignore
.gitignore
+2
-0
config.py
config.py
+1
-1
diaryCandidateSet.py
diaryCandidateSet.py
+23
-16
diaryTraining.py
diaryTraining.py
+5
-0
testCases.py
local/testCases.py
+5
-5
processData.py
processData.py
+95
-73
userDiaryPredict.py
userDiaryPredict.py
+61
-1
userProfile.py
userProfile.py
+9
-0
utils.py
utils.py
+6
-0
No files found.
.DS_Store
deleted
100644 → 0
View file @
724824ce
File deleted
This diff is collapsed.
Click to expand it.
.gitignore
View file @
83a7534f
data/
*.pyc
.DS_Store
This diff is collapsed.
Click to expand it.
config.py
View file @
83a7534f
...
...
@@ -5,6 +5,6 @@ TEST_DATE = '2018-08-06'
DATA_START_DATE
=
'2018-07-05'
DATA_END_DATE
=
'2018-08-06'
MODEL_VERSION
=
''
# processData.py
# diaryTraining.py
This diff is collapsed.
Click to expand it.
diaryCandidateSet.py
View file @
83a7534f
...
...
@@ -3,12 +3,25 @@ import pandas as pd
from
utils
import
*
from
config
import
*
# Candidate cids may only be chosen from the cids present in the training data set.
def filter_cid(df):
    """Drop the rows of ``df`` whose "cid" is not listed in data_set_cid.csv.

    The allowed cids are read from ``DIRECTORY_PATH + "data_set_cid.csv"``
    (column "cid") on every call. The shape of the frame is printed before
    and after filtering.

    Args:
        df: frame with a "cid" column.

    Returns:
        The rows of ``df`` whose "cid" is in the allowed list.
    """
    allowed_cids = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()

    print("过滤前样本大小:")
    print(df.shape)

    keep_mask = df["cid"].isin(allowed_cids)
    filtered = df.loc[keep_mask]

    print("过滤后样本大小:")
    print(filtered.shape)
    return filtered
def get_allCitiesDiaryTop2000():
    """Fetch, filter and persist the nationwide top-2000 diaries by clicks.

    Runs the query below through ``con_sql``, names the two result columns
    "city_id" and "cid", keeps only the cids accepted by ``filter_cid`` and
    writes the result to ``diaryTestSet/allCitiesDiaryTop2000.csv`` under
    ``DIRECTORY_PATH``.

    Returns:
        The filtered frame with columns "city_id" and "cid".
    """
    # Nationwide TOP-2000 diaries ordered by click count.
    sql = "select city_id,cid from data_feed_click where cid_type = 'diary' order by click_count_choice desc limit 2000"
    allCitiesTop2000 = con_sql(sql)
    # con_sql results are addressed by positional column labels 0 and 1.
    allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
    allCitiesTop2000 = filter_cid(allCitiesTop2000)
    # Written once, after filtering: an earlier write of the unfiltered frame
    # to the same path would only be overwritten here.
    allCitiesTop2000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop2000.csv")
    print("成功获取全国日记点击量TOP2000")
    return allCitiesTop2000
...
...
@@ -17,7 +30,7 @@ def get_cityList():
# 获取全国城市列表
sql
=
"select distinct city_id from data_feed_click"
cityList
=
con_sql
(
sql
)
cityList
.
to_csv
(
DIRECTORY_PATH
+
"diaryTestSet/cityList.csv"
)
cityList
.
to_csv
(
DIRECTORY_PATH
+
"diaryTestSet/cityList.csv"
)
cityList
=
cityList
[
0
]
.
values
.
tolist
()
cityList
.
remove
(
'worldwide'
)
print
(
"成功获取全国城市列表"
)
...
...
@@ -33,25 +46,19 @@ def get_eachCityDiaryTop2000():
"where cid_type = 'diary' and city_id = '{0}' "
\
"order by click_count_choice desc limit 2000"
.
format
(
i
)
data
=
con_sql
(
sql
)
data
=
data
.
rename
(
columns
=
{
0
:
"city_id"
,
1
:
"cid"
})
if
data
.
shape
[
0
]
<
2000
:
n
=
2000
-
data
.
shape
[
0
]
data
=
data
.
rename
(
columns
=
{
0
:
"city_id"
,
1
:
"cid"
})
data
=
filter_cid
(
data
)
if
data
.
shape
[
0
]
<
2000
:
n
=
2000
-
data
.
shape
[
0
]
# 全国点击量TOP2000日记中去除该城市的日记
temp
=
allCitiesTop2000
[
allCitiesTop2000
[
"city_id"
]
!=
i
]
.
loc
[:
n
-
1
]
temp
=
allCitiesTop2000
[
allCitiesTop2000
[
"city_id"
]
!=
i
]
.
loc
[:
n
-
1
]
data
=
data
.
append
(
temp
)
else
:
pass
file_name
=
DIRECTORY_PATH
+
"diaryTestSet/{0}DiaryTop2000.csv"
.
format
(
i
)
file_name
=
DIRECTORY_PATH
+
"diaryTestSet/{0}DiaryTop2000.csv"
.
format
(
i
)
data
.
to_csv
(
file_name
)
# Running this module as a script calls get_eachCityDiaryTop2000().
if __name__ == "__main__":
    get_eachCityDiaryTop2000()
This diff is collapsed.
Click to expand it.
diaryTraining.py
View file @
83a7534f
import
xlearn
as
xl
from
config
import
*
from
diaryCandidateSet
import
get_eachCityDiaryTop2000
print
(
"Start training"
)
ffm_model
=
xl
.
create_ffm
()
...
...
@@ -20,3 +21,7 @@ ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DA
DATA_END_DATE
,
"0.03"
,
"0.002"
),
DIRECTORY_PATH
+
"testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt"
.
format
(
TEST_DATE
,
DATA_START_DATE
,
DATA_END_DATE
,
"0.03"
,
"0.002"
))
print
(
'---------------candidates--------------'
)
get_eachCityDiaryTop2000
()
This diff is collapsed.
Click to expand it.
local/testCases.py
View file @
83a7534f
...
...
@@ -5,17 +5,17 @@ import pickle
if __name__ == '__main__':
    # Local smoke test: fit the FFM encoder on the raw exposure sample,
    # round-trip the encoder through pickle, and re-encode one held-out row.
    data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
    data["y"] = 1
    test_data = data.tail(1)

    ffm = FFMFormatPandas()
    data = ffm.fit_transform(data, y='y')
    # All artefacts go under ../data/ so nothing is left in the working
    # directory (a single write per artefact, one file handle at a time).
    data.to_csv("../data/ffm_data.csv", index=False)

    with open("../data/ffm.object", "wb") as f:
        pickle.dump(ffm, f)

    with open("../data/ffm.object", "rb") as f:
        ffm = pickle.load(f)

    result = ffm.transform(test_data)
    print(result)

    data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
    print(data_1)
This diff is collapsed.
Click to expand it.
processData.py
View file @
83a7534f
...
...
@@ -5,80 +5,102 @@ import pandas as pd
from
config
import
*
import
pickle
exposure
,
click
,
click_device_id
=
fetch_data
(
def feature_en():
    """Build the labelled click/exposure sample table and persist its cids.

    Steps:
      1. Fetch exposure and click rows for DATA_START_DATE..DATA_END_DATE.
      2. Remove from the exposure table every row that also occurs in the
         click table, then keep only exposures of devices in
         ``click_device_id``.
      3. Label clicks with y=1 and the remaining exposures with y=0, merge
         them and sort by "stat_date" descending.
      4. Count the rows dated TEST_DATE and VALIDATION_DATE, drop
         "stat_date", and remap hour 0 -> 24 and minute 0 -> 60.
      5. Write the distinct cids to ``DIRECTORY_PATH + "data_set_cid.csv"``.

    Returns:
        (data, test_number, validation_number): the feature frame and the
        number of rows dated TEST_DATE and VALIDATION_DATE respectively.
    """
    exposure, click, click_device_id = fetch_data(
        start_date=DATA_START_DATE, end_date=DATA_END_DATE)

    # Set difference exposure - click. click is appended twice on purpose:
    # every click row then occurs at least twice, so drop_duplicates with
    # keep=False removes it whether or not it was in exposure.
    print("曝光表处理前的样本个数")
    print(exposure.shape)
    exposure = exposure.append(click)
    exposure = exposure.append(click)
    subset = click.columns.tolist()
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("差集后曝光表个数")
    print(exposure.shape)

    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
    print("去除未点击用户后曝光表个数")
    print(exposure.shape)

    # Label the samples.
    click["y"] = 1
    exposure["y"] = 0
    print("正样本个数")
    print(click.shape[0])
    print("负样本个数")
    print(exposure.shape[0])

    # Merge the click table and the exposure table.
    data = click.append(exposure)
    data = data.sort_values(by="stat_date", ascending=False)
    # The merged frame carries the index labels of both inputs, so labels can
    # repeat. FFMFormatPandas.transform keys its output by index label, which
    # would silently overwrite rows sharing a label; give every row a unique
    # label in sorted order.
    data = data.reset_index(drop=True)
    print("前两行数据")
    print(data.head(2))
    print("后两行数据")
    print(data.tail(2))

    test_number = data[data["stat_date"] == TEST_DATE].shape[0]
    validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
    data = data.drop("stat_date", axis=1)

    # Features whose value is 0 are dropped by the ffm format; after the
    # remapping below no hour/minute feature has the value 0.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    print(data.head(2))

    # Persist the candidate cids. Select the column as a Series:
    # a DataFrame (data[["cid"]]) has no unique() method.
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df['cid'] = data_set_cid
    print("data_set_cid :")
    print(cid_df.head(2))
    cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)
    return data, test_number, validation_number
def ffm_transform(data, test_number, validation_number):
    """Encode ``data`` in ffm format and write the test/validation/train files.

    The fitted FFMFormatPandas encoder is pickled to
    ``ffm_{DATA_START_DATE}_{DATA_END_DATE}.pkl``. The encoded rows are
    written to ``data{DATA_START_DATE}-{DATA_END_DATE}.csv``, read back, and
    split by position into three files under DIRECTORY_PATH.

    Args:
        data: feature frame with a "y" label column; rows dated TEST_DATE
            must come first, followed by the rows dated VALIDATION_DATE.
        test_number: number of leading rows that form the test set.
        validation_number: number of rows after the test rows that form the
            validation set.
    """
    print("Start ffm transform")
    start = time.time()
    ffm_train = FFMFormatPandas()
    data = ffm_train.fit_transform(data, y='y')
    with open(DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE), "wb") as f:
        pickle.dump(ffm_train, f)
    print("done transform ffm")
    end = time.time()
    print("ffm转化数据耗时:")
    print(end - start)

    full_path = DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE)
    # The file is read back with header=None, so make sure no header line is
    # written regardless of the pandas default for this object type/version.
    data.to_csv(full_path, index=False, header=False)
    data = pd.read_csv(full_path, header=None)
    print("数据集大小")
    print(data.shape)
    print(data.head(2))

    # Positional, end-exclusive slices. Label-based .loc[:test_number] is
    # end-inclusive and would put test_number + 1 rows into the test set,
    # shifting every later boundary by one row.
    validation_end = test_number + validation_number

    test = data.iloc[:test_number]
    print("测试集大小")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)

    # NOTE: the test date must be later than the validation date, otherwise
    # this positional split cuts the data at the wrong places.
    validation = data.iloc[test_number:validation_end]
    print("验证集大小")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)

    train = data.iloc[validation_end:]
    print("训练集大小")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
if __name__ == "__main__":
    # feature_en() returns a (data, test_number, validation_number) tuple, so
    # data_fe holds that whole tuple, not only the feature frame.
    data_fe = feature_en()
    # ffm_transform(data_fe)
This diff is collapsed.
Click to expand it.
userDiaryPredict.py
View file @
83a7534f
from
config
import
*
import
pandas
as
pd
import
pickle
import
xlearn
as
xl
import
datetime
from
userProfile
import
fetch_user_profile
# 接收device_id、city_id
# 将device_id、city_id拼接到对应的城市热门日记表
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
def device_id_merge(user_profile):
    """Build the prediction frame for one user from their city's candidates.

    Reads ``diaryTestSet/{city_id}DiaryTop2000.csv`` under DIRECTORY_PATH and
    adds, in this order, the columns "device_id", "hour", "minute" and "y",
    then drops "city_id". The column order is expected to match the training
    set.

    Args:
        user_profile: mapping-like object providing 'city_id' and 'device_id'.

    Returns:
        The candidate frame extended with the user and time features.
    """
    candidates_csv = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
    data = pd.read_csv(candidates_csv)
    data["device_id"] = user_profile['device_id']

    # Current wall-clock time, with 0 remapped the same way as in training
    # (hour 0 -> 24, minute 0 -> 60).
    now = datetime.datetime.now()
    current_hour = now.hour if now.hour != 0 else 24
    current_minute = now.minute if now.minute != 0 else 60
    data["hour"] = current_hour
    data["minute"] = current_minute
    for time_column in ("hour", "minute"):
        data[time_column] = data[time_column].astype("category")

    # Placeholder label for the rows to be predicted.
    data["y"] = 0
    data = data.drop(columns=["city_id"])
    print(data.head(2))
    return data
# Load the pickled ffm encoder and convert the merged frame to ffm format.
def _load_ffm_encoder():
    """Unpickle the FFMFormatPandas encoder saved for the configured date range."""
    file_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
    # NOTE(review): pickle.load executes code from the file; only load
    # encoder files produced by this project's own training run.
    with open(file_path, "rb") as f:
        return pickle.load(f)


def transform_ffm_format(ffm_format_pandas, df, device_id):
    """Encode ``df`` in ffm format and write it to a per-device file.

    Args:
        ffm_format_pandas: fitted encoder exposing ``transform(df)``.
        df: frame produced by ``device_id_merge``.
        device_id: used, together with the current minute, in the file name.

    Returns:
        Path of the file written under ``DIRECTORY_PATH + "diaryPredictSet/"``.
    """
    data = ffm_format_pandas.transform(df)
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    predict_file_name = DIRECTORY_PATH + "diaryPredictSet/{0}_{1}DiaryTop2000.csv".format(device_id, now)
    # Write only the encoded rows (no index column, no header line), the same
    # way the test/validation/train files are written in processData.py.
    data.to_csv(predict_file_name, index=False, header=False)
    # Return the file that was actually written instead of an empty string,
    # so the caller can hand it to the model.
    return predict_file_name


# Load the model and predict. Sorting the predicted probabilities in
# descending order and storing them in a table is not implemented yet.
def predict(user_profile, ffm_format_pandas=None):
    """Score the candidate diaries of one user with the trained FFM model.

    Args:
        user_profile: mapping-like object providing 'device_id' and 'city_id'.
        ffm_format_pandas: fitted encoder; loaded from disk when omitted.

    The model output is written to ``./{device_id}_output.txt``.
    """
    device_id = user_profile['device_id']
    if ffm_format_pandas is None:
        ffm_format_pandas = _load_ffm_encoder()

    # Candidate frame -> ffm-encoded file -> model input.
    instances = device_id_merge(user_profile)
    user_instance_file_path = transform_ffm_format(ffm_format_pandas, instances, device_id)

    ffm_model = xl.create_ffm()
    ffm_model.setTest(user_instance_file_path)
    ffm_model.predict(DIRECTORY_PATH + MODEL_VERSION,
                      "./{0}_output.txt".format(device_id))


def router(device_id):
    """Look up the profile for ``device_id`` and run prediction when one exists."""
    user_profile, is_exist = fetch_user_profile(device_id)
    ffm_format_pandas = _load_ffm_encoder()
    if is_exist:
        predict(user_profile, ffm_format_pandas)
    else:
        # TODO: decide what to do for devices without a profile.
        pass
if __name__ == "__main__":
    router(device_id='358035085192742')
    # Predict for some real device_ids.
This diff is collapsed.
Click to expand it.
userProfile.py
0 → 100644
View file @
83a7534f
from
utils
import
con_sql
def fetch_user_profile(device_id):
    """Fetch a (device_id, city_id) profile row and report whether one exists.

    Args:
        device_id: the device to look up.

    Returns:
        (user_profile, is_exist): the ``con_sql`` result and True when it
        contains at least one row.
    """
    # TODO: a device_id in the SQL statement may map to several city_ids.
    # NOTE(review): the query below does not filter on device_id yet, so it
    # returns an arbitrary row of data_feed_click; confirm the intended
    # WHERE clause before relying on the result.
    sql = "select device_id,city_id from data_feed_click limit 1"
    user_profile = con_sql(sql)
    # .empty is True when NO row came back, so existence is its negation.
    is_exist = not user_profile.empty
    return user_profile, is_exist
This diff is collapsed.
Click to expand it.
utils.py
View file @
83a7534f
...
...
@@ -71,3 +71,9 @@ class FFMFormatPandas:
def transform(self, df):
    """Encode every row of ``df`` with ``self.transform_row_``.

    Each row is passed to ``transform_row_`` together with a dict mapping
    column name to dtype. Results are collected in a dict keyed by the row's
    index label, so rows sharing a label collapse to the last one.

    Returns:
        A pandas Series of the per-row results.
    """
    column_dtypes = df.dtypes.to_dict()
    encoded_rows = {}
    for label, record in df.iterrows():
        encoded_rows[label] = self.transform_row_(record, column_dtypes)
    return pd.Series(encoded_rows)
def is_feature_index_exist(self, name):
    """Return True when ``name`` is contained in ``self.feature_index_``, else False."""
    return name in self.feature_index_
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment