Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
3ac71436
Commit
3ac71436
authored
Aug 08, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
解决过滤前样本是0的bug
parent
c9ffad87
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
35 additions
and
13 deletions
+35
-13
diaryCandidateSet.py
diaryCandidateSet.py
+4
-3
processData.py
processData.py
+1
-1
userDiaryPredict.py
userDiaryPredict.py
+29
-7
userProfile.py
userProfile.py
+1
-2
No files found.
diaryCandidateSet.py
View file @
3ac71436
...
...
def filter_cid(df):
    """Restrict *df* to rows whose ``cid`` is in the data_set_cid.csv whitelist.

    An empty *df* is returned as-is (only its shape is logged), so the
    filtering step no longer misbehaves when the upstream sample is empty.
    """
    whitelist = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
    print("过滤前样本大小:")
    print(df.shape)
    if not df.empty:
        df = df.loc[df["cid"].isin(whitelist)]
        print("过滤后样本大小:")
        print(df.shape)
    return df
...
...
processData.py
View file @
3ac71436
...
...
@@ -101,6 +101,6 @@ def ffm_transform(data, test_number, validation_number):
if __name__ == "__main__":
    # Build the engineered feature set, then convert it to FFM format.
    data_fe = feature_en()
    # NOTE(review): ffm_transform is declared as
    # ffm_transform(data, test_number, validation_number) but is invoked here
    # with a single argument — confirm the signature has defaults for the last
    # two parameters, otherwise this call raises TypeError at runtime.
    ffm_transform(data_fe)
userDiaryPredict.py
View file @
3ac71436
...
...
@@ -10,18 +10,21 @@ from userProfile import fetch_user_profile
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
def
device_id_merge
(
user_profile
):
def
feature_en
(
user_profile
):
file_name
=
DIRECTORY_PATH
+
"diaryTestSet/{0}DiaryTop2000.csv"
.
format
(
user_profile
[
'city_id'
])
data
=
pd
.
read_csv
(
file_name
)
data
[
"device_id"
]
=
user_profile
[
'device_id'
]
now
=
datetime
.
datetime
.
now
()
data
[
"hour"
]
=
now
.
hour
data
[
"minute"
]
=
now
.
minute
data
.
loc
[
data
[
"hour"
]
==
0
,
[
"hour"
]]
=
24
data
.
loc
[
data
[
"minute"
]
==
0
,
[
"minute"
]]
=
60
data
[
"hour"
]
=
data
[
"hour"
]
.
astype
(
"category"
)
data
[
"minute"
]
=
data
[
"minute"
]
.
astype
(
"category"
)
data
[
"y"
]
=
0
data
=
data
.
drop
(
"city_id"
,
axis
=
1
)
print
(
data
.
head
(
2
))
...
...
@@ -37,7 +40,7 @@ def transform_ffm_format(df, device_id):
data
=
ffm_format_pandas
.
transform
(
df
)
now
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d-
%
H-
%
M"
)
predict_file_name
=
DIRECTORY_PATH
+
"
diaryPredictSe
t/{0}_{1}DiaryTop2000.csv"
.
format
(
device_id
,
now
)
predict_file_name
=
DIRECTORY_PATH
+
"
resul
t/{0}_{1}DiaryTop2000.csv"
.
format
(
device_id
,
now
)
data
.
to_csv
(
predict_file_name
)
user_instance_file_path
=
''
return
user_instance_file_path
...
...
@@ -45,15 +48,33 @@ def transform_ffm_format(df, device_id):
# 将模型加载,预测,把预测日记的概率值按照降序排序,存到一个表里
def predict(user_profile):
    """Score the candidate diaries for one user with the trained FFM model.

    Builds the per-user instance table, converts it to the FFM input file,
    runs the saved model over it, and hands the instances to
    ``upload_predict`` for post-processing.

    user_profile: mapping with at least a 'device_id' key (plus whatever
    ``feature_en`` reads, e.g. 'city_id').
    """
    user_instance = feature_en(user_profile)
    # BUG FIX: transform_ffm_format(df, device_id) takes two arguments; the
    # device_id was missing here, which raised TypeError on every call.
    user_instance_file_path = transform_ffm_format(user_instance,
                                                   user_profile['device_id'])
    ffm_model = xl.create_ffm()
    ffm_model.setTest(user_instance_file_path)
    # Model file is keyed by training window and hyper-parameters; output goes
    # to result/<device_id>_output.txt, which upload_predict reads back.
    ffm_model.predict(
        DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(
            DATA_START_DATE, DATA_END_DATE, lr, l2_lambda),
        DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id']))
    upload_predict(user_profile, user_instance)
def upload_predict(user_profile, instance):
    """Join the FFM probabilities back onto the scored instances.

    Reads result/<device_id>_output.txt (one probability per line, same row
    order as *instance*), attaches the diary ids, sorts by descending
    probability and forwards the result to ``wrapper_result``.
    """
    probabilities = pd.read_csv(
        DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id']),
        header=None)
    probabilities = probabilities.rename(columns={0: "prob"})
    # assumes instance['cid'] is aligned row-for-row with the model output —
    # TODO confirm transform_ffm_format preserves row order
    probabilities["cid"] = instance['cid']
    # BUG FIX: column 0 was renamed to "prob" above, so sorting by the string
    # "0" raised KeyError; sort by the renamed column instead.
    probabilities = probabilities.sort_values(by="prob", ascending=False)
    wrapper_result(probabilities, user_profile['device_id'])
def wrapper_result(prob, device_id):
    """Write the top-500 scored diaries as a feed file for one device.

    Each cid is assumed to look like "<prefix>|<diary_id>" — TODO confirm;
    the part after '|' is embedded in the diary-book URL.  Output goes to
    result/<device_id>_feed.
    """
    prob = prob.head(500)
    # BUG FIX: the original sliced the DataFrame (prob[x.index('|') + 1:])
    # instead of the cid string x, which raised at runtime; slice the string.
    prob["url"] = prob["cid"].apply(
        lambda x: "http://m.igengmei.com/diary_book/" + x[x.index('|') + 1:] + '/')
    prob.to_csv(DIRECTORY_PATH + "result/{}_feed".format(device_id))
def
router
(
device_id
):
...
...
@@ -61,8 +82,9 @@ def router(device_id):
if
is_exist
:
predict
(
user_profile
)
else
:
print
(
'Sorry, we don
\'
t have you'
)
print
(
'Sorry, we don
\'
t have you
.
'
)
if
__name__
==
"__main__"
:
...
...
userProfile.py
View file @
3ac71436
...
...
def fetch_user_profile(device_id):
    """Fetch the (device_id, city_id) profile row for *device_id*.

    Returns (user_profile, is_exist): the profile DataFrame and True when a
    profile row was actually found.
    """
    # TODO: a device_id in the SQL may correspond to several city_ids
    # NOTE(review): the query ignores the device_id argument entirely
    # ("limit 1" over the whole table) — presumably a placeholder; confirm.
    sql = "select device_id,city_id from data_feed_click limit 1"
    user_profile = con_sql(sql)
    # BUG FIX: the caller (router) treats the second value as "is_exist" and
    # only predicts when it is truthy, but DataFrame.empty is True when NO
    # row exists — the flag was inverted.  Return the negation.
    return user_profile, not user_profile.empty
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment