ML / ffm-baseline · Commits

Commit 83b62060
authored Aug 07, 2018 by 张彦钊
parent 400cb6a0

    test pickle
Showing 7 changed files with 72 additions and 43 deletions:

    .gitignore                           +3   -0
    __pycache__/utils.cpython-36.pyc     +0   -0   (deleted)
    diaryCandidateSet.py                 +35  -29
    diaryTraining.py                     +12  -7
    testCases.py → local/testCases.py    +20  -5
    processData.py                       +1   -1
    utils.py                             +1   -1
.gitignore  (new file, 0 → 100644)

+data/
+*.pyc
__pycache__/utils.cpython-36.pyc  (deleted, 100644 → 0)

File deleted
diaryCandidateSet.py

 import pymysql
 import pandas as pd
+
+db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
+                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
 from utils import *
 from config import *

-def get_allCitiesDiaryTop2000():
-    # 获取全国点击量TOP2000日记 (fetch the nationwide top 2000 diaries by clicks)
-    sql = "select city_id,cid where cid_type = 'diary' order by click_count_choice desc limit 2000"
-    allCitiesTop2000 = con_sql(sql)
-    allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
-    allCitiesTop2000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop2000.csv")
-    print("成功获取全国日记点击量TOP2000")
-    return allCitiesTop2000
-
-def get_cityList():
-    # 获取全国城市列表 (fetch the nationwide city list)
-    sql = "select distinct city_id from data_feed_click"
-    cityList = con_sql(sql)
-    cityList.to_csv(DIRECTORY_PATH + "diaryTestSet/cityList.csv")
-    cityList = cityList[0].values.tolist()
-    print("成功获取全国城市列表")
-    return cityList
+# 从数据库获取数据,并将数据转化成DataFrame (fetch data from the DB and convert it to a DataFrame)
+def get_data(sql):
+    cursor = db.cursor()
+    cursor.execute(sql)
+    data = cursor.fetchall()
+    data = pd.DataFrame(list(data)).dropna()
+    return data
+
+sql = "select city_id,cid where cid_type = 'diary' order by click_count_choice desc limit 2000"
+allCitiesTop2000 = get_data(sql)
+allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
+allCitiesTop2000.to_csv("\home\zhangyanzhao\diaryTestSet\allCitiesTop2000.csv")
+print("成功获取全国日记点击量TOP2000")
+
+sql = "select distinct city_id from data_feed_click"
+cityList = get_data(sql)
+cityList.to_csv("\home\zhangyanzhao\diaryTestSet\cityList.csv")
+cityList = cityList[0].values.tolist()
+print("成功获取城市列表")

 def get_eachCityDiaryTop2000():
     # 获取每个城市点击量TOP2000日记,如果数量小于2000,用全国点击量TOP2000日记补充 (per-city top 2000 diaries by clicks; if fewer than 2000, pad from the nationwide top 2000)
-    cityList = get_cityList()
-    allCitiesTop2000 = get_allCitiesDiaryTop2000()
     for i in cityList:
         sql = "select city_id,cid from data_feed_click " \
               "where cid_type = 'diary' and city_id = {0} " \
               "order by click_count_choice desc limit 2000".format(i)
-        data = con_sql(sql)
+        data = get_data(sql)
         data = data.rename(columns={0: "city_id", 1: "cid"})
         if data.shape[0] < 2000:
...
@@ -40,10 +42,14 @@ for i in cityList:
         else:
             pass
-        file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(i)
+        file_name = "\home\zhangyanzhao\diaryTestSet\{0}DiaryTop2000.csv".format(i)
         data.to_csv(file_name)
     print("end")

 if __name__ == "__main__":
     get_eachCityDiaryTop2000()
...
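The refactor above collapses the per-query helpers into a single get_data() that drives a shared pymysql connection. Two caveats are visible in the committed code: the first SELECT has no FROM clause, and the new output paths are "\home\..." string literals, in which Python interprets "\a" as an escape character (a bell), so the paths do not point where they appear to. Below is a minimal sketch of the per-city fetch under those observations, using a parameterized query and os.path.join; the helper name get_city_top2000 and OUT_DIR are illustrative, not part of the repo:

import os
import pandas as pd

# Hypothetical output directory built with os.path.join instead of
# backslash literals, so no accidental escape sequences are created.
OUT_DIR = os.path.join(os.sep, "home", "zhangyanzhao", "diaryTestSet")

def get_city_top2000(db, city_id):
    # The per-city query from the diff, with a %s placeholder instead of
    # str.format, so pymysql handles quoting/escaping of city_id.
    sql = ("select city_id, cid from data_feed_click "
           "where cid_type = 'diary' and city_id = %s "
           "order by click_count_choice desc limit 2000")
    cursor = db.cursor()
    cursor.execute(sql, (city_id,))
    data = pd.DataFrame(list(cursor.fetchall())).dropna()
    return data.rename(columns={0: "city_id", 1: "cid"})

# Usage, assuming the pymysql connection `db` from the diff:
#   top = get_city_top2000(db, 123)
#   top.to_csv(os.path.join(OUT_DIR, "{0}DiaryTop2000.csv".format(123)))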
diaryTraining.py

...
@@ -3,15 +3,20 @@ from config import *
 print("Start training")
 ffm_model = xl.create_ffm()
-ffm_model.setTrain(DIRECTORY_PATH + "train.csv")
-ffm_model.setValidate(DIRECTORY_PATH + "validation.csv")
+ffm_model.setTrain(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE))
+ffm_model.setValidate(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE))
+lr = 0.03
+l2_lambda = 0.002
-param = {'task': 'binary', 'lr': 0.03, 'lambda': 0.002, 'metric': 'auc'}
+param = {'task': 'binary', 'lr': lr, 'lambda': l2_lambda, 'metric': 'auc'}
-ffm_model.fit(param, DIRECTORY_PATH + "model.out")
+ffm_model.fit(param, DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE, DATA_END_DATE, lr, l2_lambda))
 print("predicting")
-ffm_model.setTest(DIRECTORY_PATH + "test.csv")
+ffm_model.setTest(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE))
 ffm_model.setSigmoid()
-ffm_model.predict(DIRECTORY_PATH + "model.out",
-                  DIRECTORY_PATH + "output.txt")
+ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE, DATA_END_DATE, "0.03", "0.002"),
+                  DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(TEST_DATE, DATA_START_DATE, DATA_END_DATE, "0.03", "0.002"))
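The training script now stamps the data range and hyperparameters into the artifact names, but the predict() call rebuilds the model name from the string literals "0.03" and "0.002" instead of reusing lr and l2_lambda, so the fit and predict paths can silently diverge if the values change. A short sketch of the same naming scheme with each path built once; it assumes the ffm_model and the config constants (DIRECTORY_PATH, DATA_START_DATE, DATA_END_DATE, TEST_DATE) defined above in the diff:

# Build each artifact path once and reuse it, so fit() and predict()
# always refer to the same file even when lr/l2_lambda change.
lr = 0.03
l2_lambda = 0.002
param = {'task': 'binary', 'lr': lr, 'lambda': l2_lambda, 'metric': 'auc'}

model_path = DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(
    DATA_START_DATE, DATA_END_DATE, lr, l2_lambda)
output_path = DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(
    TEST_DATE, DATA_START_DATE, DATA_END_DATE, lr, l2_lambda)

ffm_model.fit(param, model_path)
ffm_model.setTest(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE))
ffm_model.setSigmoid()
ffm_model.predict(model_path, output_path)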
testCases.py → local/testCases.py

 from utils import *
 import datetime
+import pickle

 if __name__ == '__main__':
-    data = pd.read_csv("data/raw-exposure.csv")[["cid", "device_id", "time"]]
+    data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
     data["y"] = 1
+    test_data = data.tail(5)
     ffm = FFMFormatPandas()
     data = ffm.fit_transform(data, y='y')
+    data.to_csv("ffm_data.csv", index=False)
+    with open("ffm.object", "wb") as f:
+        pickle.dump(ffm, f)
+    with open("ffm.object", "rb") as f:
+        ffm = pickle.load(f)
+    result = ffm.transform(test_data)
+    print(result)
+    data_1 = pd.read_csv("ffm_data.csv", header=None).tail(5)
+    print(data_1)
-    data["hour"] = data["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
-    #data.to_csv("data/data.csv")
-    print(data.head())
...
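The point of this commit ("test pickle") is verifying that a fitted FFMFormatPandas encoder survives a pickle round trip, so the field/feature indices learned at fit time can be reused verbatim later. FFMFormatPandas itself lives in utils.py and is not shown here, so the self-contained sketch below demonstrates the same round-trip pattern with a stand-in encoder:

import pickle

# Stand-in for the fitted transformer: any object whose learned state
# (here, a vocabulary dict) must come back identical after reloading.
class ToyEncoder:
    def fit(self, values):
        self.vocab = {v: i for i, v in enumerate(sorted(set(values)))}
        return self

    def transform(self, values):
        return [self.vocab[v] for v in values]

enc = ToyEncoder().fit(["a", "b", "c"])

with open("enc.object", "wb") as f:   # same dump/load pattern as the committed test
    pickle.dump(enc, f)
with open("enc.object", "rb") as f:
    enc2 = pickle.load(f)

assert enc2.transform(["b", "c"]) == enc.transform(["b", "c"])
print("pickle round trip preserved the fitted state")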
processData.py

...
@@ -68,7 +68,7 @@ test = data.loc[:test_number]
 print("测试集大小")
 print(test.shape[0])
 test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
+# 注意:测试集的日期一定要大于验证集,否则数据切割可能会出现错误 (note: the test set's dates must be later than the validation set's, or the split may go wrong)
 validation = data.loc[(test_number + 1):(test_number + validation_number)]
 print("验证集大小")
 print(validation.shape[0])
...
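The only change here is the added warning comment: the split takes the first test_number rows as the test set and the next validation_number rows as the validation set, which is only a time-ordered split if the data is sorted newest-first. A hedged sketch of that split with an explicit guard; the column name stat_date and the function are assumptions for illustration, not the committed processData.py logic:

import pandas as pd

# Illustrative chronological split in the spirit of the added comment:
# sort newest-first, take the newest rows as test and the next-newest as
# validation, then assert that no test date precedes a validation date.
def chrono_split(data: pd.DataFrame, test_number: int, validation_number: int):
    data = data.sort_values("stat_date", ascending=False).reset_index(drop=True)
    test = data.loc[:test_number]
    validation = data.loc[(test_number + 1):(test_number + validation_number)]
    assert test["stat_date"].min() >= validation["stat_date"].max(), \
        "test-set dates must not precede validation-set dates"
    return test, validation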
utils.py

...
@@ -54,7 +54,7 @@ class FFMFormatPandas:
     def transform_row_(self, row, t):
         ffm = []
-        if self.y != None:
+        if self.y is not None:
             ffm.append(str(row.loc[row.index == self.y][0]))
         if self.y is None:
             ffm.append(str(0))
...
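The one-line fix swaps "!= None" for "is not None". Here self.y is presumably a column name string, so the old test happened to work, but identity comparison is the PEP 8 idiom and the only safe one when the left side may be an object that overloads equality, as pandas objects do. A quick illustration:

import pandas as pd

s = pd.Series([1, 2, 3])

# Identity test: always a plain bool, regardless of the object's type.
print(s is not None)   # True

# Equality test: pandas overrides !=, so this returns an elementwise
# Series of booleans, and using it in an `if` raises
# "The truth value of a Series is ambiguous".
print(s != None)       # Series([True, True, True]), not a bool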