Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
df7a805c
Commit
df7a805c
authored
Aug 16, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
change ffm argument for test
parent
1caba335
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
22 additions
and
133 deletions
+22
-133
diaryCandidateSet.py
diaryCandidateSet.py
+2
-2
testCases.py
local/testCases.py
+8
-32
utils.py
local/utils.py
+0
-90
processData.py
processData.py
+1
-1
train.py
train.py
+9
-6
utils.py
utils.py
+2
-2
No files found.
diaryCandidateSet.py
View file @
df7a805c
...
...
@@ -76,7 +76,7 @@ def pool_method(city,sql,allCitiesTop3000):
# 多线程方法获取全国城市热门日记
def
multi_get_eachCityDiaryTop3000
(
processes
):
def
multi_get_eachCityDiaryTop3000
(
processes
=
8
):
city_list
=
get_cityList
()
allCitiesTop3000
=
get_allCitiesDiaryTop3000
()
pool
=
Pool
(
processes
)
...
...
@@ -93,7 +93,7 @@ def multi_get_eachCityDiaryTop3000(processes):
if
__name__
==
"__main__"
:
start
=
time
.
time
()
multi_get_eachCityDiaryTop3000
(
6
)
multi_get_eachCityDiaryTop3000
()
end
=
time
.
time
()
print
(
"获取各城市热门日记耗时{}分"
.
format
((
end
-
start
)
/
60
))
local/testCases.py
View file @
df7a805c
from
utils
import
*
import
datetime
import
pickle
import
time
def split_cityList(cityList, n):
    """Split *cityList* into roughly *n* consecutive chunks.

    If the list holds at most ``n`` items it is returned unchanged
    (note: NOT wrapped in a list of chunks). Otherwise chunks of size
    ``round(len(cityList) / n)`` are produced, the final chunk absorbing
    whatever remains (it may be shorter or longer than the others).
    """
    total = len(cityList)
    if total <= n:
        # Too few items to split: hand the original list back as-is.
        return cityList

    chunk = int(np.rint(total / n))
    pieces = []
    offset = 0
    while offset + chunk < total:
        pieces.append(cityList[offset:offset + chunk])
        offset += chunk
    # Tail piece: everything from the last full chunk boundary onward.
    pieces.append(cityList[offset:])
    return pieces
if __name__ == '__main__':
    # Quick sanity check of split_cityList on a small range.
    l = list(range(22))
    a = split_cityList(l, 5)
    print(a)
    # Benchmark fit_transform over increasing per-worker chunk sizes.
    # NOTE(review): absolute path — this only runs on the original author's
    # machine; parameterize before reuse.
    df = pd.read_csv("/Users/mac/PycharmProjects/nvwa/ffm-baseline/data/test-data/大数据.csv")
    for i in range(500, 10000, 500):
        # data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
        # data["y"] = 1
        # test_data = data.tail(1)
        #
        # ffm = FFMFormatPandas()
        # data = ffm.fit_transform(data, y='y')
        # data.to_csv("../data/ffm_data.csv", index=False)
        #
        # with open("../data/ffm.object", "wb") as f:
        #     pickle.dump(ffm, f)
        # with open("../data/ffm.object", "rb") as f:
        #     ffm = pickle.load(f)
        # result = ffm.transform(test_data)
        # print(result)
        # data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
        # print(data_1)
        start = time.time()
        # multiFFMFormatPandas is expected to come from `utils` via the star
        # import at the top of this file — TODO confirm.
        ffm = multiFFMFormatPandas()
        # n: max rows handled per worker; processes: worker count — presumably,
        # per the comments in utils.py; verify there.
        data = ffm.fit_transform(df, y="y", n=i, processes=3)
        end = time.time()
        print("分割单位{}耗时{}".format(i, end - start))
local/utils.py
deleted
100644 → 0
View file @
1caba335
# encoding = "utf-8"
import
pymysql
import
pandas
as
pd
import
numpy
as
np
import
redis
# Fetch rows from a TiDB table for the given SQL and return them as a DataFrame.
def con_sql(sql):
    """Run *sql* against the jerry_test TiDB instance and return a DataFrame.

    Rows containing any NaN are dropped via ``dropna()``. The connection is
    always closed, even if the query raises (the original leaked it on error).

    :param sql: SQL query string to execute.
    :return: ``pd.DataFrame`` built from the fetched rows, NaN rows removed.
    """
    # NOTE(review): credentials are hard-coded in source; move them to
    # configuration / environment variables.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result)).dropna()
    finally:
        # Close even when execute/fetch raises — prevents connection leaks.
        db.close()
    return df
# Write a key/value pair to redis with a 36-hour TTL.
# TODO the production redis address has not been provided; the address below
# is the test environment and must be changed for production.
def add_data_to_redis(key, val):
    """Store *val* under *key* in redis, expiring after 36 hours.

    :param key: redis key.
    :param val: value to store (anything redis-py can serialize).
    """
    r = redis.StrictRedis(host='10.30.50.58', port=6379, db=12)
    # Set the value and TTL in one atomic SET (ex=seconds) instead of a
    # separate EXPIRE call: if the process died between SET and EXPIRE the
    # key would have lived forever.
    r.set(key, val, ex=36 * 60 * 60)
# FFM format conversion functions/classes (single-process version).
class FFMFormatPandas:
    """Convert a pandas DataFrame into libffm text format.

    Each encoded row looks like ``label field:feature:value ...``. Fields are
    the DataFrame columns (excluding the label); features are '{col}_{val}'
    pairs for object-dtype columns, or the bare column for integer columns.
    """

    def __init__(self):
        # column name -> field index (built in fit()).
        self.field_index_ = None
        # '{col}_{val}' (categorical) or column name (numeric) -> feature index.
        self.feature_index_ = None
        # Name of the label column, set by fit().
        self.y = None

    def fit(self, df, y=None):
        """Learn field and feature index mappings from *df*.

        :param df: training DataFrame, including the label column.
        :param y: name of the label column; excluded from the field index.
            NOTE(review): the feature loop below walks *all* of df.columns,
            including y — presumably intentional, confirm before refactoring.
        :return: self (for chaining).
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        if self.feature_index_ is not None:
            # Resume numbering from the highest existing feature index.
            # NOTE(review): this resumes at max(), not max()+1, so the first
            # new feature reuses an existing index — verify on refit.
            last_idx = max(list(self.feature_index_.values()))
        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue  # NaN never becomes a feature.
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Also reserve one index for the bare column name; used by
            # transform_row_ for integer-dtype columns.
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on *df*, then return its FFM-encoded rows as a Series."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode one DataFrame row as an FFM string.

        :param row: a Series as yielded by ``df.iterrows()``.
        :param t: dict mapping column name -> numpy dtype (from df.dtypes).
        :return: space-joined string ``label field:feature:value ...``.
        """
        ffm = []
        if self.y is not None:
            # Label first; [0] takes the single matching value.
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            # No label column known: emit a dummy 0 label.
            ffm.append(str(0))
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                # Categorical (object dtype): one-hot, so the value is 1.
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                # Integer dtype: emit the raw value against the bare-column
                # feature index. Other dtype kinds (e.g. float) are skipped.
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of FFM-encoded strings, indexed like *df*."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    # The method below is not part of the original class; it was added to
    # check whether a given user/feature existed in the training data.
    def is_feature_index_exist(self, name):
        """Return True if *name* was recorded as a feature during fit()."""
        if name in self.feature_index_:
            return True
        else:
            return False
processData.py
View file @
df7a805c
...
...
@@ -75,7 +75,7 @@ def ffm_transform(data, test_number, validation_number):
print
(
"Start ffm transform"
)
start
=
time
.
time
()
ffm_train
=
multiFFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
200000
,
processes
=
6
)
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
200000
,
processes
=
8
)
with
open
(
DIRECTORY_PATH
+
"ffm.pkl"
,
"wb"
)
as
f
:
pickle
.
dump
(
ffm_train
,
f
)
...
...
train.py
View file @
df7a805c
from
processData
import
*
from
diaryTraining
import
*
from
diaryCandidateSet
import
get_eachCityDiaryTop3000
from
diaryCandidateSet
import
multi_
get_eachCityDiaryTop3000
from
utils
import
get_date
...
...
@@ -10,17 +10,20 @@ if __name__ == "__main__":
# while True:
# now = datetime.now()
# if (now.hour == 23) and (now.minute == 30):
start
=
time
.
time
()
start
_train
=
time
.
time
()
data_start_date
,
data_end_date
,
validation_date
,
test_date
=
get_date
()
data
,
test_number
,
validation_number
=
feature_en
(
data_start_date
,
data_end_date
,
validation_date
,
test_date
)
ffm_transform
(
data
,
test_number
,
validation_number
)
train
()
end_train
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end_train
-
start_train
)
/
60
))
print
(
'---------------prepare candidates--------------'
)
start
=
time
.
time
()
multi_get_eachCityDiaryTop3000
()
end
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end
-
start
)
/
60
))
# print('---------------prepare candidates--------------')
# get_eachCityDiaryTop3000()
# print("end")
print
(
"获取各城市热门日记耗时{}分"
.
format
((
end
-
start
)
/
60
))
print
(
"end"
)
...
...
utils.py
View file @
df7a805c
...
...
@@ -89,7 +89,7 @@ class multiFFMFormatPandas:
return
self
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
1000000
,
processes
=
6
):
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
200000
,
processes
=
8
):
# n是每个线程运行最大的数据条数,processes是线程数
self
.
fit
(
df
,
y
)
n
=
n
...
...
@@ -112,7 +112,7 @@ class multiFFMFormatPandas:
ffm
.
append
(
'{}:{}:{}'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
col
],
val
))
return
' '
.
join
(
ffm
)
def
transform
(
self
,
df
,
n
=
1
00
00
,
processes
=
2
):
def
transform
(
self
,
df
,
n
=
1
5
00
,
processes
=
2
):
# n是每个线程运行最大的数据条数,processes是线程数
t
=
df
.
dtypes
.
to_dict
()
data_list
=
self
.
data_split_line
(
df
,
n
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment