Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
df7a805c
Commit
df7a805c
authored
6 years ago
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
change ffm argument for test
parent
1caba335
master
gyz
mr/beta/bug22
offic
rtt
test
updatedb
wzw
zhao
zhao22
No related merge requests found
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
22 additions
and
133 deletions
+22
-133
diaryCandidateSet.py
diaryCandidateSet.py
+2
-2
testCases.py
local/testCases.py
+8
-32
utils.py
local/utils.py
+0
-90
processData.py
processData.py
+1
-1
train.py
train.py
+9
-6
utils.py
utils.py
+2
-2
No files found.
diaryCandidateSet.py
View file @
df7a805c
...
...
@@ -76,7 +76,7 @@ def pool_method(city,sql,allCitiesTop3000):
# 多线程方法获取全国城市热门日记
def
multi_get_eachCityDiaryTop3000
(
processes
):
def
multi_get_eachCityDiaryTop3000
(
processes
=
8
):
city_list
=
get_cityList
()
allCitiesTop3000
=
get_allCitiesDiaryTop3000
()
pool
=
Pool
(
processes
)
...
...
@@ -93,7 +93,7 @@ def multi_get_eachCityDiaryTop3000(processes):
if
__name__
==
"__main__"
:
start
=
time
.
time
()
multi_get_eachCityDiaryTop3000
(
6
)
multi_get_eachCityDiaryTop3000
()
end
=
time
.
time
()
print
(
"获取各城市热门日记耗时{}分"
.
format
((
end
-
start
)
/
60
))
This diff is collapsed.
Click to expand it.
local/testCases.py
View file @
df7a805c
from
utils
import
*
import
datetime
import
pickle
import
time
def split_cityList(cityList, n):
    """Split ``cityList`` into consecutive slices of about ``len(cityList) / n`` items.

    The slice length is ``len(cityList) / n`` rounded to the nearest integer
    (ties go to the even integer, the same rule ``np.rint`` applies), so the
    number of slices returned is not always exactly ``n``: 22 items with
    ``n=5`` gives a slice length of 4 and therefore 6 slices.

    Args:
        cityList: Sliceable sequence to split.
        n: Divisor used to derive the slice length.

    Returns:
        ``cityList`` itself, unsplit, when it has at most ``n`` items (kept
        for backward compatibility); otherwise a list of slices that
        together cover ``cityList`` in order.

    Raises:
        ValueError: If the sequence has more than ``n`` items and ``n`` is
            not positive.
    """
    total = len(cityList)
    if total <= n:
        return cityList
    if n <= 0:
        # A non-positive divisor yields a zero/negative step; the previous
        # manual while-loop never terminated for negative values.
        raise ValueError("n must be a positive number, got {!r}".format(n))

    # total > n > 0 guarantees total / n > 1, hence step >= 1.
    step = int(round(total / n))
    return [cityList[start:start + step] for start in range(0, total, step)]
if
__name__
==
'__main__'
:
l
=
list
(
range
(
22
))
a
=
split_cityList
(
l
,
5
)
print
(
a
)
df
=
pd
.
read_csv
(
"/Users/mac/PycharmProjects/nvwa/ffm-baseline/data/test-data/大数据.csv"
)
for
i
in
range
(
500
,
10000
,
500
):
# data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
# data["y"] = 1
# test_data = data.tail(1)
#
# ffm = FFMFormatPandas()
# data = ffm.fit_transform(data, y='y')
# data.to_csv("../data/ffm_data.csv", index=False)
#
# with open("../data/ffm.object", "wb") as f:
# pickle.dump(ffm, f)
# with open("../data/ffm.object", "rb") as f:
# ffm = pickle.load(f)
# result = ffm.transform(test_data)
# print(result)
# data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
# print(data_1)
start
=
time
.
time
()
ffm
=
multiFFMFormatPandas
()
data
=
ffm
.
fit_transform
(
df
,
y
=
"y"
,
n
=
i
,
processes
=
3
)
end
=
time
.
time
()
print
(
"分割单位{}耗时{}"
.
format
(
i
,
end
-
start
))
This diff is collapsed.
Click to expand it.
local/utils.py
deleted
100644 → 0
View file @
1caba335
# encoding = "utf-8"
import
pymysql
import
pandas
as
pd
import
numpy
as
np
import
redis
# Fetch data from a TiDB table and convert it to a DataFrame.
def con_sql(sql):
    """Run ``sql`` against the TiDB instance and return the rows as a DataFrame.

    Args:
        sql: SQL statement passed unchanged to ``cursor.execute``. It is not
            parameterized here, so callers must not build it from untrusted
            input.

    Returns:
        A ``pandas.DataFrame`` built from all fetched rows, with every row
        that contains a missing value dropped. Columns are positional
        (0, 1, ...) because only the row tuples are passed to the DataFrame.
    """
    # NOTE(review): connection credentials are hard-coded in source; they
    # should be moved to configuration / a secret store.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
    finally:
        # Close the connection even when the query fails, so a failing
        # statement does not leak the connection.
        db.close()
    return pd.DataFrame(list(result)).dropna()
# Write data to redis.
# TODO: the production redis address has not been provided; the address below
# is the test environment's and must be changed to the production address.
def add_data_to_redis(key, val):
    """Store ``val`` under ``key`` in redis with a 36-hour expiry.

    Args:
        key: Redis key to write.
        val: Value to store under ``key``.
    """
    r = redis.StrictRedis(host='10.30.50.58', port=6379, db=12)
    # Set the value and its 36-hour TTL in a single command, so a failure
    # between a separate SET and EXPIRE can never leave a key without expiry.
    r.set(key, val, ex=36 * 60 * 60)
# FFM format conversion class.
class FFMFormatPandas:
    """Convert a pandas DataFrame into libffm-style text lines.

    Each output line is ``"<label> <field>:<feature>:<value> ..."``.
    Object-dtype columns are emitted one-hot (``field:feature:1``, one feature
    per distinct ``column_value``); integer-dtype columns are emitted as
    ``field:feature:value`` with one feature per column. Columns of any other
    dtype kind (e.g. float) produce no output.
    """

    def __init__(self):
        # column name -> field index (label column excluded)
        self.field_index_ = None
        # "<column>_<value>" and "<column>" -> feature index
        self.feature_index_ = None
        # name of the label column, or None when there is no label
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature index maps from ``df``.

        Args:
            df: DataFrame to index.
            y: Name of the label column, or ``None``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        else:
            # Continue numbering after the highest index already handed out;
            # starting at max() itself would give the first new feature the
            # same index as an existing one.
            last_idx = max(self.feature_index_.values(), default=-1) + 1

        # Every column is indexed here, including the label column, which
        # keeps the numbering identical to earlier versions of this class.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Keep a column's own feature index stable across repeated fits,
            # so data transformed earlier stays consistent.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its FFM-formatted rows (see ``transform``)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Format a single row as one FFM text line.

        Args:
            row: Series for one DataFrame row, indexed by column name.
            t: Mapping of column name -> dtype for the source DataFrame.

        Returns:
            The space-joined FFM line; the label is ``"0"`` when no label
            column was configured.

        Raises:
            KeyError: If an object-dtype value was never seen by ``fit``.
        """
        # Label lookup is by column name; positional ``[0]`` on a
        # label-indexed Series is deprecated in pandas.
        label = str(row[self.y]) if self.y is not None else str(0)
        ffm = [label]

        for col, val in row.loc[row.index != self.y].to_dict().items():
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series mapping each index label of ``df`` to its FFM line."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    # This method is not part of the original class; it was added to check
    # whether a user exists in the training data set.
    def is_feature_index_exist(self, name):
        """Return True if ``name`` is a known feature key, else False.

        Also returns False when the instance has not been fitted yet.
        """
        return self.feature_index_ is not None and name in self.feature_index_
This diff is collapsed.
Click to expand it.
processData.py
View file @
df7a805c
...
...
@@ -75,7 +75,7 @@ def ffm_transform(data, test_number, validation_number):
print
(
"Start ffm transform"
)
start
=
time
.
time
()
ffm_train
=
multiFFMFormatPandas
()
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
200000
,
processes
=
6
)
data
=
ffm_train
.
fit_transform
(
data
,
y
=
'y'
,
n
=
200000
,
processes
=
8
)
with
open
(
DIRECTORY_PATH
+
"ffm.pkl"
,
"wb"
)
as
f
:
pickle
.
dump
(
ffm_train
,
f
)
...
...
This diff is collapsed.
Click to expand it.
train.py
View file @
df7a805c
from
processData
import
*
from
diaryTraining
import
*
from
diaryCandidateSet
import
get_eachCityDiaryTop3000
from
diaryCandidateSet
import
multi_
get_eachCityDiaryTop3000
from
utils
import
get_date
...
...
@@ -10,17 +10,20 @@ if __name__ == "__main__":
# while True:
# now = datetime.now()
# if (now.hour == 23) and (now.minute == 30):
start
=
time
.
time
()
start
_train
=
time
.
time
()
data_start_date
,
data_end_date
,
validation_date
,
test_date
=
get_date
()
data
,
test_number
,
validation_number
=
feature_en
(
data_start_date
,
data_end_date
,
validation_date
,
test_date
)
ffm_transform
(
data
,
test_number
,
validation_number
)
train
()
end_train
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end_train
-
start_train
)
/
60
))
print
(
'---------------prepare candidates--------------'
)
start
=
time
.
time
()
multi_get_eachCityDiaryTop3000
()
end
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end
-
start
)
/
60
))
# print('---------------prepare candidates--------------')
# get_eachCityDiaryTop3000()
# print("end")
print
(
"获取各城市热门日记耗时{}分"
.
format
((
end
-
start
)
/
60
))
print
(
"end"
)
...
...
This diff is collapsed.
Click to expand it.
utils.py
View file @
df7a805c
...
...
@@ -89,7 +89,7 @@ class multiFFMFormatPandas:
return
self
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
1000000
,
processes
=
6
):
def
fit_transform
(
self
,
df
,
y
=
None
,
n
=
200000
,
processes
=
8
):
# n是每个线程运行最大的数据条数,processes是线程数
self
.
fit
(
df
,
y
)
n
=
n
...
...
@@ -112,7 +112,7 @@ class multiFFMFormatPandas:
ffm
.
append
(
'{}:{}:{}'
.
format
(
self
.
field_index_
[
col
],
self
.
feature_index_
[
col
],
val
))
return
' '
.
join
(
ffm
)
def
transform
(
self
,
df
,
n
=
1
00
00
,
processes
=
2
):
def
transform
(
self
,
df
,
n
=
1
5
00
,
processes
=
2
):
# n是每个线程运行最大的数据条数,processes是线程数
t
=
df
.
dtypes
.
to_dict
()
data_list
=
self
.
data_split_line
(
df
,
n
)
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment