Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
32b39b12
Commit
32b39b12
authored
Sep 01, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改了配置文件里的文件路径
parent
f0746c05
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
6 additions
and
123 deletions
+6
-123
config.py
config.py
+2
-19
dataProcess.py
dataProcess.py
+0
-0
diaryUpdateOnlineOffline.py
diaryUpdateOnlineOffline.py
+0
-3
train.py
train.py
+3
-100
utils.py
utils.py
+1
-1
No files found.
config.py
View file @
32b39b12
DIRECTORY_PATH
=
'/data2/models/'
DIRECTORY_PATH
=
'/data/models/'
# 测试日期一定要大于验证日期,因为切割数据集的代码是这样设置的
# VALIDATION_DATE = '2018-08-05'
# TEST_DATE = '2018-08-06'
...
...
@@ -27,21 +28,3 @@ QUEUE_ONLINE_HOST = 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'
LOCAL_HOST
=
'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com'
# #线下预测文件
# "/Users/mac/utils/result/{0}.csv".format(queue_name)
# # 线下模型、预测产出文件
# "/Users/mac/utils/model.out",
# "/Users/mac/utils/result/{0}_output.txt".format(queue_name)
#
# # 线下日记队列
# host='rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', port=3306, user='work',
# passwd='workwork', db='doris_test'
# select native_queue from device_diary_queue where device_id = '{}' and city_id = '{}';".for
# update device_diary_queue set {}='{}' where device_id = '{}' and city_id = '{}'".format\
# (queue_name,id_str,device_id, city_id)
#
# # 线下日记打分表
# host='rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', port=3306, user='work',
# passwd='workwork', db='zhengxing_test'
# "select score,diary_id from biz_feed_diary_score where diary_id in {};".format(diary_list)
processData
.py
→
dataProcess
.py
View file @
32b39b12
File moved
diaryUpdateOnlineOffline.py
View file @
32b39b12
...
...
@@ -34,9 +34,6 @@ def get_video_id(cache_video_id):
finally
:
db
.
close
()
print
(
"videio_id 预览"
)
print
(
df
.
head
(
1
))
if
df
.
empty
:
return
cache_video_id
else
:
...
...
train.py
View file @
32b39b12
# from processData
import *
from
dataProcess
import
*
from
diaryTraining
import
*
import
time
from
prepareData
import
fetch_data
from
utils
import
*
import
pandas
as
pd
from
config
import
*
import
pickle
def feature_en(data_start_date, data_end_date, validation_date, test_date):
    """Build the labeled training dataset from exposure and click logs.

    Fetches exposure/click tables for [data_start_date, data_end_date],
    removes clicked rows from the exposure table (negatives), labels the
    two tables, merges and sorts them by date, and persists the candidate
    `cid` and `device_id` sets for later prediction-time filtering.

    Parameters
    ----------
    data_start_date, data_end_date : date range passed to fetch_data.
    validation_date, test_date : stat_date values used only to COUNT how
        many rows belong to the validation/test splits (the data itself
        is returned sorted by stat_date descending, newest first, so the
        caller slices by these counts). NOTE(review): this assumes
        test_date > validation_date, per the comment in config.py.

    Returns
    -------
    (data, test_number, validation_number) : the merged DataFrame with a
        binary "y" label column (stat_date dropped), plus the row counts
        of the test and validation days.
    """
    exposure, click, click_device_id = fetch_data(data_start_date, data_end_date)

    # Set difference: exposure minus click.
    print("曝光表处理前的样本个数")
    print(exposure.shape)
    # click is appended TWICE on purpose: after drop_duplicates(keep=False)
    # any row present in click appears 2 or 3 times and is removed entirely,
    # including click rows that never appeared in exposure. Appending only
    # once would leave click-only rows behind. Do not "simplify" this.
    exposure = exposure.append(click)
    exposure = exposure.append(click)
    subset = click.columns.tolist()
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("差集后曝光表个数")
    print(exposure.shape)
    # Keep only exposures from devices that clicked at least once.
    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
    print("去除未点击用户后曝光表个数")
    print(exposure.shape)

    # Label: clicks are positives, remaining exposures are negatives.
    click["y"] = 1
    exposure["y"] = 0
    print("正样本个数")
    print(click.shape[0])
    print("负样本个数")
    print(exposure.shape[0])

    # Merge positives and negatives, newest stat_date first so the
    # test/validation/train splits can be taken as leading slices.
    data = click.append(exposure)
    print("点击表和曝光表合并成功")
    data = data.sort_values(by="stat_date", ascending=False)
    test_number = data[data["stat_date"] == test_date].shape[0]
    validation_number = data[data["stat_date"] == validation_date].shape[0]
    data = data.drop("stat_date", axis=1)

    # FFM format drops zero-valued features, so remap 0 -> 24/60 to keep
    # midnight hours and on-the-hour minutes from disappearing.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")

    # Persist the candidate cid set; used to filter prediction candidates.
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df['cid'] = data_set_cid
    cid_df.to_csv(DIRECTORY_PATH + "train/data_set_cid.csv", index=False)
    print("成功保存data_set_cid")

    # Persist device_ids seen in training; at prediction time a device_id
    # outside this set does not need to be scored.
    data_set_device_id = data["device_id"].unique()
    device_id_df = pd.DataFrame()
    device_id_df['device_id'] = data_set_device_id
    device_id_df.to_csv(DIRECTORY_PATH + "train/data_set_device_id.csv", index=False)
    print("成功保存data_set_device_id")

    return data, test_number, validation_number
def ffm_transform(data, test_number, validation_number):
    """Convert the labeled dataset to FFM format and split it on disk.

    Fits a multiFFMFormatPandas transformer on `data` (label column "y"),
    pickles the fitted transformer for reuse at prediction time, then
    writes the transformed data and its test/validation/train slices as
    CSVs under DIRECTORY_PATH.

    Parameters
    ----------
    data : labeled DataFrame from feature_en, sorted newest-first, so the
        first `test_number` rows are the test day and the next
        `validation_number` rows are the validation day.
    test_number, validation_number : row counts of those two leading slices.

    Side effects only (returns None); outputs ffm.pkl plus
    total/test/validation/train *_ffm_data.csv files.
    """
    print("Start ffm transform")
    start = time.time()
    ffm_train = multiFFMFormatPandas()
    # processes=4 is the stable choice; 6 is faster when the server has
    # free memory but can be OOM-killed when other jobs are running.
    data = ffm_train.fit_transform(data, y='y', n=50000, processes=4)
    # Persist the fitted transformer so prediction uses the same field/
    # feature index mapping.
    with open(DIRECTORY_PATH + "train/ffm.pkl", "wb") as f:
        pickle.dump(ffm_train, f)
    print("done transform ffm")
    end = time.time()
    print("ffm转化数据耗时(分):")
    print((end - start) / 60)

    # Round-trip through CSV with header=None on read: the written header
    # row is re-read as a data row and positions shift accordingly.
    # NOTE(review): presumably intentional (downstream slicing appears to
    # rely on these exact offsets) — confirm before changing.
    data.to_csv(DIRECTORY_PATH + "total_ffm_data.csv", index=False)
    data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
    print("数据集大小")
    print(data.shape)

    # Leading slice = test day (data is sorted newest-first; .loc slicing
    # is inclusive of both endpoints).
    test = data.loc[:test_number]
    print("测试集大小")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test_ffm_data.csv", index=False, header=None)
    # The test date must be later than the validation date, otherwise this
    # positional split is wrong.
    validation = data.loc[(test_number + 1):(test_number + validation_number)]
    print("验证集大小")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation_ffm_data.csv", index=False, header=None)
    # Everything after test + validation is the training set.
    train = data.loc[(test_number + validation_number + 1):]
    print("训练集大小")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train_ffm_data.csv", index=False, header=None)
# 把数据获取、特征转换、模型训练的模型串联在一起
...
...
@@ -114,9 +17,9 @@ if __name__ == "__main__":
train
()
end_train
=
time
.
time
()
print
(
"训练模型耗时{}分"
.
format
((
end_train
-
start_train
)
/
60
))
move_file
()
#
move_file()
#TODO 如果用自己写的keepProcess文件守护进程,下面在这个函数里删掉重新启动进程那行代码,因为可能会多启动一次进程
restart_process
()
#
restart_process()
...
...
utils.py
View file @
32b39b12
...
...
@@ -18,7 +18,7 @@ def get_date():
month
=
now
.
month
day
=
now
.
day
date
=
datetime
(
year
,
month
,
day
)
data_start_date
=
(
date
-
timedelta
(
days
=
31
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
data_start_date
=
(
date
-
timedelta
(
days
=
5
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
data_end_date
=
(
date
-
timedelta
(
days
=
1
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
validation_date
=
(
date
-
timedelta
(
days
=
2
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
# 验证集和测试集的日期必须相差一天,否则切割数据集时会报错
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment