Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm_strategy_cvr
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm_strategy_cvr
Commits
c713565e
Commit
c713565e
authored
Jul 30, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
try predict tractate
parent
d9f5cd31
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
205 additions
and
43 deletions
+205
-43
tractate_fe.py
src/models/esmm/fe/tractate_fe.py
+93
-1
tractate_model.py
src/models/esmm/tractate_model.py
+50
-0
train_tractate.py
src/train_tractate.py
+62
-42
No files found.
src/models/esmm/fe/tractate_fe.py
View file @
c713565e
import
timeit
import
pandas
as
pd
from
utils.cache
import
redis_db_client
...
...
@@ -135,4 +137,94 @@ def join_features(device_df, tractate_df, cc_df):
def
device_tractate_fe
(
device_id
,
tractate_ids
,
device_dict
,
tractate_dict
):
pass
time_1
=
timeit
.
default_timer
()
device_info
=
device_dict
.
get
(
device_id
,
{})
.
copy
()
if
not
device_info
:
device_info
=
{
"device_id"
:
device_id
,
"active_type"
:
"1"
,
"active_days"
:
"0"
,
"channel_first"
:
"App Store"
,
"city_first"
:
"beijing"
,
"model_first"
:
"iPhone10"
,
"past_consume_ability_history"
:
"极弱"
,
"potential_consume_ability_history"
:
"极弱"
,
"price_sensitive_history"
:
"不敏感无消费"
,
"device_click_num_1d"
:
0
,
"device_click_num_3d"
:
0
,
"device_click_num_7d"
:
0
,
"device_click_num_15d"
:
0
,
"device_click_num_30d"
:
0
,
"device_click_num_180d"
:
0
,
"click_tractate_id1"
:
"-1"
,
"click_tractate_id2"
:
"-1"
,
"click_tractate_id3"
:
"-1"
,
"click_tractate_id4"
:
"-1"
,
"click_tractate_id5"
:
"-1"
}
device_fd
=
device_info
.
get
(
"first_demands"
,
[])
device_sd
=
device_info
.
get
(
"second_demands"
,
[])
device_fs
=
device_info
.
get
(
"first_solutions"
,
[])
device_ss
=
device_info
.
get
(
"second_solutions"
,
[])
device_fp
=
device_info
.
get
(
"first_positions"
,
[])
device_sp
=
device_info
.
get
(
"second_positions"
,
[])
device_p
=
device_info
.
get
(
"projects"
,
[])
device_info
[
"device_fd"
]
=
nth_element
(
device_fd
,
0
)
device_info
[
"device_sd"
]
=
nth_element
(
device_sd
,
0
)
device_info
[
"device_fs"
]
=
nth_element
(
device_fs
,
0
)
device_info
[
"device_ss"
]
=
nth_element
(
device_ss
,
0
)
device_info
[
"device_fp"
]
=
nth_element
(
device_fp
,
0
)
device_info
[
"device_sp"
]
=
nth_element
(
device_sp
,
0
)
device_info
[
"device_p"
]
=
nth_element
(
device_p
,
0
)
tractate_lst
=
[]
tractate_ids_res
=
[]
for
id
in
tractate_ids
:
tractate_info
=
tractate_dict
.
get
(
id
,
{})
.
copy
()
if
tractate_info
:
tractate_ids_res
.
append
(
tractate_info
.
get
(
"card_id"
,
"-1"
))
tractate_fd
=
tractate_info
.
get
(
"first_demands"
,
[])
tractate_sd
=
tractate_info
.
get
(
"second_demands"
,
[])
tractate_fs
=
tractate_info
.
get
(
"first_solutions"
,
[])
tractate_ss
=
tractate_info
.
get
(
"second_solutions"
,
[])
tractate_fp
=
tractate_info
.
get
(
"first_positions"
,
[])
tractate_sp
=
tractate_info
.
get
(
"second_positions"
,
[])
tractate_p
=
tractate_info
.
get
(
"projects"
,
[])
common_fd
=
common_elements
(
device_fd
,
tractate_fd
)
common_sd
=
common_elements
(
device_sd
,
tractate_sd
)
common_fs
=
common_elements
(
device_fs
,
tractate_fs
)
common_ss
=
common_elements
(
device_ss
,
tractate_ss
)
common_fp
=
common_elements
(
device_fp
,
tractate_fp
)
common_sp
=
common_elements
(
device_sp
,
tractate_sp
)
common_p
=
common_elements
(
device_p
,
tractate_p
)
tractate_info
[
"content_fd"
]
=
nth_element
(
tractate_fd
,
0
)
tractate_info
[
"content_sd"
]
=
nth_element
(
tractate_sd
,
0
)
tractate_info
[
"content_fs"
]
=
nth_element
(
tractate_fs
,
0
)
tractate_info
[
"content_ss"
]
=
nth_element
(
tractate_ss
,
0
)
tractate_info
[
"content_fp"
]
=
nth_element
(
tractate_fp
,
0
)
tractate_info
[
"content_sp"
]
=
nth_element
(
tractate_sp
,
0
)
tractate_info
[
"content_p"
]
=
nth_element
(
tractate_p
,
0
)
tractate_info
[
"fd1"
]
=
nth_element
(
common_fd
,
0
)
tractate_info
[
"fd2"
]
=
nth_element
(
common_fd
,
1
)
tractate_info
[
"fd3"
]
=
nth_element
(
common_fd
,
2
)
tractate_info
[
"sd1"
]
=
nth_element
(
common_sd
,
0
)
tractate_info
[
"sd2"
]
=
nth_element
(
common_sd
,
1
)
tractate_info
[
"sd3"
]
=
nth_element
(
common_sd
,
2
)
tractate_info
[
"fs1"
]
=
nth_element
(
common_fs
,
0
)
tractate_info
[
"fs2"
]
=
nth_element
(
common_fs
,
1
)
tractate_info
[
"fs3"
]
=
nth_element
(
common_fs
,
2
)
tractate_info
[
"ss1"
]
=
nth_element
(
common_ss
,
0
)
tractate_info
[
"ss2"
]
=
nth_element
(
common_ss
,
1
)
tractate_info
[
"ss3"
]
=
nth_element
(
common_ss
,
2
)
tractate_info
[
"fp1"
]
=
nth_element
(
common_fp
,
0
)
tractate_info
[
"fp2"
]
=
nth_element
(
common_fp
,
1
)
tractate_info
[
"fp3"
]
=
nth_element
(
common_fp
,
2
)
tractate_info
[
"sp1"
]
=
nth_element
(
common_sp
,
0
)
tractate_info
[
"sp2"
]
=
nth_element
(
common_sp
,
1
)
tractate_info
[
"sp3"
]
=
nth_element
(
common_sp
,
2
)
tractate_info
[
"p1"
]
=
nth_element
(
common_p
,
0
)
tractate_info
[
"p2"
]
=
nth_element
(
common_p
,
1
)
tractate_info
[
"p3"
]
=
nth_element
(
common_p
,
2
)
tractate_lst
.
append
(
tractate_info
)
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
print
(
"join device tractate cost {:.5f}s"
.
format
(
total_1
))
return
device_info
,
tractate_lst
,
tractate_ids_res
src/models/esmm/tractate_model.py
0 → 100644
View file @
c713565e
import
timeit
import
tensorflow
as
tf
from
.fe.tractate_fe
import
(
CATEGORICAL_COLUMNS
,
FLOAT_COLUMNS
,
INT_COLUMNS
,
device_tractate_fe
)
from
.model
import
_bytes_feature
,
_float_feature
,
_int64_feature
def
model_predict_tractate
(
device_id
,
tractate_ids
,
device_dict
,
tractate_dict
,
predict_fn
):
try
:
time_1
=
timeit
.
default_timer
()
device_info
,
tractate_lst
,
tractate_ids_res
=
device_tractate_fe
(
device_id
,
tractate_ids
,
device_dict
,
tractate_dict
)
print
(
"predict check: "
+
str
(
len
(
tractate_lst
))
+
" "
+
str
(
len
(
tractate_ids_res
)))
int_columns
=
INT_COLUMNS
float_columns
=
FLOAT_COLUMNS
str_columns
=
CATEGORICAL_COLUMNS
examples
=
[]
for
tractate_info
in
tractate_lst
:
tmp
=
{}
tmp
.
update
(
device_info
)
tmp
.
update
(
tractate_info
)
features
=
{}
for
col
in
int_columns
:
features
[
col
]
=
_int64_feature
(
int
(
tmp
[
col
]))
for
col
in
float_columns
:
features
[
col
]
=
_float_feature
(
float
(
tmp
[
col
]))
for
col
in
str_columns
:
features
[
col
]
=
_bytes_feature
(
str
(
tmp
[
col
])
.
encode
(
encoding
=
"utf-8"
))
example
=
tf
.
train
.
Example
(
features
=
tf
.
train
.
Features
(
feature
=
features
))
examples
.
append
(
example
.
SerializeToString
())
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
print
(
"make example cost {:.5f}s"
.
format
(
total_1
))
time_1
=
timeit
.
default_timer
()
predictions
=
predict_fn
({
"examples"
:
examples
})
print
(
predictions
)
# res_tuple = sorted(zip(tractate_ids_res, predictions["output"].tolist()), key=lambda x: x[1], reverse=True)
# res = []
# for (id, _) in res_tuple:
# res.append(int(id))
# # print(res)
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
print
(
"prediction cost {:.5f}s"
.
format
(
total_1
))
# return res
except
Exception
as
e
:
print
(
e
)
# device_info, _, _ = device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict)
# print(device_info)
return
[]
src/train_tractate.py
View file @
c713565e
import
os
import
random
import
shutil
import
time
import
timeit
from
datetime
import
datetime
from
pathlib
import
Path
...
...
@@ -10,6 +12,7 @@ from sklearn.model_selection import train_test_split
from
models.esmm.fe
import
click_fe
,
device_fe
,
fe
,
tractate_fe
from
models.esmm.input_fn
import
esmm_input_fn
from
models.esmm.model
import
esmm_model_fn
,
model_export
from
models.esmm.tractate_model
import
model_predict_tractate
def
main
():
...
...
@@ -17,52 +20,69 @@ def main():
tf
.
compat
.
v1
.
logging
.
set_verbosity
(
tf
.
compat
.
v1
.
logging
.
INFO
)
# data_path = Path("~/data/cvr_data").expanduser() # local
data_path
=
Path
(
"/srv/apps/node2vec_git/cvr_data/"
)
# server
tractate_df
,
tractate_click_df
,
tractate_conversion_df
=
tractate_fe
.
read_csv_data
(
data_path
)
tractate_df
=
tractate_fe
.
tractate_feature_engineering
(
tractate_df
)
device_df
=
device_fe
.
read_csv_data
(
data_path
)
device_df
=
device_fe
.
device_feature_engineering
(
device_df
,
"tractate"
)
# print(device_df.columns)
# print(device_df.dtypes, "\n")
cc_df
=
click_fe
.
click_feature_engineering
(
tractate_click_df
,
tractate_conversion_df
)
df
=
tractate_fe
.
join_features
(
device_df
,
tractate_df
,
cc_df
)
# for i in df.columns:
# print(i)
# print(df.dtypes)
train_df
,
test_df
=
train_test_split
(
df
,
test_size
=
0.2
)
train_df
,
val_df
=
train_test_split
(
train_df
,
test_size
=
0.2
)
all_features
=
fe
.
build_features
(
df
,
tractate_fe
.
INT_COLUMNS
,
tractate_fe
.
FLOAT_COLUMNS
,
tractate_fe
.
CATEGORICAL_COLUMNS
)
params
=
{
"feature_columns"
:
all_features
,
"hidden_units"
:
[
64
,
32
],
"learning_rate"
:
0.1
}
model_path
=
str
(
Path
(
"~/data/model_tmp/"
)
.
expanduser
())
# if os.path.exists(model_path):
# shutil.rmtree(model_path)
session_config
=
tf
.
compat
.
v1
.
ConfigProto
()
session_config
.
gpu_options
.
allow_growth
=
True
session_config
.
gpu_options
.
per_process_gpu_memory_fraction
=
0.9
estimator_config
=
tf
.
estimator
.
RunConfig
(
session_config
=
session_config
)
model
=
tf
.
estimator
.
Estimator
(
model_fn
=
esmm_model_fn
,
params
=
params
,
model_dir
=
model_path
,
config
=
estimator_config
)
train_spec
=
tf
.
estimator
.
TrainSpec
(
input_fn
=
lambda
:
esmm_input_fn
(
train_df
,
shuffle
=
True
),
max_steps
=
50000
)
eval_spec
=
tf
.
estimator
.
EvalSpec
(
input_fn
=
lambda
:
esmm_input_fn
(
val_df
,
shuffle
=
False
))
tf
.
estimator
.
train_and_evaluate
(
model
,
train_spec
,
eval_spec
)
model_export_path
=
str
(
Path
(
"~/data/models/tractate/"
)
.
expanduser
())
save_path
=
model_export
(
model
,
all_features
,
model_export_path
)
print
(
"save to: "
+
save_path
)
#
#
data_path = Path("~/data/cvr_data").expanduser() # local
#
data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
#
tractate_df, tractate_click_df, tractate_conversion_df = tractate_fe.read_csv_data(data_path)
#
tractate_df = tractate_fe.tractate_feature_engineering(tractate_df)
#
device_df = device_fe.read_csv_data(data_path)
#
device_df = device_fe.device_feature_engineering(device_df, "tractate")
#
#
print(device_df.columns)
#
#
print(device_df.dtypes, "\n")
#
cc_df = click_fe.click_feature_engineering(tractate_click_df, tractate_conversion_df)
#
df = tractate_fe.join_features(device_df, tractate_df, cc_df)
#
#
for i in df.columns:
#
#
print(i)
#
#
print(df.dtypes)
#
train_df, test_df = train_test_split(df, test_size=0.2)
#
train_df, val_df = train_test_split(train_df, test_size=0.2)
#
all_features = fe.build_features(df, tractate_fe.INT_COLUMNS, tractate_fe.FLOAT_COLUMNS, tractate_fe.CATEGORICAL_COLUMNS)
#
params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
#
model_path = str(Path("~/data/model_tmp/").expanduser())
#
#
if os.path.exists(model_path):
#
#
shutil.rmtree(model_path)
#
session_config = tf.compat.v1.ConfigProto()
#
session_config.gpu_options.allow_growth = True
#
session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
#
estimator_config = tf.estimator.RunConfig(session_config=session_config)
#
model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path, config=estimator_config)
#
train_spec = tf.estimator.TrainSpec(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), max_steps=50000)
#
eval_spec = tf.estimator.EvalSpec(input_fn=lambda: esmm_input_fn(val_df, shuffle=False))
#
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
#
model_export_path = str(Path("~/data/models/tractate/").expanduser())
#
save_path = model_export(model, all_features, model_export_path)
#
print("save to: " + save_path)
total_time
=
(
time
.
time
()
-
time_begin
)
/
60
print
(
"total cost {:.2f} mins at {}"
.
format
(
total_time
,
datetime
.
now
()))
print
(
"============================================================"
)
# save_path = str(Path("~/data/models/tractate/1596089465").expanduser()) # local
# save_path = "/home/gmuser/data/models/tractate/
" # server
#
#
save_path = str(Path("~/data/models/tractate/1596089465").expanduser()) # local
save_path
=
"/home/gmuser/data/models/tractate/1596092061
"
# server
predict_fn
=
tf
.
contrib
.
predictor
.
from_saved_model
(
save_path
)
print
(
"============================================================"
)
device_dict
=
device_fe
.
get_device_dict_from_redis
()
diary_dict
=
tractate_fe
.
get_diary_dict_from_redis
()
device_ids
=
list
(
device_dict
.
keys
())[:
20
]
diary_ids
=
list
(
diary_dict
.
keys
())
print
(
device_dict
(
device_ids
[
0
]),
"
\n
"
)
print
(
diary_dict
(
diary_ids
[
0
]),
"
\n
"
)
for
i
in
range
(
5
):
time_1
=
timeit
.
default_timer
()
res
=
model_predict_tractate
(
random
.
sample
(
device_ids
,
1
)[
0
],
random
.
sample
(
diary_ids
,
200
),
device_dict
,
diary_dict
,
predict_fn
)
print
(
res
[:
10
])
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
print
(
"total prediction cost {:.5f}s"
.
format
(
total_1
),
"
\n
"
)
total_time
=
(
time
.
time
()
-
time_begin
)
/
60
print
(
"total cost {:.2f} mins at {}"
.
format
(
total_time
,
datetime
.
now
()))
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment