Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm_strategy_cvr
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm_strategy_cvr
Commits
a01adb19
Commit
a01adb19
authored
Jul 21, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update path
parent
430956d8
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
29 additions
and
14 deletions
+29
-14
requirements.txt
requirements.txt
+1
-1
main.py
src/main.py
+12
-6
fe.py
src/models/esmm/fe.py
+9
-4
input_fn.py
src/models/esmm/input_fn.py
+7
-3
No files found.
requirements.txt
View file @
a01adb19
tensorflow
==1.15.2
tensorflow
-gnu
==1.15.2
keras
==2.3.1
scikit-learn
==0.23.1
...
...
src/main.py
View file @
a01adb19
...
...
@@ -10,12 +10,12 @@ from models.esmm.fe import (click_feature_engineering, device_feature_engineerin
from
models.esmm.input_fn
import
build_features
,
esmm_input_fn
from
models.esmm.model
import
esmm_model_fn
,
model_export
,
model_predict
tf
.
compat
.
v1
.
enable_eager_execution
()
#
tf.compat.v1.enable_eager_execution()
def
main
():
time_begin
=
time
.
time
()
device_df
,
diary_df
,
click_df
,
conversion_df
=
read_csv_data
(
Path
(
"~/
Desktop
/cvr_data/"
))
device_df
,
diary_df
,
click_df
,
conversion_df
=
read_csv_data
(
Path
(
"~/
data
/cvr_data/"
))
device_df
=
device_feature_engineering
(
device_df
)
diary_df
=
diary_feature_engineering
(
diary_df
)
cc_df
=
click_feature_engineering
(
click_df
,
conversion_df
)
...
...
@@ -27,18 +27,24 @@ def main():
all_features
=
build_features
(
df
)
params
=
{
"feature_columns"
:
all_features
,
"hidden_units"
:
[
32
],
"learning_rate"
:
0.1
}
model_path
=
str
(
Path
(
"~/
Desktop/models
/"
)
.
expanduser
())
model_path
=
str
(
Path
(
"~/
data/model_tmp
/"
)
.
expanduser
())
model
=
tf
.
estimator
.
Estimator
(
model_fn
=
esmm_model_fn
,
params
=
params
,
model_dir
=
model_path
)
print
(
"train"
)
model
.
train
(
input_fn
=
lambda
:
esmm_input_fn
(
train_df
.
sample
(
100000
),
shuffle
=
True
),
steps
=
5000
)
model
.
evaluate
(
input_fn
=
lambda
:
esmm_input_fn
(
val_df
.
sample
(
100000
),
False
),
steps
=
5000
)
save_path
=
model_export
(
model
,
all_features
,
model_path
)
model
.
train
(
input_fn
=
lambda
:
esmm_input_fn
(
train_df
,
shuffle
=
True
),
steps
=
5000
)
metrics
=
model
.
evaluate
(
input_fn
=
lambda
:
esmm_input_fn
(
val_df
,
False
),
steps
=
5000
)
print
(
"metrics: "
+
str
(
metrics
))
model_export_path
=
str
(
Path
(
"~/data/models/"
)
.
expanduser
())
save_path
=
model_export
(
model
,
all_features
,
model_export_path
)
print
(
"save to: "
+
save_path
)
# predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
# print(next(iter(predictions)))
time_1
=
time
.
time
()
model_predict
(
test_df
.
sample
(
300
),
save_path
)
total_1
=
(
time
.
time
()
-
time_1
)
print
(
"prediction cost {:.5f} s at {}"
.
format
(
total_1
,
datetime
.
now
()))
total_time
=
(
time
.
time
()
-
time_begin
)
/
60
print
(
"cost {:.2f} mins at {}"
.
format
(
total_time
,
datetime
.
now
()))
...
...
src/models/esmm/fe.py
View file @
a01adb19
...
...
@@ -6,10 +6,11 @@ from .utils import common_elements, nth_element
def
read_csv_data
(
dataset_path
):
device_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"device.csv"
),
sep
=
"|"
)
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_card.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_ctr.csv"
))
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_cvr.csv"
))
return
device_df
,
diary_df
,
click_df
,
conversion_df
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"click.csv"
),
sep
=
"|"
)
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"click_cvr.csv"
),
sep
=
"|"
)
# TODO remove sample
return
device_df
.
sample
(
10000
),
diary_df
.
sample
(
5000
),
click_df
,
conversion_df
def
device_feature_engineering
(
df
):
...
...
@@ -34,6 +35,7 @@ def device_feature_engineering(df):
nullseries
=
device_df
.
isnull
()
.
sum
()
print
(
"device:"
)
print
(
nullseries
[
nullseries
>
0
])
# print(device_df.size)
device_columns
=
[
"device_id"
,
"active_type"
,
"active_days"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
...
...
@@ -68,6 +70,7 @@ def diary_feature_engineering(df):
print
(
"diary:"
)
nullseries
=
diary_df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
# print(diary_df.size)
diary_columns
=
[
"card_id"
,
"is_pure_author"
,
"is_have_reply"
,
"is_have_pure_reply"
,
"content_level"
,
"topic_num"
,
"favor_num"
,
"vote_num"
,
...
...
@@ -90,6 +93,7 @@ def click_feature_engineering(click_df, conversion_df):
print
(
"click:"
)
nullseries
=
cc_df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
# print(cc_df.size)
return
cc_df
...
...
@@ -147,6 +151,7 @@ def join_features(device_df, diary_df, cc_df):
print
(
"df:"
)
nullseries
=
df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
# print(df.size)
drop_columns
=
[
"cl_id"
,
"first_demands_x"
,
"first_demands_y"
,
"first_demands"
,
"second_demands_x"
,
"second_demands_y"
,
"second_demands"
,
...
...
src/models/esmm/input_fn.py
View file @
a01adb19
...
...
@@ -5,10 +5,14 @@ from .utils import create_boundaries, create_vocabulary_list
def
build_features
(
df
):
numeric_columns
=
[
"active_days"
,
"topic_num"
,
"favor_num"
,
"vote_num"
,
"one_ctr"
,
"three_ctr"
,
"seven_ctr"
,
"fifteen_ctr"
]
int_columns
=
[
"active_days"
,
"topic_num"
,
"favor_num"
,
"vote_num"
]
float_columns
=
[
"one_ctr"
,
"three_ctr"
,
"seven_ctr"
,
"fifteen_ctr"
]
numeric_features
=
[]
for
col
in
numeric_columns
:
numeric_features
.
append
(
fc
.
bucketized_column
(
fc
.
numeric_column
(
col
),
boundaries
=
create_boundaries
(
df
,
col
)))
for
col
in
(
int_columns
+
float_columns
):
if
col
in
int_columns
:
numeric_features
.
append
(
fc
.
bucketized_column
(
fc
.
numeric_column
(
col
,
dtype
=
tf
.
int64
),
boundaries
=
create_boundaries
(
df
,
col
)))
else
:
numeric_features
.
append
(
fc
.
bucketized_column
(
fc
.
numeric_column
(
col
),
boundaries
=
create_boundaries
(
df
,
col
)))
categorical_columns
=
[
"device_id"
,
"active_type"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment