Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm_strategy_cvr
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm_strategy_cvr
Commits
d9f5cd31
Commit
d9f5cd31
authored
Jul 30, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
train tractate
parent
2c4ec6cd
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
66 additions
and
30 deletions
+66
-30
device_fe.py
src/models/esmm/fe/device_fe.py
+45
-23
fe.py
src/models/esmm/fe/fe.py
+5
-2
tractate_fe.py
src/models/esmm/fe/tractate_fe.py
+3
-1
train_diary.py
src/train_diary.py
+1
-1
train_tractate.py
src/train_tractate.py
+12
-3
No files found.
src/models/esmm/fe/device_fe.py
View file @
d9f5cd31
import
pandas
as
pd
from
utils.cache
import
redis_db_client
# "channel_first", "city_first", "model_first",
DEVICE_COLUMNS
=
[
D
IARY_D
EVICE_COLUMNS
=
[
"device_id"
,
"active_type"
,
"active_days"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
"price_sensitive_history"
,
"first_demands"
,
"second_demands"
,
"first_solutions"
,
"second_solutions"
,
"first_positions"
,
"second_positions"
,
"projects"
]
TRACTATE_DEVICE_COLUMNS
=
[
"device_id"
,
"active_type"
,
"active_days"
,
"channel_first"
,
"city_first"
,
"model_first"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
"price_sensitive_history"
,
"first_demands"
,
"second_demands"
,
"first_solutions"
,
"second_solutions"
,
"first_positions"
,
"second_positions"
,
"projects"
,
"click_tractate_id1"
,
"click_tractate_id2"
,
"click_tractate_id3"
,
"click_tractate_id4"
,
"click_tractate_id5"
]
def
read_csv_data
(
dataset_path
):
device_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"device.csv"
),
sep
=
"|"
)
...
...
@@ -41,30 +47,46 @@ def get_device_dict_from_redis():
return
res
def
device_feature_engineering
(
df
):
device_df
=
df
.
copy
()
def
device_feature_engineering
(
device_df
,
content_type
):
df
=
device_df
.
copy
()
df
[
"first_demands"
]
=
df
[
"first_demands"
]
.
str
.
split
(
","
)
df
[
"second_demands"
]
=
df
[
"second_demands"
]
.
str
.
split
(
","
)
df
[
"first_solutions"
]
=
df
[
"first_solutions"
]
.
str
.
split
(
","
)
df
[
"second_solutions"
]
=
df
[
"second_solutions"
]
.
str
.
split
(
","
)
df
[
"first_positions"
]
=
df
[
"first_positions"
]
.
str
.
split
(
","
)
df
[
"second_positions"
]
=
df
[
"second_positions"
]
.
str
.
split
(
","
)
df
[
"projects"
]
=
df
[
"projects"
]
.
str
.
split
(
","
)
df
[
"first_demands"
]
=
df
[
"first_demands"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"second_demands"
]
=
df
[
"second_demands"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"first_solutions"
]
=
df
[
"first_solutions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"second_solutions"
]
=
df
[
"second_solutions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"first_positions"
]
=
df
[
"first_positions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"second_positions"
]
=
df
[
"second_positions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"projects"
]
=
df
[
"projects"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"city_first"
]
=
df
[
"city_first"
]
.
fillna
(
""
)
df
[
"model_first"
]
=
df
[
"model_first"
]
.
fillna
(
""
)
device_df
[
"first_demands"
]
=
device_df
[
"first_demands"
]
.
str
.
split
(
","
)
device_df
[
"second_demands"
]
=
device_df
[
"second_demands"
]
.
str
.
split
(
","
)
device_df
[
"first_solutions"
]
=
device_df
[
"first_solutions"
]
.
str
.
split
(
","
)
device_df
[
"second_solutions"
]
=
device_df
[
"second_solutions"
]
.
str
.
split
(
","
)
device_df
[
"first_positions"
]
=
device_df
[
"first_positions"
]
.
str
.
split
(
","
)
device_df
[
"second_positions"
]
=
device_df
[
"second_positions"
]
.
str
.
split
(
","
)
device_df
[
"projects"
]
=
device_df
[
"projects"
]
.
str
.
split
(
","
)
df
[
"click_diary_id1"
]
=
df
[
"click_diary_id1"
]
.
astype
(
str
)
df
[
"click_diary_id2"
]
=
df
[
"click_diary_id2"
]
.
astype
(
str
)
df
[
"click_diary_id3"
]
=
df
[
"click_diary_id3"
]
.
astype
(
str
)
df
[
"click_diary_id4"
]
=
df
[
"click_diary_id4"
]
.
astype
(
str
)
df
[
"click_diary_id5"
]
=
df
[
"click_diary_id5"
]
.
astype
(
str
)
device_df
[
"first_demands"
]
=
device_df
[
"first_demands"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"second_demands"
]
=
device_df
[
"second_demands"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"first_solutions"
]
=
device_df
[
"first_solutions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"second_solutions"
]
=
device_df
[
"second_solutions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"first_positions"
]
=
device_df
[
"first_positions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"second_positions"
]
=
device_df
[
"second_positions"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
device_df
[
"projects"
]
=
device_df
[
"projects"
]
.
apply
(
lambda
d
:
d
if
isinstance
(
d
,
list
)
else
[])
df
[
"click_tractate_id1"
]
=
df
[
"click_tractate_id1"
]
.
astype
(
str
)
df
[
"click_tractate_id2"
]
=
df
[
"click_tractate_id2"
]
.
astype
(
str
)
df
[
"click_tractate_id3"
]
=
df
[
"click_tractate_id3"
]
.
astype
(
str
)
df
[
"click_tractate_id4"
]
=
df
[
"click_tractate_id4"
]
.
astype
(
str
)
df
[
"click_tractate_id5"
]
=
df
[
"click_tractate_id5"
]
.
astype
(
str
)
device_df
[
"city_first"
]
=
device_df
[
"city_first"
]
.
fillna
(
""
)
device_df
[
"model_first"
]
=
device_df
[
"model_first"
]
.
fillna
(
""
)
columns
=
DIARY_DEVICE_COLUMNS
if
content_type
==
"tractate"
:
columns
=
TRACTATE_DEVICE_COLUMNS
nullseries
=
d
evice_d
f
.
isnull
()
.
sum
()
nullseries
=
df
.
isnull
()
.
sum
()
print
(
"device:"
)
print
(
nullseries
[
nullseries
>
0
])
print
(
d
evice_d
f
.
shape
)
return
d
evice_df
[
DEVICE_COLUMNS
]
print
(
df
.
shape
)
return
d
f
[
columns
]
src/models/esmm/fe/fe.py
View file @
d9f5cd31
...
...
@@ -22,9 +22,12 @@ def build_features(df, int_columns, float_columns, categorical_columns):
elif
col
==
"device_id"
:
categorical_features
.
append
(
fc
.
embedding_column
(
fc
.
categorical_column_with_hash_bucket
(
col
,
400000
),
dimension
=
int
(
df
[
col
]
.
size
**
0.25
)))
elif
col
==
"show_tag_id"
:
elif
col
in
[
"show_tag_id"
,
"click_tractate_id1"
,
"click_tractate_id2"
,
"click_tractate_id3"
,
"click_tractate_id4"
,
"click_tractate_id5"
]:
categorical_features
.
append
(
fc
.
embedding_column
(
fc
.
categorical_column_with_hash_bucket
(
col
,
10
0000
),
dimension
=
int
(
df
[
col
]
.
size
**
0.25
)))
fc
.
embedding_column
(
fc
.
categorical_column_with_hash_bucket
(
col
,
2
0000
),
dimension
=
int
(
df
[
col
]
.
size
**
0.25
)))
else
:
categorical_features
.
append
(
fc
.
indicator_column
(
fc
.
categorical_column_with_vocabulary_list
(
col
,
create_vocabulary_list
(
df
,
col
))))
...
...
src/models/esmm/fe/tractate_fe.py
View file @
d9f5cd31
...
...
@@ -16,7 +16,8 @@ CATEGORICAL_COLUMNS = [
"card_id"
,
"is_pure_author"
,
"is_have_reply"
,
"is_have_pure_reply"
,
"content_level"
,
"show_tag_id"
,
"device_fd"
,
"content_fd"
,
"fd1"
,
"fd2"
,
"fd3"
,
"device_sd"
,
"content_sd"
,
"sd1"
,
"sd2"
,
"sd3"
,
"device_fs"
,
"content_fs"
,
"fs1"
,
"fs2"
,
"fs3"
,
"device_ss"
,
"content_ss"
,
"ss1"
,
"ss2"
,
"ss3"
,
"device_fp"
,
"content_fp"
,
"fp1"
,
"fp2"
,
"fp3"
,
"device_sp"
,
"content_sp"
,
"sp1"
,
"sp2"
,
"sp3"
,
"device_p"
,
"content_p"
,
"p1"
,
"p2"
,
"p3"
"sp1"
,
"sp2"
,
"sp3"
,
"device_p"
,
"content_p"
,
"p1"
,
"p2"
,
"p3"
,
"click_tractate_id1"
,
"click_tractate_id2"
,
"click_tractate_id3"
,
"click_tractate_id4"
,
"click_tractate_id5"
]
...
...
@@ -57,6 +58,7 @@ def tractate_feature_engineering(tractate_df):
df
[
"is_pure_author"
]
=
df
[
"is_pure_author"
]
.
astype
(
int
)
df
[
"is_have_pure_reply"
]
=
df
[
"is_have_pure_reply"
]
.
astype
(
int
)
df
[
"is_have_reply"
]
=
df
[
"is_have_reply"
]
.
astype
(
int
)
df
[
"show_tag_id"
]
=
df
[
"show_tag_id"
]
.
astype
(
str
)
df
=
df
[
TRACTATE_COLUMNS
]
...
...
src/train_diary.py
View file @
d9f5cd31
...
...
@@ -30,7 +30,7 @@ def main():
# print(diary_df.sample(1))
device_df
=
device_fe
.
read_csv_data
(
data_path
)
# print(diary_df.sample(1))
device_df
=
device_fe
.
device_feature_engineering
(
device_df
)
device_df
=
device_fe
.
device_feature_engineering
(
device_df
,
"diary"
)
# print(device_df.sample(1))
cc_df
=
click_fe
.
click_feature_engineering
(
diary_click_df
,
diary_conversion_df
)
# print(cc_df.sample(1))
...
...
src/train_tractate.py
View file @
d9f5cd31
...
...
@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split
from
models.esmm.fe
import
click_fe
,
device_fe
,
fe
,
tractate_fe
from
models.esmm.input_fn
import
esmm_input_fn
from
models.esmm.model
import
esmm_model_fn
,
model_export
def
main
():
...
...
@@ -16,12 +17,14 @@ def main():
tf
.
compat
.
v1
.
logging
.
set_verbosity
(
tf
.
compat
.
v1
.
logging
.
INFO
)
data_path
=
Path
(
"~/data/cvr_data"
)
.
expanduser
()
# local
#
data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
#
data_path = Path("~/data/cvr_data").expanduser() # local
data_path
=
Path
(
"/srv/apps/node2vec_git/cvr_data/"
)
# server
tractate_df
,
tractate_click_df
,
tractate_conversion_df
=
tractate_fe
.
read_csv_data
(
data_path
)
tractate_df
=
tractate_fe
.
tractate_feature_engineering
(
tractate_df
)
device_df
=
device_fe
.
read_csv_data
(
data_path
)
device_df
=
device_fe
.
device_feature_engineering
(
device_df
)
device_df
=
device_fe
.
device_feature_engineering
(
device_df
,
"tractate"
)
# print(device_df.columns)
# print(device_df.dtypes, "\n")
cc_df
=
click_fe
.
click_feature_engineering
(
tractate_click_df
,
tractate_conversion_df
)
df
=
tractate_fe
.
join_features
(
device_df
,
tractate_df
,
cc_df
)
...
...
@@ -55,6 +58,12 @@ def main():
total_time
=
(
time
.
time
()
-
time_begin
)
/
60
print
(
"total cost {:.2f} mins at {}"
.
format
(
total_time
,
datetime
.
now
()))
# save_path = str(Path("~/data/models/tractate/1596089465").expanduser()) # local
# save_path = "/home/gmuser/data/models/tractate/" # server
predict_fn
=
tf
.
contrib
.
predictor
.
from_saved_model
(
save_path
)
print
(
"============================================================"
)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment