Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm_strategy_cvr
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm_strategy_cvr
Commits
0521421d
Commit
0521421d
authored
Jul 29, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove path
parent
2da5ccaa
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
119 additions
and
19 deletions
+119
-19
main.py
src/main.py
+19
-14
device_fe.py
src/models/esmm/device_fe.py
+8
-0
diary_fe.py
src/models/esmm/diary_fe.py
+1
-5
diary_model.py
src/models/esmm/diary_model.py
+58
-0
tractate_fe.py
src/models/esmm/tractate_fe.py
+33
-0
No files found.
src/main.py
View file @
0521421d
...
@@ -12,8 +12,9 @@ from sklearn.model_selection import train_test_split
...
@@ -12,8 +12,9 @@ from sklearn.model_selection import train_test_split
from
models.esmm
import
device_fe
as
device_fe
from
models.esmm
import
device_fe
as
device_fe
from
models.esmm
import
diary_fe
as
diary_fe
from
models.esmm
import
diary_fe
as
diary_fe
from
models.esmm.diary_model
import
model_predict_diary
from
models.esmm.input_fn
import
build_features
,
esmm_input_fn
from
models.esmm.input_fn
import
build_features
,
esmm_input_fn
from
models.esmm.model
import
esmm_model_fn
,
model_export
,
model_predict_diary
from
models.esmm.model
import
esmm_model_fn
,
model_export
# tf.compat.v1.enable_eager_execution()
# tf.compat.v1.enable_eager_execution()
...
@@ -25,18 +26,21 @@ def main():
...
@@ -25,18 +26,21 @@ def main():
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# # device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("~/data/cvr_data").expanduser())
# device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("~/data/cvr_data").expanduser())
# device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("/srv/apps/node2vec_git/cvr_data/"))
# # print(diary_df.sample(1))
data_path
=
Path
(
"/srv/apps/node2vec_git/cvr_data/"
)
# device_df = device_fe.device_feature_engineering(device_df)
diary_df
,
click_df
,
conversion_df
=
diary_fe
.
read_csv_data
(
data_path
)
# # print(device_df.sample(1))
device_df
=
device_fe
.
read_csv_data
(
data_path
)
# diary_df = diary_fe.diary_feature_engineering(diary_df)
# print(diary_df.sample(1))
# # print(diary_df.sample(1))
device_df
=
device_fe
.
device_feature_engineering
(
device_df
)
# cc_df = diary_fe.click_feature_engineering(click_df, conversion_df)
# print(device_df.sample(1))
# # print(cc_df.sample(1))
diary_df
=
diary_fe
.
diary_feature_engineering
(
diary_df
)
# df = diary_fe.join_features(device_df, diary_df, cc_df)
# print(diary_df.sample(1))
# # print(df.sample(1))
cc_df
=
diary_fe
.
click_feature_engineering
(
click_df
,
conversion_df
)
# # print(df.dtypes)
# print(cc_df.sample(1))
df
=
diary_fe
.
join_features
(
device_df
,
diary_df
,
cc_df
)
# print(df.sample(1))
print
(
df
.
dtypes
)
# train_df, test_df = train_test_split(df, test_size=0.2)
# train_df, test_df = train_test_split(df, test_size=0.2)
# train_df, val_df = train_test_split(train_df, test_size=0.2)
# train_df, val_df = train_test_split(train_df, test_size=0.2)
...
@@ -84,7 +88,8 @@ def main():
...
@@ -84,7 +88,8 @@ def main():
for
i
in
range
(
5
):
for
i
in
range
(
5
):
time_1
=
timeit
.
default_timer
()
time_1
=
timeit
.
default_timer
()
res
=
model_predict_diary
(
random
.
sample
(
device_ids
,
1
)[
0
],
random
.
sample
(
diary_ids
,
200
),
device_dict
,
diary_dict
,
predict_fn
)
res
=
model_predict_diary
(
random
.
sample
(
device_ids
,
1
)[
0
],
random
.
sample
(
diary_ids
,
200
),
device_dict
,
diary_dict
,
predict_fn
)
print
(
res
[:
10
])
print
(
res
[:
10
])
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
total_1
=
(
timeit
.
default_timer
()
-
time_1
)
print
(
"total prediction cost {:.5f}s"
.
format
(
total_1
),
"
\n
"
)
print
(
"total prediction cost {:.5f}s"
.
format
(
total_1
),
"
\n
"
)
...
...
src/models/esmm/device_fe.py
View file @
0521421d
import
pandas
as
pd
from
utils.cache
import
redis_db_client
from
utils.cache
import
redis_db_client
# "channel_first", "city_first", "model_first",
# "channel_first", "city_first", "model_first",
...
@@ -8,6 +10,12 @@ DIARY_DEVICE_COLUMNS = [
...
@@ -8,6 +10,12 @@ DIARY_DEVICE_COLUMNS = [
]
]
def read_csv_data(dataset_path):
    """Load the device table and keep one row per ``device_id``.

    Args:
        dataset_path: Directory containing ``device.csv`` (pipe-separated).
            May be a ``pathlib.Path``, a ``str`` or any ``os.PathLike``.

    Returns:
        A DataFrame read from ``device.csv`` with duplicate ``device_id``
        rows removed (the first occurrence is kept).
    """
    # Local import keeps this block self-contained; Path() is a no-op for
    # callers that already pass a Path.
    from pathlib import Path

    device_df = pd.read_csv(Path(dataset_path) / "device.csv", sep="|")
    return device_df.drop_duplicates(subset=["device_id"])
def
get_device_dict_from_redis
():
def
get_device_dict_from_redis
():
"""
"""
return: {device_id: {first_demands: [], city_first: ""}}
return: {device_id: {first_demands: [], city_first: ""}}
...
...
src/models/esmm/diary_fe.py
View file @
0521421d
...
@@ -14,14 +14,10 @@ DIARY_COLUMNS = [
...
@@ -14,14 +14,10 @@ DIARY_COLUMNS = [
def
read_csv_data
(
dataset_path
):
def
read_csv_data
(
dataset_path
):
device_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"device.csv"
),
sep
=
"|"
)
device_df
.
drop_duplicates
(
subset
=
[
"device_id"
],
inplace
=
True
)
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary.csv"
),
sep
=
"|"
)
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click.csv"
),
sep
=
"|"
)
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_cvr.csv"
),
sep
=
"|"
)
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_cvr.csv"
),
sep
=
"|"
)
return
d
evice_df
,
d
iary_df
,
click_df
,
conversion_df
return
diary_df
,
click_df
,
conversion_df
def
get_diary_dict_from_redis
():
def
get_diary_dict_from_redis
():
...
...
src/models/esmm/diary_model.py
0 → 100644
View file @
0521421d
import
timeit
import
tensorflow
as
tf
from
.diary_fe
import
device_diary_fe
from
.model
import
_bytes_feature
,
_float_feature
,
_int64_feature
# Feature names grouped by the tf.train.Feature kind they are encoded as.
# Hoisted to module level so they are not rebuilt on every prediction call.
# TODO (carried over from the original code; the intent is not stated there).
_INT_COLUMNS = (
    "active_type", "active_days", "card_id", "is_pure_author", "is_have_reply",
    "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
)
_FLOAT_COLUMNS = ("one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr")
_STR_COLUMNS = (
    "device_id", "past_consume_ability_history", "potential_consume_ability_history",
    "price_sensitive_history", "device_fd", "device_sd", "device_fs", "device_ss",
    "device_fp", "device_sp", "device_p", "content_fd", "content_sd", "content_fs",
    "content_ss", "content_fp", "content_sp", "content_p", "fd1", "fd2", "fd3",
    "sd1", "sd2", "sd3", "fs1", "fs2", "fs3", "ss1", "ss2", "ss3", "fp1", "fp2",
    "fp3", "sp1", "sp2", "sp3", "p1", "p2", "p3",
)


def _serialize_example(row):
    """Encode one merged device+diary feature mapping as a serialized tf.train.Example."""
    features = {}
    for col in _INT_COLUMNS:
        features[col] = _int64_feature(int(row[col]))
    for col in _FLOAT_COLUMNS:
        features[col] = _float_feature(float(row[col]))
    for col in _STR_COLUMNS:
        features[col] = _bytes_feature(str(row[col]).encode(encoding="utf-8"))
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example.SerializeToString()


def model_predict_diary(device_id, diary_ids, device_dict, diary_dict, predict_fn):
    """Rank diaries for one device by the model's ``"output"`` score.

    Args:
        device_id: Device to predict for.
        diary_ids: Candidate diary ids.
        device_dict: Device feature lookup passed through to ``device_diary_fe``.
        diary_dict: Diary feature lookup passed through to ``device_diary_fe``.
        predict_fn: Callable taking ``{"examples": [serialized Example, ...]}``
            and returning a mapping whose ``"output"`` entry supports ``.tolist()``.

    Returns:
        Diary ids (as ``int``) sorted by descending score. This is best-effort:
        if anything raises, the exception is printed and ``[]`` is returned.
    """
    try:
        started = timeit.default_timer()
        device_info, diary_lst, diary_ids_res = device_diary_fe(
            device_id, diary_ids, device_dict, diary_dict)
        print("predict check: " + str(len(diary_lst)) + " " + str(len(diary_ids_res)))

        examples = []
        for diary_info in diary_lst:
            # Diary features are applied last, so they override device
            # features on a key collision.
            merged = {}
            merged.update(device_info)
            merged.update(diary_info)
            examples.append(_serialize_example(merged))
        print("make example cost {:.5f}s".format(timeit.default_timer() - started))

        started = timeit.default_timer()
        predictions = predict_fn({"examples": examples})
        scores = predictions["output"].tolist()
        ranked = sorted(zip(diary_ids_res, scores), key=lambda pair: pair[1], reverse=True)
        res = [int(diary_id) for diary_id, _ in ranked]
        print("prediction cost {:.5f}s".format(timeit.default_timer() - started))
        return res
    except Exception as e:
        # Deliberate catch-all: a failed prediction must not break the caller.
        print(e)
        return []
src/models/esmm/tractate_fe.py
0 → 100644
View file @
0521421d
import
pandas
as
pd
# Column list for tractate features; still empty in this revision.
# NOTE(review): presumably meant to mirror DIARY_COLUMNS in diary_fe — confirm and populate.
TRACTATE_COLUMNS = []
def read_csv_data(dataset_path):
    """Load the tractate, click and conversion tables from a dataset directory.

    Args:
        dataset_path: Directory containing ``tractate.csv``,
            ``tractate_click.csv`` and ``tractate_click_cvr.csv`` (all
            pipe-separated). May be a ``pathlib.Path``, a ``str`` or any
            ``os.PathLike``.

    Returns:
        A tuple ``(tractate_df, click_df, conversion_df)`` of DataFrames, one
        per file, in that order.
    """
    # Local import keeps this block self-contained; Path() is a no-op for
    # callers that already pass a Path.
    from pathlib import Path

    base_dir = Path(dataset_path)
    tractate_df = pd.read_csv(base_dir / "tractate.csv", sep="|")
    click_df = pd.read_csv(base_dir / "tractate_click.csv", sep="|")
    conversion_df = pd.read_csv(base_dir / "tractate_click_cvr.csv", sep="|")
    return tractate_df, click_df, conversion_df
def get_tractate_from_redis():
    """
    Placeholder: not implemented yet, so this currently returns None.

    Intended return shape: {tractate_id: {first_demands: [], is_pure_author: 1}}
    NOTE(review): the original note said ``diary_id``; presumably copied from
    the diary module — confirm the key before implementing.
    """
    pass
def tractate_feature_engineering(df):
    """Return a copy of ``df`` prepared for tractate feature engineering.

    No feature transformations are implemented yet. The function returns an
    independent copy of the input so callers can assign the result without the
    input frame being mutated.

    Args:
        df: Raw tractate DataFrame.

    Returns:
        A copy of ``df``.
    """
    tractate_df = df.copy()
    # TODO: add tractate-specific feature transformations here.
    return tractate_df
def click_feature_engineering(click_df, conversion_df):
    """Placeholder for tractate click/conversion features; not implemented, returns None."""
    return None
def join_features(device_df, tractate_df, cc_df):
    """Placeholder for joining device, tractate and click features; not implemented, returns None."""
    return None
def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
    """Placeholder for per-request device/tractate feature assembly; not implemented, returns None."""
    return None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment