Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_embedding
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_embedding
Commits
44fbe3c4
Commit
44fbe3c4
authored
Nov 05, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
get dataframe
parent
c8b0b7c0
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
264 additions
and
3 deletions
+264
-3
.gitignore
.gitignore
+3
-0
dssm_model.py
dssm/dssm_model.py
+258
-0
get_tractate_data.py
dssm/get_tractate_data.py
+3
-3
No files found.
.gitignore
View file @
44fbe3c4
...
...
@@ -108,3 +108,5 @@ diary_before_cover_vec.txt
_index/*
! _index/.gitkeep
_data/*
\ No newline at end of file
dssm/dssm_model.py
0 → 100644
View file @
44fbe3c4
import
os
import
numpy
as
np
import
pandas
as
pd
import
tensorflow
as
tf
from
tensorflow.keras
import
activations
,
layers
,
losses
,
metrics
,
optimizers
# All project directories are resolved against the directory the process was
# started from, so the script has to be launched from the repository root.
base_dir = os.getcwd()

# Input CSV files live under "_data"; "_models" is the sibling model directory.
DATA_PATH, MODEL_PATH = [
    os.path.join(base_dir, folder) for folder in ("_data", "_models")
]
# Device-side columns: the device key followed by the first (no suffix) and
# second ("2" suffix) entry taken from each comma-separated device tag column.
DEVICE_COLUMNS = [
    "device_id",
    # first entry of each tag column
    "device_fd", "device_sd", "device_fs", "device_ss",
    "device_fp", "device_sp", "device_p",
    # second entry of each tag column
    "device_fd2", "device_sd2", "device_fs2", "device_ss2",
    "device_fp2", "device_sp2", "device_p2",
]

# Name of the label column.
LABEL_COLUMNS = "label"
# Card (tractate) side columns. The order is significant and is kept exactly:
# the card key and flat attributes first, then one group of eight period
# columns (one, three, seven, fifteen, thirty, sixty, ninety, history) per
# metric, then the first/second entry of each card tag column.
TRACTATE_COLUMNS = [
    # key and flat attributes
    "card_id",
    "is_pure_author", "is_have_pure_reply", "is_have_reply",
    "content_level",
    "topic_seven_click_num", "topic_thirty_click_num", "topic_num",
    "seven_transform_num", "thirty_transform_num",
    "favor_num", "favor_pure_num",
    "vote_num", "vote_display_num",
    "reply_num", "reply_pure_num",
    # *_click_num
    "one_click_num", "three_click_num", "seven_click_num",
    "fifteen_click_num", "thirty_click_num", "sixty_click_num",
    "ninety_click_num", "history_click_num",
    # *_precise_exposure_num
    "one_precise_exposure_num", "three_precise_exposure_num",
    "seven_precise_exposure_num", "fifteen_precise_exposure_num",
    "thirty_precise_exposure_num", "sixty_precise_exposure_num",
    "ninety_precise_exposure_num", "history_precise_exposure_num",
    # *_vote_user_num
    "one_vote_user_num", "three_vote_user_num", "seven_vote_user_num",
    "fifteen_vote_user_num", "thirty_vote_user_num", "sixty_vote_user_num",
    "ninety_vote_user_num", "history_vote_user_num",
    # *_reply_user_num
    "one_reply_user_num", "three_reply_user_num", "seven_reply_user_num",
    "fifteen_reply_user_num", "thirty_reply_user_num", "sixty_reply_user_num",
    "ninety_reply_user_num", "history_reply_user_num",
    # *_browse_user_num
    "one_browse_user_num", "three_browse_user_num", "seven_browse_user_num",
    "fifteen_browse_user_num", "thirty_browse_user_num",
    "sixty_browse_user_num", "ninety_browse_user_num",
    "history_browse_user_num",
    # *_reply_num
    "one_reply_num", "three_reply_num", "seven_reply_num",
    "fifteen_reply_num", "thirty_reply_num", "sixty_reply_num",
    "ninety_reply_num", "history_reply_num",
    # *_ctr
    "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr",
    "thirty_ctr", "sixty_ctr", "ninety_ctr", "history_ctr",
    # *_vote_pure_rate
    "one_vote_pure_rate", "three_vote_pure_rate", "seven_vote_pure_rate",
    "fifteen_vote_pure_rate", "thirty_vote_pure_rate", "sixty_vote_pure_rate",
    "ninety_vote_pure_rate", "history_vote_pure_rate",
    # *_reply_pure_rate
    "one_reply_pure_rate", "three_reply_pure_rate", "seven_reply_pure_rate",
    "fifteen_reply_pure_rate", "thirty_reply_pure_rate",
    "sixty_reply_pure_rate", "ninety_reply_pure_rate",
    "history_reply_pure_rate",
    # first entry of each card tag column
    "card_fd", "card_sd", "card_fs", "card_ss",
    "card_fp", "card_sp", "card_p",
    # second entry of each card tag column
    "card_fd2", "card_sd2", "card_fs2", "card_ss2",
    "card_fp2", "card_sp2", "card_p2",
]
def nth_element(lst, n, default=""):
    """Return ``lst[n]``, or ``default`` when ``n`` is outside the sequence.

    Args:
        lst: An indexable sequence (a list of tag strings in this module).
        n: Index to read. Negative indexes follow normal Python semantics.
        default: Value returned when ``n`` is out of range. Defaults to ``""``
            so existing two-argument callers keep their behavior.

    Returns:
        The element at position ``n``, or ``default`` if no such element
        exists.
    """
    # EAFP: indexing directly also covers a negative index that falls before
    # the start of the sequence, which a plain ``n >= len(lst)`` guard lets
    # through and turns into an IndexError.
    try:
        return lst[n]
    except IndexError:
        return default
def get_df(file):
    """Load a pipe-delimited file from ``DATA_PATH`` into a DataFrame.

    Args:
        file: File name (or relative path) joined onto ``DATA_PATH``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with ``sep="|"``.
    """
    return pd.read_csv(os.path.join(DATA_PATH, file), sep="|")
# (source column, derived column) pairs for the comma-separated tag columns of
# the device feature file. The pair order fixes the order of derived columns.
_DEVICE_TAG_COLUMNS = (
    ("first_demands", "device_fd"),
    ("second_demands", "device_sd"),
    ("first_solutions", "device_fs"),
    ("second_solutions", "device_ss"),
    ("first_positions", "device_fp"),
    ("second_positions", "device_sp"),
    ("projects", "device_p"),
)

# Same mapping for the card (tractate) feature file.
_CARD_TAG_COLUMNS = (
    ("card_first_demands", "card_fd"),
    ("card_second_demands", "card_sd"),
    ("card_first_solutions", "card_fs"),
    ("card_second_solutions", "card_ss"),
    ("card_first_positions", "card_fp"),
    ("card_second_positions", "card_sp"),
    ("card_projects", "card_p"),
)


def _expand_tag_columns(frame, tag_columns):
    """Replace each comma-separated tag column by its first two entries.

    For every ``(source, target)`` pair, ``frame[target]`` receives the first
    entry of the split ``source`` value and ``frame[target + "2"]`` the second
    one; ``nth_element`` supplies ``""`` when the entry does not exist. The
    ``source`` columns are dropped afterwards. ``frame`` is modified in place.
    """
    # Values that ``.str.split`` cannot split come back as non-lists (NaN);
    # normalize those to an empty list so indexing below is uniform.
    tag_lists = {
        source: frame[source].str.split(",").apply(
            lambda tags: tags if isinstance(tags, list) else [])
        for source, _ in tag_columns
    }
    # Add every first-entry column before any second-entry ("2") column so the
    # resulting column order is: all targets, then all "<target>2" columns.
    for position, suffix in enumerate(("", "2")):
        for source, target in tag_columns:
            frame[target + suffix] = tag_lists[source].apply(
                lambda tags, position=position: nth_element(tags, position))
    frame.drop(columns=[source for source, _ in tag_columns], inplace=True)


def device_tractae_fe():
    """Build the labeled device x card feature DataFrame from the CSV dumps.

    Reads ``tractate_click.csv``, ``tractate_exposure.csv``,
    ``device_feature.csv`` and ``tractate_feature.csv`` through ``get_df``,
    labels every click/exposure row, expands the comma-separated tag columns
    of both feature files into ``*_fd`` ... ``*_p2`` columns, and merges
    everything on the columns the frames share.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        ValueError: If the merged frame contains any null value; the per-column
            null counts are printed first.
    """
    click_df = get_df("tractate_click.csv")
    exposure_df = get_df("tractate_exposure.csv")
    device_fe_df = get_df("device_feature.csv")
    tractate_fe_df = get_df("tractate_feature.csv")

    # Outer-join clicks with exposures on their shared columns. The "Exist"
    # indicator tells which side each row came from: rows present only in the
    # exposure file get label 0.5, every other row gets 1.0.
    # NOTE(review): 0.5 for exposure-only rows looks unusual for a negative
    # label - confirm this is the intended target encoding.
    base_df = pd.merge(click_df, exposure_df, how="outer", indicator="Exist")
    base_df["label"] = np.where(base_df["Exist"] == "right_only", 0.5, 1.0)
    base_df.drop("Exist", inplace=True, axis=1)

    # Device features: blank out every null, align the key name with the
    # click/exposure frames, then expand the tag columns.
    device_fe_df.fillna("", inplace=True)
    device_fe_df.rename(columns={"cl_id": "device_id"}, inplace=True)
    _expand_tag_columns(device_fe_df, _DEVICE_TAG_COLUMNS)

    # Card features: only the tag columns are blanked, so nulls in the other
    # columns still reach the null check below. The result is assigned back
    # because ``frame[cols].fillna(..., inplace=True)`` would only fill a
    # temporary copy and leave ``tractate_fe_df`` untouched.
    card_sources = [source for source, _ in _CARD_TAG_COLUMNS]
    tractate_fe_df[card_sources] = tractate_fe_df[card_sources].fillna("")
    _expand_tag_columns(tractate_fe_df, _CARD_TAG_COLUMNS)

    # Both merges use pandas' defaults: inner join on the common columns.
    df = pd.merge(pd.merge(base_df, device_fe_df), tractate_fe_df)

    # Refuse to hand out a frame that still contains nulls.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    if not null_counts.empty:
        print(null_counts)
        raise ValueError("dataframe nulls")
    return df
if __name__ == "__main__":
    # Smoke run: build the merged feature frame and show its first three rows.
    df = device_tractae_fe()
    print(df.head(3))
dssm/get_tractate_data.py
View file @
44fbe3c4
...
...
@@ -406,13 +406,13 @@ if __name__ == "__main__":
start
,
end
=
get_ndays_before_no_minus
(
days
),
get_ndays_before_no_minus
(
1
)
click_df
=
get_click_data
(
spark
,
card_type
,
start
,
end
)
save_df_to_csv
(
click_df
,
"tracate_click.csv"
)
save_df_to_csv
(
click_df
,
"trac
t
ate_click.csv"
)
exposure_df
=
get_exposure_data
(
spark
,
card_type
,
start
,
end
)
save_df_to_csv
(
exposure_df
,
"tracate_exposure.csv"
)
save_df_to_csv
(
exposure_df
,
"trac
t
ate_exposure.csv"
)
tractate_feature_df
=
get_card_feature_df
(
spark
,
card_type
,
end
)
save_df_to_csv
(
tractate_feature_df
,
"tracate_feature.csv"
)
save_df_to_csv
(
tractate_feature_df
,
"trac
t
ate_feature.csv"
)
device_feature_df
=
get_device_tags
(
spark
)
save_df_to_csv
(
device_feature_df
,
"device_feature.csv"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment