Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_embedding
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_embedding
Commits
566d56d6
Commit
566d56d6
authored
Nov 05, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
get shape
parent
44fbe3c4
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
5 deletions
+13
-5
dssm_model.py
dssm/dssm_model.py
+9
-5
get_tractate_data.py
dssm/get_tractate_data.py
+4
-0
No files found.
dssm/dssm_model.py
View file @
566d56d6
...
@@ -153,12 +153,14 @@ def device_tractae_fe():
...
@@ -153,12 +153,14 @@ def device_tractae_fe():
exposure_df
=
get_df
(
"tractate_exposure.csv"
)
exposure_df
=
get_df
(
"tractate_exposure.csv"
)
device_fe_df
=
get_df
(
"device_feature.csv"
)
device_fe_df
=
get_df
(
"device_feature.csv"
)
tractate_fe_df
=
get_df
(
"tractate_feature.csv"
)
tractate_fe_df
=
get_df
(
"tractate_feature.csv"
)
# print(click_df.head(3)
)
print
(
click_df
.
shape
)
# print(exposure_df.head(3)
)
print
(
exposure_df
.
shape
)
# print(device_fe_df.head(3)
)
print
(
device_fe_df
.
shape
)
# print(tractate_fe_df.head(3)
)
print
(
tractate_fe_df
.
shape
)
#
#
click_df
.
drop
(
"partition_date"
,
inplace
=
True
,
axis
=
1
)
exposure_df
.
drop
(
"partition_date"
,
inplace
=
True
,
axis
=
1
)
base_df
=
pd
.
merge
(
click_df
,
exposure_df
,
how
=
"outer"
,
indicator
=
"Exist"
)
base_df
=
pd
.
merge
(
click_df
,
exposure_df
,
how
=
"outer"
,
indicator
=
"Exist"
)
base_df
[
"label"
]
=
np
.
where
(
base_df
[
"Exist"
]
==
"right_only"
,
0.5
,
1.0
)
base_df
[
"label"
]
=
np
.
where
(
base_df
[
"Exist"
]
==
"right_only"
,
0.5
,
1.0
)
base_df
.
drop
(
"Exist"
,
inplace
=
True
,
axis
=
1
)
base_df
.
drop
(
"Exist"
,
inplace
=
True
,
axis
=
1
)
...
@@ -245,6 +247,8 @@ def device_tractae_fe():
...
@@ -245,6 +247,8 @@ def device_tractae_fe():
#
#
df
=
pd
.
merge
(
pd
.
merge
(
base_df
,
device_fe_df
),
tractate_fe_df
)
df
=
pd
.
merge
(
pd
.
merge
(
base_df
,
device_fe_df
),
tractate_fe_df
)
# a = pd.merge(base_df, tractate_fe_df, how="left", left_on="card_id", right_on="card_id")
nullseries
=
df
.
isnull
()
.
sum
()
nullseries
=
df
.
isnull
()
.
sum
()
nulls
=
nullseries
[
nullseries
>
0
]
nulls
=
nullseries
[
nullseries
>
0
]
if
nulls
.
any
():
if
nulls
.
any
():
...
@@ -255,4 +259,4 @@ def device_tractae_fe():
...
@@ -255,4 +259,4 @@ def device_tractae_fe():
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
df
=
device_tractae_fe
()
df
=
device_tractae_fe
()
print
(
df
.
head
(
3
))
print
(
df
.
head
(
3
)
,
df
.
shape
)
dssm/get_tractate_data.py
View file @
566d56d6
...
@@ -407,14 +407,18 @@ if __name__ == "__main__":
...
@@ -407,14 +407,18 @@ if __name__ == "__main__":
click_df
=
get_click_data
(
spark
,
card_type
,
start
,
end
)
click_df
=
get_click_data
(
spark
,
card_type
,
start
,
end
)
save_df_to_csv
(
click_df
,
"tractate_click.csv"
)
save_df_to_csv
(
click_df
,
"tractate_click.csv"
)
print
(
click_df
.
shape
)
exposure_df
=
get_exposure_data
(
spark
,
card_type
,
start
,
end
)
exposure_df
=
get_exposure_data
(
spark
,
card_type
,
start
,
end
)
save_df_to_csv
(
exposure_df
,
"tractate_exposure.csv"
)
save_df_to_csv
(
exposure_df
,
"tractate_exposure.csv"
)
print
(
exposure_df
.
shape
)
tractate_feature_df
=
get_card_feature_df
(
spark
,
card_type
,
end
)
tractate_feature_df
=
get_card_feature_df
(
spark
,
card_type
,
end
)
save_df_to_csv
(
tractate_feature_df
,
"tractate_feature.csv"
)
save_df_to_csv
(
tractate_feature_df
,
"tractate_feature.csv"
)
print
(
tractate_feature_df
.
shape
)
device_feature_df
=
get_device_tags
(
spark
)
device_feature_df
=
get_device_tags
(
spark
)
save_df_to_csv
(
device_feature_df
,
"device_feature.csv"
)
save_df_to_csv
(
device_feature_df
,
"device_feature.csv"
)
print
(
device_feature_df
.
shape
)
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/dssm/get_tractate_data.py
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/dssm/get_tractate_data.py
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment