Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm_strategy_cvr
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm_strategy_cvr
Commits
735c83c9
Commit
735c83c9
authored
Jul 23, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add size
parent
daaaffce
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
81 deletions
+4
-81
fe.py
src/models/esmm/fe.py
+4
-81
No files found.
src/models/esmm/fe.py
View file @
735c83c9
...
@@ -12,23 +12,9 @@ def read_csv_data(dataset_path):
...
@@ -12,23 +12,9 @@ def read_csv_data(dataset_path):
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary.csv"
),
sep
=
"|"
)
diary_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click.csv"
),
sep
=
"|"
)
click_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click.csv"
),
sep
=
"|"
)
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_cvr.csv"
),
sep
=
"|"
)
conversion_df
=
pd
.
read_csv
(
dataset_path
.
joinpath
(
"diary_click_cvr.csv"
),
sep
=
"|"
)
# TODO remove sample
# return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
return
device_df
,
diary_df
,
click_df
,
conversion_df
return
device_df
,
diary_df
,
click_df
,
conversion_df
# def _get_data_from_redis(key):
# column_key = key + ":column"
# d = redis_db_client.hgetall(key)
# tmp = d.values()
# lists = []
# for i in tmp:
# lists.append(str(i, "utf-8").split("|"))
# columns = str(redis_db_client.get(column_key), "utf-8").split("|")
# df = pd.DataFrame(lists, columns=columns)
# return df
def
get_device_dict_from_redis
():
def
get_device_dict_from_redis
():
db_key
=
"cvr:db:device"
db_key
=
"cvr:db:device"
column_key
=
db_key
+
":column"
column_key
=
db_key
+
":column"
...
@@ -100,7 +86,7 @@ def device_feature_engineering(df):
...
@@ -100,7 +86,7 @@ def device_feature_engineering(df):
nullseries
=
device_df
.
isnull
()
.
sum
()
nullseries
=
device_df
.
isnull
()
.
sum
()
print
(
"device:"
)
print
(
"device:"
)
print
(
nullseries
[
nullseries
>
0
])
print
(
nullseries
[
nullseries
>
0
])
#
print(device_df.size)
print
(
device_df
.
size
)
device_columns
=
[
device_columns
=
[
"device_id"
,
"active_type"
,
"active_days"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
"device_id"
,
"active_type"
,
"active_days"
,
"past_consume_ability_history"
,
"potential_consume_ability_history"
,
...
@@ -142,7 +128,7 @@ def diary_feature_engineering(df, from_redis=False):
...
@@ -142,7 +128,7 @@ def diary_feature_engineering(df, from_redis=False):
print
(
"diary:"
)
print
(
"diary:"
)
nullseries
=
diary_df
.
isnull
()
.
sum
()
nullseries
=
diary_df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
print
(
nullseries
[
nullseries
>
0
])
#
print(diary_df.size)
print
(
diary_df
.
size
)
diary_columns
=
[
diary_columns
=
[
"card_id"
,
"is_pure_author"
,
"is_have_reply"
,
"is_have_pure_reply"
,
"content_level"
,
"topic_num"
,
"favor_num"
,
"vote_num"
,
"card_id"
,
"is_pure_author"
,
"is_have_reply"
,
"is_have_pure_reply"
,
"content_level"
,
"topic_num"
,
"favor_num"
,
"vote_num"
,
...
@@ -165,7 +151,7 @@ def click_feature_engineering(click_df, conversion_df):
...
@@ -165,7 +151,7 @@ def click_feature_engineering(click_df, conversion_df):
print
(
"click:"
)
print
(
"click:"
)
nullseries
=
cc_df
.
isnull
()
.
sum
()
nullseries
=
cc_df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
print
(
nullseries
[
nullseries
>
0
])
#
print(cc_df.size)
print
(
cc_df
.
size
)
return
cc_df
return
cc_df
...
@@ -223,7 +209,7 @@ def join_features(device_df, diary_df, cc_df):
...
@@ -223,7 +209,7 @@ def join_features(device_df, diary_df, cc_df):
print
(
"df:"
)
print
(
"df:"
)
nullseries
=
df
.
isnull
()
.
sum
()
nullseries
=
df
.
isnull
()
.
sum
()
print
(
nullseries
[
nullseries
>
0
])
print
(
nullseries
[
nullseries
>
0
])
#
print(df.size)
print
(
df
.
size
)
drop_columns
=
[
drop_columns
=
[
"cl_id"
,
"first_demands_x"
,
"first_demands_y"
,
"first_demands"
,
"second_demands_x"
,
"second_demands_y"
,
"second_demands"
,
"cl_id"
,
"first_demands_x"
,
"first_demands_y"
,
"first_demands"
,
"second_demands_x"
,
"second_demands_y"
,
"second_demands"
,
...
@@ -238,69 +224,6 @@ def join_features(device_df, diary_df, cc_df):
...
@@ -238,69 +224,6 @@ def join_features(device_df, diary_df, cc_df):
return
df
return
df
# def join_device_diary(device_id, diary_ids, device_df, diary_df):
# a_df = device_df.loc[device_df["device_id"] == device_id]
# b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)]
# b_df["device_id"] = device_id
# df = pd.merge(a_df, b_df, how="left", on="device_id")
# df["first_demands"] = df[["first_demands_x", "first_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_demands"] = df[["second_demands_x", "second_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["first_solutions"] = df[["first_solutions_x", "first_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_solutions"] = df[["second_solutions_x", "second_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["first_positions"] = df[["first_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_positions"] = df[["second_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["projects"] = df[["projects_x", "projects_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["device_fd"] = df["first_demands_x"].apply(lambda x: nth_element(x, 0))
# df["device_sd"] = df["second_demands_x"].apply(lambda x: nth_element(x, 0))
# df["device_fs"] = df["first_solutions_x"].apply(lambda x: nth_element(x, 0))
# df["device_ss"] = df["second_solutions_x"].apply(lambda x: nth_element(x, 0))
# df["device_fp"] = df["first_positions_x"].apply(lambda x: nth_element(x, 0))
# df["device_sp"] = df["second_positions_x"].apply(lambda x: nth_element(x, 0))
# df["device_p"] = df["projects_x"].apply(lambda x: nth_element(x, 0))
# df["content_fd"] = df["first_demands_y"].apply(lambda x: nth_element(x, 0))
# df["content_sd"] = df["second_demands_y"].apply(lambda x: nth_element(x, 0))
# df["content_fs"] = df["first_solutions_y"].apply(lambda x: nth_element(x, 0))
# df["content_ss"] = df["second_solutions_y"].apply(lambda x: nth_element(x, 0))
# df["content_fp"] = df["first_positions_y"].apply(lambda x: nth_element(x, 0))
# df["content_sp"] = df["second_positions_y"].apply(lambda x: nth_element(x, 0))
# df["content_p"] = df["projects_y"].apply(lambda x: nth_element(x, 0))
# df["fd1"] = df["first_demands"].apply(lambda x: nth_element(x, 0))
# df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
# df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
# df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
# df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
# df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
# df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
# df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
# df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
# df["ss1"] = df["second_solutions"].apply(lambda x: nth_element(x, 0))
# df["ss2"] = df["second_solutions"].apply(lambda x: nth_element(x, 1))
# df["ss3"] = df["second_solutions"].apply(lambda x: nth_element(x, 2))
# df["fp1"] = df["first_positions"].apply(lambda x: nth_element(x, 0))
# df["fp2"] = df["first_positions"].apply(lambda x: nth_element(x, 1))
# df["fp3"] = df["first_positions"].apply(lambda x: nth_element(x, 2))
# df["sp1"] = df["second_positions"].apply(lambda x: nth_element(x, 0))
# df["sp2"] = df["second_positions"].apply(lambda x: nth_element(x, 1))
# df["sp3"] = df["second_positions"].apply(lambda x: nth_element(x, 2))
# df["p1"] = df["projects"].apply(lambda x: nth_element(x, 0))
# df["p2"] = df["projects"].apply(lambda x: nth_element(x, 1))
# df["p3"] = df["projects"].apply(lambda x: nth_element(x, 2))
# drop_columns = [
# "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
# "first_solutions_x", "first_solutions_y", "first_solutions", "second_solutions_x", "second_solutions_y",
# "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
# "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
# ]
# df.drop(drop_columns, inplace=True, axis=1)
# return df
def
device_diary_fe
(
device_id
,
diary_ids
,
device_dict
,
diary_dict
):
def
device_diary_fe
(
device_id
,
diary_ids
,
device_dict
,
diary_dict
):
time_1
=
timeit
.
default_timer
()
time_1
=
timeit
.
default_timer
()
device_info
=
device_dict
.
get
(
device_id
,
{})
.
copy
()
device_info
=
device_dict
.
get
(
device_id
,
{})
.
copy
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment