add size: print DataFrame sizes in the feature-engineering steps and remove dead commented-out code

735c83c9 · 赵威 · daaaffce · 735c83c9
Commit 735c83c9 authored Jul 23, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 81 deletions

fe.py src/models/esmm/fe.py +4 -81

No files found.
--- a/src/models/esmm/fe.py
+++ b/src/models/esmm/fe.py
@@ -12,23 +12,9 @@ def read_csv_data(dataset_path):
    diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|")
    click_df = pd.read_csv(dataset_path.joinpath("diary_click.csv"), sep="|")
    conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"), sep="|")
-    # TODO remove sample
-    # return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
    return device_df, diary_df, click_df, conversion_df


-# def _get_data_from_redis(key):
-#     column_key = key + ":column"
-#     d = redis_db_client.hgetall(key)
-#     tmp = d.values()
-#     lists = []
-#     for i in tmp:
-#         lists.append(str(i, "utf-8").split("|"))
-#     columns = str(redis_db_client.get(column_key), "utf-8").split("|")
-#     df = pd.DataFrame(lists, columns=columns)
-#     return df
-
-
 def get_device_dict_from_redis():
    db_key = "cvr:db:device"
    column_key = db_key + ":column"
@@ -100,7 +86,7 @@ def device_feature_engineering(df):
    nullseries = device_df.isnull().sum()
    print("device:")
    print(nullseries[nullseries > 0])
-    # print(device_df.size)
+    print(device_df.size)

    device_columns = [
        "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
@@ -142,7 +128,7 @@ def diary_feature_engineering(df, from_redis=False):
    print("diary:")
    nullseries = diary_df.isnull().sum()
    print(nullseries[nullseries > 0])
-    # print(diary_df.size)
+    print(diary_df.size)

    diary_columns = [
        "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
@@ -165,7 +151,7 @@ def click_feature_engineering(click_df, conversion_df):
    print("click:")
    nullseries = cc_df.isnull().sum()
    print(nullseries[nullseries > 0])
-    # print(cc_df.size)
+    print(cc_df.size)

    return cc_df

@@ -223,7 +209,7 @@ def join_features(device_df, diary_df, cc_df):
    print("df:")
    nullseries = df.isnull().sum()
    print(nullseries[nullseries > 0])
-    # print(df.size)
+    print(df.size)

    drop_columns = [
        "cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
@@ -238,69 +224,6 @@ def join_features(device_df, diary_df, cc_df):
    return df


-# def join_device_diary(device_id, diary_ids, device_df, diary_df):
-#     a_df = device_df.loc[device_df["device_id"] == device_id]
-#     b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)]
-#     b_df["device_id"] = device_id
-
-#     df = pd.merge(a_df, b_df, how="left", on="device_id")
-
-#     df["first_demands"] = df[["first_demands_x", "first_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["second_demands"] = df[["second_demands_x", "second_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["first_solutions"] = df[["first_solutions_x", "first_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["second_solutions"] = df[["second_solutions_x", "second_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["first_positions"] = df[["first_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["second_positions"] = df[["second_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
-#     df["projects"] = df[["projects_x", "projects_y"]].apply(lambda x: common_elements(*x), axis=1)
-
-#     df["device_fd"] = df["first_demands_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_sd"] = df["second_demands_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_fs"] = df["first_solutions_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_ss"] = df["second_solutions_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_fp"] = df["first_positions_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_sp"] = df["second_positions_x"].apply(lambda x: nth_element(x, 0))
-#     df["device_p"] = df["projects_x"].apply(lambda x: nth_element(x, 0))
-
-#     df["content_fd"] = df["first_demands_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_sd"] = df["second_demands_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_fs"] = df["first_solutions_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_ss"] = df["second_solutions_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_fp"] = df["first_positions_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_sp"] = df["second_positions_y"].apply(lambda x: nth_element(x, 0))
-#     df["content_p"] = df["projects_y"].apply(lambda x: nth_element(x, 0))
-
-#     df["fd1"] = df["first_demands"].apply(lambda x: nth_element(x, 0))
-#     df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
-#     df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
-#     df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
-#     df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
-#     df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
-#     df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
-#     df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
-#     df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
-#     df["ss1"] = df["second_solutions"].apply(lambda x: nth_element(x, 0))
-#     df["ss2"] = df["second_solutions"].apply(lambda x: nth_element(x, 1))
-#     df["ss3"] = df["second_solutions"].apply(lambda x: nth_element(x, 2))
-#     df["fp1"] = df["first_positions"].apply(lambda x: nth_element(x, 0))
-#     df["fp2"] = df["first_positions"].apply(lambda x: nth_element(x, 1))
-#     df["fp3"] = df["first_positions"].apply(lambda x: nth_element(x, 2))
-#     df["sp1"] = df["second_positions"].apply(lambda x: nth_element(x, 0))
-#     df["sp2"] = df["second_positions"].apply(lambda x: nth_element(x, 1))
-#     df["sp3"] = df["second_positions"].apply(lambda x: nth_element(x, 2))
-#     df["p1"] = df["projects"].apply(lambda x: nth_element(x, 0))
-#     df["p2"] = df["projects"].apply(lambda x: nth_element(x, 1))
-#     df["p3"] = df["projects"].apply(lambda x: nth_element(x, 2))
-
-#     drop_columns = [
-#         "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
-#         "first_solutions_x", "first_solutions_y", "first_solutions", "second_solutions_x", "second_solutions_y",
-#         "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
-#         "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
-#     ]
-#     df.drop(drop_columns, inplace=True, axis=1)
-#     return df
-
-
 def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
    time_1 = timeit.default_timer()
    device_info = device_dict.get(device_id, {}).copy()