Commit be82b754 authored by 赵威

update columns

parent a23369bd
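The diff below hoists the column lists used by build_features into module-level constants (_int_columns, _float_columns, _categorical_columns) in both the diary and tractate feature modules. build_features also relies on two helpers, create_boundaries and create_vocabulary_list, which are defined elsewhere in the repository and untouched by this commit. For readers following the feature-column calls, here is a hypothetical sketch of what such helpers might compute (the names come from the diff, the bodies are assumed, not the repository's actual implementations):

# Hypothetical sketch only: the real create_boundaries / create_vocabulary_list
# are defined elsewhere in this repository and are not part of this diff.
import numpy as np


def create_boundaries(df, col, num_buckets=10):
    # Quantile-based boundaries for fc.bucketized_column: interior quantiles
    # of the column, deduplicated and sorted as the API requires.
    qs = np.linspace(0, 1, num_buckets + 1)[1:-1]
    return sorted(set(float(v) for v in df[col].quantile(qs)))


def create_vocabulary_list(df, col):
    # Distinct values of a categorical column, used as the vocabulary list.
    return df[col].dropna().unique().tolist()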
@@ -12,6 +12,15 @@ DIARY_COLUMNS = [
"one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions", "second_solutions",
"first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "device_fd", "content_fd", "fd1", "fd2",
"fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3", "device_ss",
"content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3", "device_sp", "content_sp", "sp1", "sp2",
"sp3", "device_p", "content_p", "p1", "p2", "p3"
]
def read_csv_data(dataset_path):
@@ -152,41 +161,26 @@ def join_features(device_df, diary_df, cc_df):
def build_features(df):
    # TODO
-    int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
-    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    numeric_features = []
-    for col in (int_columns + float_columns):
-        if col in int_columns:
+    for col in (_int_columns + _float_columns):
+        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
    # TODO
-    categorical_columns = [
-        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
-        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
-        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
-        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
-        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
-    ]
-    categorical_ignore_columns = []
    categorical_features = []
-    for col in categorical_columns:
-        if col not in categorical_ignore_columns:
-            if col == "card_id":
-                categorical_features.append(
-                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
-                                        dimension=int(df[col].size**0.25)))
-            elif col == "device_id":
-                categorical_features.append(
-                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
-            else:
-                categorical_features.append(
-                    fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
+    for col in _categorical_columns:
+        if col == "card_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
+                                    dimension=int(df[col].size**0.25)))
+        elif col == "device_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
+        else:
+            categorical_features.append(
+                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    all_features = (numeric_features + categorical_features)
    return all_features
......
@@ -11,6 +11,16 @@ TRACTATE_COLUMNS = [
"first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "reply_num", "reply_pure_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "thirty_ctr", "sixty_ctr", "ninety_ctr", "history_ctr"]
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "show_tag_id", "device_fd", "content_fd",
"fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3",
"device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3", "device_sp", "content_sp",
"sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
]
def read_csv_data(dataset_path):
    tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
@@ -120,27 +130,20 @@ def join_features(device_df, tractate_df, cc_df):
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
]
# for col in drop_columns:
# if col in df.columns:
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1)
return df
def build_features(df):
# TODO
int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
numeric_features = []
for col in (int_columns + float_columns):
if col in int_columns:
for col in (_int_columns + _float_columns):
if col in _int_columns:
numeric_features.append(
fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
else:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
# TODO
categorical_columns = [
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
"price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
"device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
@@ -148,21 +151,18 @@ def build_features(df):
"device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
]
categorical_ignore_columns = []
categorical_features = []
for col in categorical_columns:
if col not in categorical_ignore_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
for col in _categorical_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
all_features = (numeric_features + categorical_features)
return all_features
......
@@ -25,6 +25,8 @@ def main():
    cc_df = click_fe.click_feature_engineering(tractate_click_df, tractate_conversion_df)
    df = tractate_fe.join_features(device_df, tractate_df, cc_df)
+    for i in df.columns:
+        print(i)
    # print(df.dtypes)
    train_df, test_df = train_test_split(df, test_size=0.2)
......
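For context only: the feature_column objects returned by build_features are typically consumed by a Keras input layer. A minimal sketch, assuming TensorFlow 2.x, a pandas DataFrame with a hypothetical binary label column named "label", and training code that is not part of this commit:

import tensorflow as tf


def df_to_dataset(df, label_col="label", batch_size=256, shuffle=True):
    # Turn a pandas DataFrame into a tf.data.Dataset of (feature dict, label) pairs.
    df = df.copy()
    labels = df.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    return ds.batch(batch_size)


def build_model(all_features):
    # DenseFeatures materializes the feature_column definitions (bucketized,
    # embedding and indicator columns) into a single dense input tensor.
    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(all_features),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])
    return model

Under those assumptions, main() would continue with something like model.fit(df_to_dataset(train_df), validation_data=df_to_dataset(test_df, shuffle=False)).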