Commit ef92df4d authored by 赵威

update diary fe

parent be82b754
import timeit
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
from ..utils import common_elements, nth_element
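# Raw diary-side fields pulled for feature engineering: card id and quality flags,
# engagement counters, windowed CTRs, and demand/solution/position/project tags.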
DIARY_COLUMNS = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
"one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions", "second_solutions",
"first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
_categorical_columns = [
INT_COLUMNS = ["active_days", "topic_num", "favor_num", "vote_num"]
FLOAT_COLUMNS = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
CATEGORICAL_COLUMNS = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "device_fd", "content_fd", "fd1", "fd2",
"fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3", "device_ss",
@@ -160,32 +158,6 @@ def join_features(device_df, diary_df, cc_df):
    return df


def build_features(df):
    numeric_features = []
    for col in (_int_columns + _float_columns):
        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    categorical_features = []
    for col in _categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features


def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
    time_1 = timeit.default_timer()
    device_info = device_dict.get(device_id, {}).copy()
......
import tensorflow as tf
from tensorflow import feature_column as fc

from ..utils import create_boundaries, create_vocabulary_list


def build_features(df, int_columns, float_columns, categorical_columns):
    # Numeric columns are bucketized on boundaries derived from the data frame;
    # integer counters keep an explicit int64 dtype.
    numeric_features = []
    for col in (int_columns + float_columns):
        if col in int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    # High-cardinality id columns are hashed and embedded (embedding dimension is
    # roughly the fourth root of the row count); all other categorical columns are
    # one-hot encoded from the vocabulary observed in df.
    categorical_features = []
    for col in categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 400000), dimension=int(df[col].size**0.25)))
        elif col == "show_tag_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 100000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features
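A minimal usage sketch of the shared builder, mirroring the diary training path later in this commit: the column constants are the ones defined above in diary_fe, and df is assumed to be the joined diary feature DataFrame. Because the embedding dimension is the fourth root of the column length, it grows slowly with the size of the training frame.

# Usage sketch (assumes df is the joined diary feature DataFrame).
from models.esmm.fe import diary_fe, fe

all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
# all_features is a flat list of feature columns, handed to the model as params["feature_columns"].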
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
from ..utils import common_elements, nth_element
TRACTATE_COLUMNS = [
"card_id", "is_pure_author", "is_have_pure_reply", "is_have_reply", "content_level", "show_tag_id", "reply_num",
@@ -134,39 +132,5 @@ def join_features(device_df, tractate_df, cc_df):
    return df


def build_features(df):
    numeric_features = []
    for col in (_int_columns + _float_columns):
        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    _categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_features = []
    for col in _categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features


def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
    pass
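device_tractate_fe is still a stub in this commit. Purely as an illustration of where the tractate side is headed after this refactor, the sketch below assumes tractate_fe grows the same INT_COLUMNS / FLOAT_COLUMNS / CATEGORICAL_COLUMNS constants as diary_fe (they are not part of this diff) and feeds them to the shared builder:

# Hypothetical: mirrors the diary path; tractate_fe.INT_COLUMNS etc. are assumed, not in this commit.
from models.esmm.fe import fe, tractate_fe

all_features = fe.build_features(df, tractate_fe.INT_COLUMNS, tractate_fe.FLOAT_COLUMNS, tractate_fe.CATEGORICAL_COLUMNS)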
@@ -9,10 +9,11 @@ from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm.diary_model import model_predict_diary
from models.esmm.fe import click_fe as click_fe
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
from models.esmm.fe import click_fe as click_fe
from models.esmm.diary_model import model_predict_diary
from models.esmm.fe import fe as fe
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
@@ -43,7 +44,7 @@ def main():
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    all_features = diary_fe.build_features(df)
    all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    model_path = str(Path("~/data/model_tmp/").expanduser())
    # if os.path.exists(model_path):
......
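The remainder of main() is elided above. As a rough sketch of how the imported pieces (esmm_model_fn, esmm_input_fn, model_export) would typically be wired together in the rest of the function; the argument lists of the project helpers are assumptions, not taken from this diff:

    # Sketch only: signatures of esmm_input_fn and model_export below are assumed.
    estimator = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    estimator.train(input_fn=lambda: esmm_input_fn(train_df))    # assumed signature
    estimator.evaluate(input_fn=lambda: esmm_input_fn(val_df))   # assumed signature
    model_export(estimator, all_features, model_path)            # assumed signature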