Commit 430956d8 authored by 赵威

add prediction

parent aeb4ca60
import time
from datetime import datetime
from pathlib import Path
import tensorflow as tf
...@@ -6,12 +8,13 @@ from sklearn.model_selection import train_test_split
from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, join_features,
                            read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict
tf.compat.v1.enable_eager_execution()


def main():
    time_begin = time.time()
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
    device_df = device_feature_engineering(device_df)
    diary_df = diary_feature_engineering(diary_df)
...@@ -27,12 +30,18 @@ def main():
    model_path = str(Path("~/Desktop/models/").expanduser())
    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    print("train")
    model.train(input_fn=lambda: esmm_input_fn(train_df.sample(100000), shuffle=True), steps=5000)
    model.evaluate(input_fn=lambda: esmm_input_fn(val_df.sample(100000), False), steps=5000)
    save_path = model_export(model, all_features, model_path)
    # predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
    # print(next(iter(predictions)))
    model_predict(test_df.sample(300), save_path)
    total_time = (time.time() - time_begin) / 60
    print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
if __name__ == "__main__":
...
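For context, esmm_input_fn is imported from models/esmm/input_fn.py but its body is outside this diff; the call sites above only assume a (df, shuffle) signature returning batched (features, labels) data. A minimal sketch of that contract, with the batch size and label layout as guesses:

# Hypothetical sketch -- the real esmm_input_fn is not part of this diff.
# Assumes the two label columns used elsewhere in this commit and an
# arbitrary batch size of 256.
def esmm_input_fn(df, shuffle):
    labels = {"click_label": df["click_label"], "conversion_label": df["conversion_label"]}
    features = dict(df.drop(["click_label", "conversion_label"], axis=1))
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    return dataset.batch(256)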
...@@ -126,8 +126,8 @@ def join_features(device_df, diary_df, cc_df): ...@@ -126,8 +126,8 @@ def join_features(device_df, diary_df, cc_df):
df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1)) df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2)) df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0)) df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 1)) df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 2)) df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0)) df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1)) df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2)) df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
...@@ -154,5 +154,8 @@ def join_features(device_df, diary_df, cc_df):
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x", "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects" "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
] ]
# for col in drop_columns:
# if col in df.columns:
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1) df.drop(drop_columns, inplace=True, axis=1)
return df return df
models/esmm/input_fn.py
...@@ -13,21 +13,26 @@ def build_features(df):
    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_ignore_columns = []
    categorical_features = []
    for col in categorical_columns:
        if col not in categorical_ignore_columns:
            if col == "card_id":
                categorical_features.append(
                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                        dimension=int(df[col].size**0.25)))
            elif col == "device_id":
                categorical_features.append(
                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
            else:
                categorical_features.append(
                    fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    all_features = (numeric_features + categorical_features)
    return all_features
...
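create_vocabulary_list is also referenced but not shown in this diff; build_features only needs it to return the distinct values of a column to seed categorical_column_with_vocabulary_list. A plausible sketch under that assumption:

# Hypothetical sketch -- the real create_vocabulary_list is not shown here.
# categorical_column_with_vocabulary_list only needs the distinct values.
def create_vocabulary_list(df, col):
    return df[col].astype(str).unique().tolist()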
models/esmm/model.py
...@@ -70,12 +70,45 @@ def esmm_model_fn(features, labels, mode, params):
    return res
def model_export(model, feature_columns, save_path):
    feature_spec = fc.make_parse_example_spec(feature_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    # export_saved_model returns the timestamped export directory as bytes
    path = str(model.export_saved_model(save_path, serving_input_fn), encoding="utf-8")
    return path
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def model_predict(inputs, model_path):
    predict_fn = tf.contrib.predictor.from_saved_model(model_path)
    int_columns = [
        "active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "topic_num", "favor_num", "vote_num"
    ]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    examples = []
    for index, row in inputs.iterrows():
        features = {}
        for col, value in row.iteritems():
            if col in ["click_label", "conversion_label"]:
                # labels are not part of the serving feature spec
                pass
            elif col in int_columns:
                features[col] = _int64_feature(value)
            elif col in float_columns:
                features[col] = _float_feature(value)
            else:
                features[col] = _bytes_feature(str(value).encode(encoding="utf-8"))
        example = tf.train.Example(features=tf.train.Features(feature=features))
        examples.append(example.SerializeToString())
    predictions = predict_fn({"examples": examples})
    print(predictions)
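predict_fn returns a dict of NumPy arrays, one entry per export output of esmm_model_fn (not shown in this diff), each batched over examples. A hedged sketch of consuming the result instead of printing it, with "pctcvr" as an assumed output key:

# Hypothetical: "pctcvr" is an assumed key; the real names come from the
# export_outputs defined in esmm_model_fn, which this diff does not show.
scores = predictions["pctcvr"]
for (_, row), score in zip(inputs.iterrows(), scores):
    print(row.get("card_id"), float(score))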