Commit 932e5762 authored by 赵威

update fe

parent f55521ba
@@ -10,8 +10,8 @@ import pandas as pd
 import tensorflow as tf
 from sklearn.model_selection import train_test_split
-from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering,
-                            get_device_dict_from_redis, get_diary_dict_from_redis, join_features, read_csv_data)
+from models.esmm import device_fe as device_fe
+from models.esmm import diary_fe as diary_fe
 from models.esmm.input_fn import build_features, esmm_input_fn
 from models.esmm.model import esmm_model_fn, model_export, model_predict_diary
@@ -25,15 +25,18 @@ def main():
     # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-    # device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data").expanduser())
-    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("/srv/apps/node2vec_git/cvr_data/"))
+    # device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("~/data/cvr_data").expanduser())
+    device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("/srv/apps/node2vec_git/cvr_data/"))
     # print(diary_df.sample(1))
-    device_df = device_feature_engineering(device_df)
-    # print(device_df.sample(1))
-    diary_df = diary_feature_engineering(diary_df)
-    # print(diary_df.sample(1))
-    cc_df = click_feature_engineering(click_df, conversion_df)
-    df = join_features(device_df, diary_df, cc_df)
+    device_df = device_fe.device_feature_engineering(device_df)
+    print(device_df.sample(1))
+    diary_df = diary_fe.diary_feature_engineering(diary_df)
+    print(diary_df.sample(1))
+    cc_df = diary_fe.click_feature_engineering(click_df, conversion_df)
+    print(cc_df.sample(1))
+    df = diary_fe.join_features(device_df, diary_df, cc_df)
+    print(df.sample(1))
+    print(df.dtypes)

     train_df, test_df = train_test_split(df, test_size=0.2)
     train_df, val_df = train_test_split(train_df, test_size=0.2)
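Note: diary_fe.join_features itself is not part of this diff. Purely to illustrate the shape of the pipeline above, a hypothetical merge is sketched here; the join keys device_id and diary_id are assumptions, not taken from the repository.

import pandas as pd

def join_features_sketch(device_df, diary_df, cc_df):
    # Hypothetical stand-in for diary_fe.join_features: attach device- and
    # diary-side features to each click/conversion row. The join keys
    # ("device_id", "diary_id") are assumptions for this sketch.
    df = cc_df.merge(device_df, how="left", on="device_id")
    df = df.merge(diary_df, how="left", on="diary_id")
    return df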
@@ -41,8 +44,8 @@ def main():
     all_features = build_features(df)
     params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
     model_path = str(Path("~/data/model_tmp/").expanduser())
-    if os.path.exists(model_path):
-        shutil.rmtree(model_path)
+    # if os.path.exists(model_path):
+    # shutil.rmtree(model_path)

     session_config = tf.compat.v1.ConfigProto()
     session_config.gpu_options.allow_growth = True
@@ -50,7 +53,8 @@ def main():
     estimator_config = tf.estimator.RunConfig(session_config=session_config)
     model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path, config=estimator_config)
-    train_spec = tf.estimator.TrainSpec(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), max_steps=50000)
+    # TODO 50000
+    train_spec = tf.estimator.TrainSpec(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), max_steps=20000)
     eval_spec = tf.estimator.EvalSpec(input_fn=lambda: esmm_input_fn(val_df, shuffle=False))
     tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
@@ -76,8 +80,8 @@ def main():
     # "16195283", "16838351", "17161073", "17297878", "17307484", "17396235", "16418737", "16995481", "17312201", "12237988"
     # ]
-    device_dict = get_device_dict_from_redis()
-    diary_dict = get_diary_dict_from_redis()
+    device_dict = diary_fe.get_device_dict_from_redis()
+    diary_dict = diary_fe.get_diary_dict_from_redis()
     device_ids = list(device_dict.keys())[:20]
     diary_ids = list(diary_dict.keys())
...
from utils.cache import redis_db_client

# "channel_first", "city_first", "model_first",
# Device-side columns returned by device_feature_engineering below.
DIARY_DEVICE_COLUMNS = [
    "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
    "price_sensitive_history", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions",
    "second_positions", "projects"
]
def get_device_dict_from_redis():
    """
    return: {device_id: {first_demands: [], city_first: ""}}
    """
    # TODO
    db_key = "cvr:db:device2"
    column_key = db_key + ":column"
    # Column names are stored as a single "|"-separated string under <db_key>:column.
    columns = str(redis_db_client.get(column_key), "utf-8").split("|")
    d = redis_db_client.hgetall(db_key)
    res = {}
    for i in d.values():
        # Each hash value is one device row, "|"-separated in the same column order.
        row_list = str(i, "utf-8").split("|")
        tmp = {}
        for (index, elem) in enumerate(row_list):
            col_name = columns[index]
            if col_name in [
                    "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions",
                    "second_positions", "projects"
            ]:
                # Multi-valued columns are ","-separated inside their slot.
                tmp[col_name] = elem.split(",")
            else:
                tmp[col_name] = elem
        res[tmp["device_id"]] = tmp
    return res
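For reference, the Redis layout that get_device_dict_from_redis expects can be reproduced as follows. The key names come from the code above; the client construction and all field values are illustrative only.

import redis

client = redis.Redis(host="localhost", port=6379)  # stand-in for utils.cache.redis_db_client

db_key = "cvr:db:device2"
columns = ["device_id", "active_type", "first_demands", "projects"]  # illustrative subset
client.set(db_key + ":column", "|".join(columns))

# One hash field per device: the row is "|"-joined in column order,
# and multi-valued columns are ","-joined inside their slot.
row = ["dev_001", "new_user", "demand_a,demand_b", "project_x,project_y"]
client.hset(db_key, "dev_001", "|".join(row))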
def device_feature_engineering(df):
    device_df = df.copy()

    # Multi-valued columns arrive as ","-separated strings; split them into lists.
    device_df["first_demands"] = device_df["first_demands"].str.split(",")
    device_df["second_demands"] = device_df["second_demands"].str.split(",")
    device_df["first_solutions"] = device_df["first_solutions"].str.split(",")
    device_df["second_solutions"] = device_df["second_solutions"].str.split(",")
    device_df["first_positions"] = device_df["first_positions"].str.split(",")
    device_df["second_positions"] = device_df["second_positions"].str.split(",")
    device_df["projects"] = device_df["projects"].str.split(",")

    # str.split leaves NaN for missing values; replace those with empty lists.
    device_df["first_demands"] = device_df["first_demands"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["second_demands"] = device_df["second_demands"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["first_solutions"] = device_df["first_solutions"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["second_solutions"] = device_df["second_solutions"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["first_positions"] = device_df["first_positions"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["second_positions"] = device_df["second_positions"].apply(lambda d: d if isinstance(d, list) else [])
    device_df["projects"] = device_df["projects"].apply(lambda d: d if isinstance(d, list) else [])

    device_df["city_first"] = device_df["city_first"].fillna("")
    device_df["model_first"] = device_df["model_first"].fillna("")

    # Report any columns that still contain nulls.
    nullseries = device_df.isnull().sum()
    print("device:")
    print(nullseries[nullseries > 0])
    print(device_df.shape)

    return device_df[DIARY_DEVICE_COLUMNS]
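The seven multi-valued columns above are split and NaN-filled one by one. As a sketch only, an equivalent loop-based version (omitting the diagnostic prints) could look like this:

MULTI_VALUE_COLUMNS = [
    "first_demands", "second_demands", "first_solutions", "second_solutions",
    "first_positions", "second_positions", "projects"
]

def device_feature_engineering_compact(df):
    # Behaviorally equivalent sketch of device_feature_engineering,
    # minus the null-count/shape prints.
    device_df = df.copy()
    for col in MULTI_VALUE_COLUMNS:
        device_df[col] = device_df[col].str.split(",")
        device_df[col] = device_df[col].apply(lambda d: d if isinstance(d, list) else [])
    for col in ["city_first", "model_first"]:
        device_df[col] = device_df[col].fillna("")
    return device_df[DIARY_DEVICE_COLUMNS]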
@@ -6,7 +6,7 @@ from tensorflow import feature_column as fc
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.ops.losses import losses
-from .fe import device_diary_fe
+from .diary_fe import device_diary_fe
 from .utils import common_elements, nth_element
...