Commit 40c420da authored by 赵威's avatar 赵威

update model path

parent 56df5503
......@@ -61,7 +61,7 @@ def main():
diary_dict = diary_fe.get_diary_dict_from_redis()
print("redis data: " + str(len(device_dict)) + " " + str(len(diary_dict)))
save_path = "/home/gmuser/data/models/1596018742"
save_path = "/home/gmuser/data/models/1596077883"
predict_fn = tf.contrib.predictor.from_saved_model(save_path)
# device_id = "androidid_a25a1129c0b38f7b"
......
import timeit
import pandas as pd
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import common_elements, nth_element
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
DIARY_COLUMNS = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
......@@ -73,11 +73,13 @@ def diary_feature_engineering(df):
diary_df["is_have_pure_reply"] = diary_df["is_have_pure_reply"].astype(int)
diary_df["is_have_reply"] = diary_df["is_have_reply"].astype(int)
diary_df = diary_df[DIARY_COLUMNS]
print("diary:")
nullseries = diary_df.isnull().sum()
print(nullseries[nullseries > 0])
print(diary_df.shape)
return diary_df[DIARY_COLUMNS]
return diary_df
def join_features(device_df, diary_df, cc_df):
......@@ -148,6 +150,47 @@ def join_features(device_df, diary_df, cc_df):
return df
def build_features(df):
    """Assemble the TF feature columns for the ESMM model.

    Numeric columns are bucketized with boundaries derived from ``df``;
    ``card_id`` / ``device_id`` are hashed and embedded (embedding dimension
    heuristic: 4th root of the column size), and every other categorical
    column becomes a one-hot indicator over the vocabulary observed in ``df``.
    """
    # TODO: column lists are still hard-coded.
    int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]

    numeric_features = [
        fc.bucketized_column(fc.numeric_column(name, dtype=tf.int64), boundaries=create_boundaries(df, name))
        for name in int_columns
    ]
    numeric_features += [
        fc.bucketized_column(fc.numeric_column(name), boundaries=create_boundaries(df, name))
        for name in float_columns
    ]

    # TODO: revisit which categorical columns to keep or ignore.
    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_ignore_columns = []

    categorical_features = []
    for name in categorical_columns:
        if name in categorical_ignore_columns:
            continue
        if name == "card_id":
            # int64 ids: hash into 20k buckets, then embed.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 20000, dtype=tf.int64), dimension=dim))
        elif name == "device_id":
            # default (string) dtype ids: larger hash space, same dimension heuristic.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 200000), dimension=dim))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(name, create_vocabulary_list(df, name))))

    return numeric_features + categorical_features
def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
time_1 = timeit.default_timer()
device_info = device_dict.get(device_id, {}).copy()
......
import pandas as pd
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
TRACTATE_COLUMNS = [
"card_id", "is_pure_author", "is_have_pure_reply", "is_have_reply", "content_level", "show_tag_id", "reply_num",
......@@ -11,12 +15,13 @@ def read_csv_data(dataset_path):
tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("tractate_click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("tractate_click_cvr.csv"), sep="|")
return tractate_df, click_df, conversion_df
# TODO
return tractate_df.sample(5000), click_df.sample(10000), conversion_df
def get_tractate_from_redis():
    """Fetch per-tractate feature dicts from redis.

    return: {tractate_id: {first_demands: [], is_pure_author: 1}}
    """
    # Placeholder — not implemented yet; always returns None.
    pass
......@@ -44,15 +49,122 @@ def tractate_feature_engineering(tractate_df):
df["is_have_pure_reply"] = df["is_have_pure_reply"].astype(int)
df["is_have_reply"] = df["is_have_reply"].astype(int)
df = df[TRACTATE_COLUMNS]
print("tractate:")
nullseries = df.isnull().sum()
print(nullseries[nullseries > 0])
print(df.shape)
return df[TRACTATE_COLUMNS]
return df
def join_features(device_df, tractate_df, cc_df):
    """Join device-, tractate- and click/conversion-side frames into one training frame.

    Inner-joins ``cc_df`` onto ``device_df`` via ``device_id == cl_id``, then
    joins the tractate features via ``card_id``. For each tag-list pair
    produced by the merge (``*_x`` from the device side, ``*_y`` from the
    content side) it derives:
      * the element-wise intersection list (e.g. ``first_demands``),
      * the leading device/content element (``device_fd`` / ``content_fd``),
      * the first three intersection elements (``fd1``..``fd3``).
    The raw list columns are dropped before returning.

    NOTE(review): the original built ``first_positions`` from
    ``first_positions_x`` and ``second_positions_y`` — fixed here to use
    ``first_positions_y``.
    """
    a = pd.merge(device_df, cc_df, how="inner", left_on="device_id", right_on="cl_id")
    df = pd.merge(a, tractate_df, how="inner", left_on="card_id", right_on="card_id")

    # (short name, list column) pairs shared by every derived feature below.
    tag_columns = [("fd", "first_demands"), ("sd", "second_demands"), ("fs", "first_solutions"),
                   ("ss", "second_solutions"), ("fp", "first_positions"), ("sp", "second_positions"),
                   ("p", "projects")]

    # Intersection of the device-side and content-side tag lists.
    for _, col in tag_columns:
        df[col] = df[[col + "_x", col + "_y"]].apply(lambda x: common_elements(*x), axis=1)

    # Leading element of the device-side list.
    for short, col in tag_columns:
        df["device_" + short] = df[col + "_x"].apply(lambda x: nth_element(x, 0))

    # Leading element of the content-side list.
    for short, col in tag_columns:
        df["content_" + short] = df[col + "_y"].apply(lambda x: nth_element(x, 0))

    # First three elements of the intersection list (bind i as a default to
    # avoid the late-binding closure pitfall).
    for short, col in tag_columns:
        for i in (0, 1, 2):
            df[short + str(i + 1)] = df[col].apply(lambda x, i=i: nth_element(x, i))

    # Debug report: columns that still contain nulls, plus the joined shape.
    print("df:")
    nullseries = df.isnull().sum()
    print(nullseries[nullseries > 0])
    print(df.shape)

    # Drop the join key and every raw list column now that the scalar
    # features have been extracted.
    drop_columns = ["cl_id"]
    for _, col in tag_columns:
        drop_columns += [col + "_x", col + "_y", col]
    df.drop(drop_columns, inplace=True, axis=1)
    return df
def build_features(df):
    """Assemble the TF feature columns for the ESMM model.

    Numeric columns are bucketized with boundaries derived from ``df``;
    ``card_id`` / ``device_id`` are hashed and embedded (embedding dimension
    heuristic: 4th root of the column size), and every other categorical
    column becomes a one-hot indicator over the vocabulary observed in ``df``.
    """
    # TODO: column lists are still hard-coded.
    int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]

    numeric_features = [
        fc.bucketized_column(fc.numeric_column(name, dtype=tf.int64), boundaries=create_boundaries(df, name))
        for name in int_columns
    ]
    numeric_features += [
        fc.bucketized_column(fc.numeric_column(name), boundaries=create_boundaries(df, name))
        for name in float_columns
    ]

    # TODO: revisit which categorical columns to keep or ignore.
    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_ignore_columns = []

    categorical_features = []
    for name in categorical_columns:
        if name in categorical_ignore_columns:
            continue
        if name == "card_id":
            # int64 ids: hash into 20k buckets, then embed.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 20000, dtype=tf.int64), dimension=dim))
        elif name == "device_id":
            # default (string) dtype ids: larger hash space, same dimension heuristic.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 200000), dimension=dim))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(name, create_vocabulary_list(df, name))))

    return numeric_features + categorical_features
def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
......
import tensorflow as tf
from tensorflow import feature_column as fc
from .utils import create_boundaries, create_vocabulary_list
def build_features(df):
    """Assemble the TF feature columns for the ESMM model.

    Numeric columns are bucketized with boundaries derived from ``df``;
    ``card_id`` / ``device_id`` are hashed and embedded (embedding dimension
    heuristic: 4th root of the column size), and every other categorical
    column becomes a one-hot indicator over the vocabulary observed in ``df``.
    """
    # TODO: column lists are still hard-coded.
    int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]

    numeric_features = [
        fc.bucketized_column(fc.numeric_column(name, dtype=tf.int64), boundaries=create_boundaries(df, name))
        for name in int_columns
    ]
    numeric_features += [
        fc.bucketized_column(fc.numeric_column(name), boundaries=create_boundaries(df, name))
        for name in float_columns
    ]

    # TODO: revisit which categorical columns to keep or ignore.
    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_ignore_columns = []

    categorical_features = []
    for name in categorical_columns:
        if name in categorical_ignore_columns:
            continue
        if name == "card_id":
            # int64 ids: hash into 20k buckets, then embed.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 20000, dtype=tf.int64), dimension=dim))
        elif name == "device_id":
            # default (string) dtype ids: larger hash space, same dimension heuristic.
            dim = int(df[name].size**0.25)
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(name, 200000), dimension=dim))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(name, create_vocabulary_list(df, name))))

    return numeric_features + categorical_features
def esmm_input_fn(dataframe, shuffle=False, batch_size=256):
......
......@@ -13,7 +13,7 @@ from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
from models.esmm.fe import click_fe as click_fe
from models.esmm.diary_model import model_predict_diary
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
......@@ -27,12 +27,13 @@ def main():
# data_path = Path("~/data/cvr_data").expanduser() # local
data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
diary_df, diary_click_df, diary_conversion_df = diary_fe.read_csv_data(data_path)
# print(diary_df.sample(1))
diary_df = diary_fe.diary_feature_engineering(diary_df)
# print(diary_df.sample(1))
device_df = device_fe.read_csv_data(data_path)
# print(diary_df.sample(1))
device_df = device_fe.device_feature_engineering(device_df)
# print(device_df.sample(1))
diary_df = diary_fe.diary_feature_engineering(diary_df)
# print(diary_df.sample(1))
cc_df = click_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
# print(cc_df.sample(1))
df = diary_fe.join_features(device_df, diary_df, cc_df)
......@@ -42,7 +43,7 @@ def main():
train_df, test_df = train_test_split(df, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)
all_features = build_features(df)
all_features = diary_fe.build_features(df)
params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
model_path = str(Path("~/data/model_tmp/").expanduser())
# if os.path.exists(model_path):
......
import datetime
import os
import shutil
import time
from datetime import datetime
from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import tractate_fe as tractate_fe
from models.esmm.fe import click_fe as click_fe
from models.esmm.fe import click_fe, device_fe, tractate_fe
from models.esmm.input_fn import esmm_input_fn
def main():
    """Entry point: load tractate CVR data, engineer features, and prepare the ESMM training inputs."""
    time_begin = time.time()
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # NOTE(review): the local path is active and the server path commented —
    # flip these two lines when deploying; confirm before running on the server.
    data_path = Path("~/data/cvr_data").expanduser() # local
    # data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
    # Load and feature-engineer the three sides of the join:
    # tractate (content), device (user), and click/conversion labels.
    tractate_df, tractate_click_df, tractate_conversion_df = tractate_fe.read_csv_data(data_path)
    tractate_df = tractate_fe.tractate_feature_engineering(tractate_df)
    device_df = device_fe.read_csv_data(data_path)
    device_df = device_fe.device_feature_engineering(device_df)
    cc_df = click_fe.click_feature_engineering(tractate_click_df, tractate_conversion_df)
    df = tractate_fe.join_features(device_df, tractate_df, cc_df)
    # print(df.dtypes)
    # 80/20 train/test split, then a further 80/20 train/validation split.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    # Feature columns are built from the FULL frame so vocabularies/boundaries
    # cover all observed values, not just the training split.
    all_features = tractate_fe.build_features(df)
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    # model_path is computed but not used in this visible chunk — presumably
    # consumed by the estimator setup below; confirm against the full file.
    model_path = str(Path("~/data/model_tmp/").expanduser())
    # if os.path.exists(model_path):
    # shutil.rmtree(model_path)
    total_time = (time.time() - time_begin) / 60
    print("total cost {:.2f} mins at {}".format(total_time, datetime.now()))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.