Commit aeb4ca60 authored by 赵威

init project

parent 5f5f0d49
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
from pathlib import Path

import tensorflow as tf
from sklearn.model_selection import train_test_split

from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, join_features,
                            read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export

tf.compat.v1.enable_eager_execution()


def main():
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
    device_df = device_feature_engineering(device_df)
    diary_df = diary_feature_engineering(diary_df)
    cc_df = click_feature_engineering(click_df, conversion_df)
    df = join_features(device_df, diary_df, cc_df)

    # 64% train / 16% validation / 20% test.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)

    all_features = build_features(df)
    params = {"feature_columns": all_features, "hidden_units": [32], "learning_rate": 0.1}
    model_path = str(Path("~/Desktop/models/").expanduser())
    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
    model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
    model_export(model, all_features, model_path)
    predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
    print(next(iter(predictions)))


if __name__ == "__main__":
    main()
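Each element yielded by model.predict is a dict keyed as in the PREDICT branch of esmm_model_fn (models/esmm/model.py below). A minimal sketch, assuming the variables from main() above, of printing the first few scored examples:

# Sketch only: the keys come from the predictions dict built in esmm_model_fn.
for i, pred in enumerate(model.predict(input_fn=lambda: esmm_input_fn(test_df, False))):
    print(pred["ctr_preds"], pred["cvr_preds"], pred["ctcvr_preds"])
    if i >= 4:
        break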
models/esmm/fe.py

import pandas as pd

from .utils import common_elements, nth_element


def read_csv_data(dataset_path):
    device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
    diary_df = pd.read_csv(dataset_path.joinpath("diary_card.csv"), sep="|")
    click_df = pd.read_csv(dataset_path.joinpath("diary_click_ctr.csv"))
    conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"))
    return device_df, diary_df, click_df, conversion_df
def device_feature_engineering(df):
    device_df = df.copy()
    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    for col in list_columns:
        # Split the comma-separated tag string into a list; str.split leaves NaN
        # for missing values, so replace anything that is not a list with [].
        device_df[col] = device_df[col].str.split(",")
        device_df[col] = device_df[col].apply(lambda d: d if isinstance(d, list) else [])

    nullseries = device_df.isnull().sum()
    print("device:")
    print(nullseries[nullseries > 0])

    device_columns = [
        "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions",
        "second_positions", "projects"
    ]
    return device_df[device_columns]
def diary_feature_engineering(df):
    diary_df = df.copy()
    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    for col in list_columns:
        diary_df[col] = diary_df[col].str.split(",")
        diary_df[col] = diary_df[col].apply(lambda d: d if isinstance(d, list) else [])

    diary_df["is_pure_author"] = diary_df["is_pure_author"].astype(int)
    diary_df["is_have_pure_reply"] = diary_df["is_have_pure_reply"].astype(int)
    diary_df["is_have_reply"] = diary_df["is_have_reply"].astype(int)

    print("diary:")
    nullseries = diary_df.isnull().sum()
    print(nullseries[nullseries > 0])

    diary_columns = [
        "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num",
        "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions",
        "second_solutions", "first_positions", "second_positions", "projects"
    ]
    return diary_df[diary_columns]
def click_feature_engineering(click_df, conversion_df):
    # Note: the renames below modify the caller's dataframes in place.
    click_df.rename(columns={"label": "click_label"}, inplace=True)
    conversion_df.rename(columns={"label": "conversion_label"}, inplace=True)
    # Left join keeps every click; clicks without a conversion get label 0.
    cc_df = pd.merge(click_df, conversion_df, how="left", on=["cl_id", "card_id"])
    cc_df.drop(["partition_date_x", "partition_date_y"], axis=1, inplace=True)
    cc_df["conversion_label"].fillna(0, inplace=True)
    print("click:")
    nullseries = cc_df.isnull().sum()
    print(nullseries[nullseries > 0])
    return cc_df
def join_features(device_df, diary_df, cc_df):
    a = pd.merge(device_df, cc_df, how="inner", left_on="device_id", right_on="cl_id")
    df = pd.merge(a, diary_df, how="inner", on="card_id")

    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    short_names = {
        "first_demands": "fd", "second_demands": "sd", "first_solutions": "fs", "second_solutions": "ss",
        "first_positions": "fp", "second_positions": "sp", "projects": "p"
    }

    # Intersect each device-side (_x) tag list with the diary-side (_y) one.
    for col in list_columns:
        df[col] = df[[col + "_x", col + "_y"]].apply(lambda x: common_elements(*x), axis=1)

    # First tag of each side as a standalone categorical feature.
    for col in list_columns:
        df["device_" + short_names[col]] = df[col + "_x"].apply(lambda x: nth_element(x, 0))
    for col in list_columns:
        df["content_" + short_names[col]] = df[col + "_y"].apply(lambda x: nth_element(x, 0))

    # First three common tags per feature, e.g. fd1/fd2/fd3 for first_demands.
    for col in list_columns:
        for i in range(3):
            df[short_names[col] + str(i + 1)] = df[col].apply(lambda x: nth_element(x, i))

    print("df:")
    nullseries = df.isnull().sum()
    print(nullseries[nullseries > 0])

    drop_columns = ["cl_id"]
    for col in list_columns:
        drop_columns.extend([col + "_x", col + "_y", col])
    df.drop(drop_columns, inplace=True, axis=1)
    return df
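A quick toy illustration of how the shared-tag columns above are derived from a device/diary pair, using the helpers from models/esmm/utils.py (the tag values are invented for this sketch):

device_tags = ["eyes", "nose", "skin"]    # e.g. first_demands_x
diary_tags = ["nose", "skin", "teeth"]    # e.g. first_demands_y

common = common_elements(device_tags, diary_tags)  # ["nose", "skin"]
nth_element(common, 0)  # "nose" -> fd1
nth_element(common, 1)  # "skin" -> fd2
nth_element(common, 2)  # ""     -> fd3 (missing positions pad with "")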
models/esmm/input_fn.py

import tensorflow as tf
from tensorflow import feature_column as fc

from .utils import create_boundaries, create_vocabulary_list


def build_features(df):
    numeric_columns = ["active_days", "topic_num", "favor_num", "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    numeric_features = []
    for col in numeric_columns:
        numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3"
    ]
    categorical_features = []
    for col in categorical_columns:
        if col == "card_id":
            # High-cardinality id: hash into 20k buckets, embed with dimension = row_count ** 0.25.
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            # Low-cardinality columns become one-hot indicators over the observed vocabulary.
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    return numeric_features + categorical_features
def esmm_input_fn(dataframe, shuffle=False, batch_size=256):
    df = dataframe.copy()
    target = df[["click_label", "conversion_label"]]
    ds = tf.data.Dataset.from_tensor_slices((dict(df), dict(target)))
    if shuffle:
        ds = ds.shuffle(1000).repeat()
    return ds.batch(batch_size).make_one_shot_iterator().get_next()
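A minimal sketch of inspecting one batch outside the Estimator, assuming a TF 1.x runtime where one-shot iterators work under the eager execution enabled in the entry script, and a train_df produced by join_features:

# get_next() returns a (features, labels) pair of dicts of tensors.
features, labels = esmm_input_fn(train_df, shuffle=False, batch_size=4)
print(labels["click_label"].shape)  # (4,)
print(features["device_id"][:2])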
models/esmm/model.py

import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.python.estimator.canned import head as head_lib
from tensorflow.python.ops.losses import losses


def build_deep_layer(net, params):
    for num_hidden_units in params["hidden_units"]:
        net = tf.layers.dense(net,
                              units=num_hidden_units,
                              activation=tf.nn.relu,
                              kernel_initializer=tf.glorot_uniform_initializer())
    return net
def esmm_model_fn(features, labels, mode, params):
    net = tf.compat.v1.feature_column.input_layer(features, params["feature_columns"])
    # Two towers over the shared input layer: one for CTR, one for CVR.
    last_ctr_layer = build_deep_layer(net, params)
    last_cvr_layer = build_deep_layer(net, params)

    head = head_lib._binary_logistic_or_multi_class_head(n_classes=2,
                                                         weight_column=None,
                                                         label_vocabulary=None,
                                                         loss_reduction=losses.Reduction.SUM)
    ctr_logits = tf.layers.dense(last_ctr_layer, units=head.logits_dimension, kernel_initializer=tf.glorot_uniform_initializer())
    cvr_logits = tf.layers.dense(last_cvr_layer, units=head.logits_dimension, kernel_initializer=tf.glorot_uniform_initializer())
    ctr_preds = tf.sigmoid(ctr_logits)
    cvr_preds = tf.sigmoid(cvr_logits)
    # ESMM: pCTCVR = pCTR * pCVR, defined over the full impression space.
    ctcvr_preds = tf.multiply(ctr_preds, cvr_preds)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "ctr_preds": ctr_preds,
            "cvr_preds": cvr_preds,
            "ctcvr_preds": ctcvr_preds,
        }
        export_outputs = {"prediction": tf.estimator.export.PredictOutput(predictions["cvr_preds"])}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    ctr_labels = tf.reshape(tf.cast(labels["click_label"], tf.float32), (-1, 1))
    cvr_labels = tf.reshape(tf.cast(labels["conversion_label"], tf.float32), (-1, 1))
    optimizer = tf.compat.v1.train.AdagradOptimizer(learning_rate=params.get("learning_rate", 0.03))
    # CTR is supervised directly on its logits; the CVR tower is supervised only
    # through the CTCVR product, so it is never fit on clicked samples alone.
    ctr_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=ctr_labels, logits=ctr_logits))
    ctcvr_loss = tf.reduce_sum(tf.compat.v1.losses.log_loss(labels=cvr_labels, predictions=ctcvr_preds))
    loss = ctr_loss + ctcvr_loss

    if mode == tf.estimator.ModeKeys.EVAL:
        ctr_accuracy = tf.compat.v1.metrics.accuracy(labels=ctr_labels,
                                                     predictions=tf.to_float(tf.greater_equal(ctr_preds, 0.5)))
        ctcvr_accuracy = tf.compat.v1.metrics.accuracy(labels=cvr_labels,
                                                       predictions=tf.to_float(tf.greater_equal(ctcvr_preds, 0.5)))
        ctr_auc = tf.compat.v1.metrics.auc(labels=ctr_labels, predictions=ctr_preds)
        ctcvr_auc = tf.compat.v1.metrics.auc(labels=cvr_labels, predictions=ctcvr_preds)
        metrics = {"ctcvr_accuracy": ctcvr_accuracy, "ctr_accuracy": ctr_accuracy, "ctr_auc": ctr_auc, "ctcvr_auc": ctcvr_auc}
        tf.compat.v1.summary.scalar("ctr_accuracy", ctr_accuracy[1])
        tf.compat.v1.summary.scalar("ctcvr_accuracy", ctcvr_accuracy[1])
        tf.compat.v1.summary.scalar("ctr_auc", ctr_auc[1])
        tf.compat.v1.summary.scalar("ctcvr_auc", ctcvr_auc[1])
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    train_op = optimizer.minimize(loss, global_step=tf.compat.v1.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
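This factorization is the essential ESMM idea (Ma et al., SIGIR 2018): instead of fitting pCVR on clicked impressions only, the model supervises pCTR and pCTCVR over all impressions and recovers pCVR implicitly from the product. A toy check with invented probabilities:

# Assumed toy values, for illustration only.
p_ctr = 0.10             # P(click | impression)
p_cvr = 0.40             # P(convert | click, impression)
p_ctcvr = p_ctr * p_cvr  # 0.04 = P(click & convert | impression)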
def model_export(model, features, save_path):
    # The serving signature parses tf.Example protos, so the two label columns
    # must appear in the parse spec alongside the model features.
    feature_spec_columns = list(features)
    feature_spec_columns.append(fc.numeric_column("click_label"))
    feature_spec_columns.append(fc.numeric_column("conversion_label"))
    feature_spec = fc.make_parse_example_spec(feature_spec_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    model.export_saved_model(save_path, serving_input_fn, as_text=True)
models/esmm/utils.py

import pandas as pd


def common_elements(lst1, lst2):
    return [element for element in lst1 if element in lst2]


def nth_element(lst, n):
    if n >= len(lst):
        return ""
    return lst[n]


def create_boundaries(df, column):
    # Boundaries for fc.bucketized_column must be monotonically increasing,
    # so the two interior cut points are offset from the column minimum.
    start = df[column].min()
    end = df[column].max()
    diff = end - start
    lst = [start, start + int(diff * 0.35), start + int(diff * 0.7), end]
    return pd.Series(lst).drop_duplicates().to_list()


def create_vocabulary_list(df, column):
    return list(df[column].unique())
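A quick worked example of the two column helpers, with toy values assumed for illustration:

toy = pd.DataFrame({"active_days": [0, 10, 50, 100]})
create_boundaries(toy, "active_days")       # [0, 35, 70, 100]
create_vocabulary_list(toy, "active_days")  # [0, 10, 50, 100]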