Commit aeb4ca60 authored by 赵威

init project

parent 5f5f0d49
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
from pathlib import Path

import tensorflow as tf
from sklearn.model_selection import train_test_split

from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, join_features,
                            read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export

tf.compat.v1.enable_eager_execution()


def main():
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
    device_df = device_feature_engineering(device_df)
    diary_df = diary_feature_engineering(diary_df)
    cc_df = click_feature_engineering(click_df, conversion_df)
    df = join_features(device_df, diary_df, cc_df)

    # 64% train / 16% validation / 20% test.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)

    all_features = build_features(df)
    params = {"feature_columns": all_features, "hidden_units": [32], "learning_rate": 0.1}
    model_path = str(Path("~/Desktop/models/").expanduser())
    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
    model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
    model_export(model, all_features, model_path)
    predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
    print(next(iter(predictions)))


if __name__ == "__main__":
    main()
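Each element yielded by model.predict is a dict keyed as in the PREDICT branch of esmm_model_fn (models/esmm/model.py below). A minimal sketch, assuming the variables from main() above, of printing the first few scored examples:

# Sketch only: the keys come from the predictions dict built in esmm_model_fn.
for i, pred in enumerate(model.predict(input_fn=lambda: esmm_input_fn(test_df, False))):
    print(pred["ctr_preds"], pred["cvr_preds"], pred["ctcvr_preds"])
    if i >= 4:
        break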
models/esmm/fe.py

import pandas as pd

from .utils import common_elements, nth_element


def read_csv_data(dataset_path):
    device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
    diary_df = pd.read_csv(dataset_path.joinpath("diary_card.csv"), sep="|")
    click_df = pd.read_csv(dataset_path.joinpath("diary_click_ctr.csv"))
    conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"))
    return device_df, diary_df, click_df, conversion_df
def device_feature_engineering(df):
    device_df = df.copy()
    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    for col in list_columns:
        # Split the comma-separated tag string into a list; str.split leaves NaN
        # for missing values, so replace anything that is not a list with [].
        device_df[col] = device_df[col].str.split(",")
        device_df[col] = device_df[col].apply(lambda d: d if isinstance(d, list) else [])

    nullseries = device_df.isnull().sum()
    print("device:")
    print(nullseries[nullseries > 0])

    device_columns = [
        "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions",
        "second_positions", "projects"
    ]
    return device_df[device_columns]
def diary_feature_engineering(df):
    diary_df = df.copy()
    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    for col in list_columns:
        diary_df[col] = diary_df[col].str.split(",")
        diary_df[col] = diary_df[col].apply(lambda d: d if isinstance(d, list) else [])

    diary_df["is_pure_author"] = diary_df["is_pure_author"].astype(int)
    diary_df["is_have_pure_reply"] = diary_df["is_have_pure_reply"].astype(int)
    diary_df["is_have_reply"] = diary_df["is_have_reply"].astype(int)

    print("diary:")
    nullseries = diary_df.isnull().sum()
    print(nullseries[nullseries > 0])

    diary_columns = [
        "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num",
        "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions",
        "second_solutions", "first_positions", "second_positions", "projects"
    ]
    return diary_df[diary_columns]
def click_feature_engineering(click_df, conversion_df):
    # Note: the renames below modify the caller's dataframes in place.
    click_df.rename(columns={"label": "click_label"}, inplace=True)
    conversion_df.rename(columns={"label": "conversion_label"}, inplace=True)
    # Left join keeps every click; clicks without a conversion get label 0.
    cc_df = pd.merge(click_df, conversion_df, how="left", on=["cl_id", "card_id"])
    cc_df.drop(["partition_date_x", "partition_date_y"], axis=1, inplace=True)
    cc_df["conversion_label"].fillna(0, inplace=True)
    print("click:")
    nullseries = cc_df.isnull().sum()
    print(nullseries[nullseries > 0])
    return cc_df
def join_features(device_df, diary_df, cc_df):
    a = pd.merge(device_df, cc_df, how="inner", left_on="device_id", right_on="cl_id")
    df = pd.merge(a, diary_df, how="inner", on="card_id")

    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
        "projects"
    ]
    short_names = {
        "first_demands": "fd", "second_demands": "sd", "first_solutions": "fs", "second_solutions": "ss",
        "first_positions": "fp", "second_positions": "sp", "projects": "p"
    }

    # Intersect each device-side (_x) tag list with the diary-side (_y) one.
    for col in list_columns:
        df[col] = df[[col + "_x", col + "_y"]].apply(lambda x: common_elements(*x), axis=1)

    # First tag of each side as a standalone categorical feature.
    for col in list_columns:
        df["device_" + short_names[col]] = df[col + "_x"].apply(lambda x: nth_element(x, 0))
    for col in list_columns:
        df["content_" + short_names[col]] = df[col + "_y"].apply(lambda x: nth_element(x, 0))

    # First three common tags per feature, e.g. fd1/fd2/fd3 for first_demands.
    for col in list_columns:
        for i in range(3):
            df[short_names[col] + str(i + 1)] = df[col].apply(lambda x: nth_element(x, i))

    print("df:")
    nullseries = df.isnull().sum()
    print(nullseries[nullseries > 0])

    drop_columns = ["cl_id"]
    for col in list_columns:
        drop_columns.extend([col + "_x", col + "_y", col])
    df.drop(drop_columns, inplace=True, axis=1)
    return df
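A quick toy illustration of how the shared-tag columns above are derived from a device/diary pair, using the helpers from models/esmm/utils.py (the tag values are invented for this sketch):

device_tags = ["eyes", "nose", "skin"]    # e.g. first_demands_x
diary_tags = ["nose", "skin", "teeth"]    # e.g. first_demands_y

common = common_elements(device_tags, diary_tags)  # ["nose", "skin"]
nth_element(common, 0)  # "nose" -> fd1
nth_element(common, 1)  # "skin" -> fd2
nth_element(common, 2)  # ""     -> fd3 (missing positions pad with "")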
models/esmm/input_fn.py

import tensorflow as tf
from tensorflow import feature_column as fc

from .utils import create_boundaries, create_vocabulary_list


def build_features(df):
    numeric_columns = ["active_days", "topic_num", "favor_num", "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    numeric_features = []
    for col in numeric_columns:
        numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3"
    ]
    categorical_features = []
    for col in categorical_columns:
        if col == "card_id":
            # High-cardinality id: hash into 20k buckets, embed with dimension = row_count ** 0.25.
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            # Low-cardinality columns become one-hot indicators over the observed vocabulary.
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    return numeric_features + categorical_features
def esmm_input_fn(dataframe, shuffle=False, batch_size=256):
    df = dataframe.copy()
    target = df[["click_label", "conversion_label"]]
    ds = tf.data.Dataset.from_tensor_slices((dict(df), dict(target)))
    if shuffle:
        ds = ds.shuffle(1000).repeat()
    return ds.batch(batch_size).make_one_shot_iterator().get_next()
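A minimal sketch of inspecting one batch outside the Estimator, assuming a TF 1.x runtime where one-shot iterators work under the eager execution enabled in the entry script, and a train_df produced by join_features:

# get_next() returns a (features, labels) pair of dicts of tensors.
features, labels = esmm_input_fn(train_df, shuffle=False, batch_size=4)
print(labels["click_label"].shape)  # (4,)
print(features["device_id"][:2])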
models/esmm/model.py

import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.python.estimator.canned import head as head_lib
from tensorflow.python.ops.losses import losses


def build_deep_layer(net, params):
    for num_hidden_units in params["hidden_units"]:
        net = tf.layers.dense(net,
                              units=num_hidden_units,
                              activation=tf.nn.relu,
                              kernel_initializer=tf.glorot_uniform_initializer())
    return net
def esmm_model_fn(features, labels, mode, params):
    net = tf.compat.v1.feature_column.input_layer(features, params["feature_columns"])
    # Two towers over the shared input layer: one for CTR, one for CVR.
    last_ctr_layer = build_deep_layer(net, params)
    last_cvr_layer = build_deep_layer(net, params)

    head = head_lib._binary_logistic_or_multi_class_head(n_classes=2,
                                                         weight_column=None,
                                                         label_vocabulary=None,
                                                         loss_reduction=losses.Reduction.SUM)
    ctr_logits = tf.layers.dense(last_ctr_layer, units=head.logits_dimension, kernel_initializer=tf.glorot_uniform_initializer())
    cvr_logits = tf.layers.dense(last_cvr_layer, units=head.logits_dimension, kernel_initializer=tf.glorot_uniform_initializer())
    ctr_preds = tf.sigmoid(ctr_logits)
    cvr_preds = tf.sigmoid(cvr_logits)
    # ESMM: pCTCVR = pCTR * pCVR, defined over the full impression space.
    ctcvr_preds = tf.multiply(ctr_preds, cvr_preds)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "ctr_preds": ctr_preds,
            "cvr_preds": cvr_preds,
            "ctcvr_preds": ctcvr_preds,
        }
        export_outputs = {"prediction": tf.estimator.export.PredictOutput(predictions["cvr_preds"])}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    ctr_labels = tf.reshape(tf.cast(labels["click_label"], tf.float32), (-1, 1))
    cvr_labels = tf.reshape(tf.cast(labels["conversion_label"], tf.float32), (-1, 1))
    optimizer = tf.compat.v1.train.AdagradOptimizer(learning_rate=params.get("learning_rate", 0.03))
    # CTR is supervised directly on its logits; the CVR tower is supervised only
    # through the CTCVR product, so it is never fit on clicked samples alone.
    ctr_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=ctr_labels, logits=ctr_logits))
    ctcvr_loss = tf.reduce_sum(tf.compat.v1.losses.log_loss(labels=cvr_labels, predictions=ctcvr_preds))
    loss = ctr_loss + ctcvr_loss

    if mode == tf.estimator.ModeKeys.EVAL:
        ctr_accuracy = tf.compat.v1.metrics.accuracy(labels=ctr_labels,
                                                     predictions=tf.to_float(tf.greater_equal(ctr_preds, 0.5)))
        ctcvr_accuracy = tf.compat.v1.metrics.accuracy(labels=cvr_labels,
                                                       predictions=tf.to_float(tf.greater_equal(ctcvr_preds, 0.5)))
        ctr_auc = tf.compat.v1.metrics.auc(labels=ctr_labels, predictions=ctr_preds)
        ctcvr_auc = tf.compat.v1.metrics.auc(labels=cvr_labels, predictions=ctcvr_preds)
        metrics = {"ctcvr_accuracy": ctcvr_accuracy, "ctr_accuracy": ctr_accuracy, "ctr_auc": ctr_auc, "ctcvr_auc": ctcvr_auc}
        tf.compat.v1.summary.scalar("ctr_accuracy", ctr_accuracy[1])
        tf.compat.v1.summary.scalar("ctcvr_accuracy", ctcvr_accuracy[1])
        tf.compat.v1.summary.scalar("ctr_auc", ctr_auc[1])
        tf.compat.v1.summary.scalar("ctcvr_auc", ctcvr_auc[1])
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    train_op = optimizer.minimize(loss, global_step=tf.compat.v1.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
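This factorization is the essential ESMM idea (Ma et al., SIGIR 2018): instead of fitting pCVR on clicked impressions only, the model supervises pCTR and pCTCVR over all impressions and recovers pCVR implicitly from the product. A toy check with invented probabilities:

# Assumed toy values, for illustration only.
p_ctr = 0.10             # P(click | impression)
p_cvr = 0.40             # P(convert | click, impression)
p_ctcvr = p_ctr * p_cvr  # 0.04 = P(click & convert | impression)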
def model_export(model, features, save_path):
    # The serving signature parses tf.Example protos, so the two label columns
    # must appear in the parse spec alongside the model features.
    feature_spec_columns = list(features)
    feature_spec_columns.append(fc.numeric_column("click_label"))
    feature_spec_columns.append(fc.numeric_column("conversion_label"))
    feature_spec = fc.make_parse_example_spec(feature_spec_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    model.export_saved_model(save_path, serving_input_fn, as_text=True)
models/esmm/utils.py

import pandas as pd


def common_elements(lst1, lst2):
    return [element for element in lst1 if element in lst2]


def nth_element(lst, n):
    if n >= len(lst):
        return ""
    return lst[n]


def create_boundaries(df, column):
    # Boundaries for fc.bucketized_column must be monotonically increasing,
    # so the two interior cut points are offset from the column minimum.
    start = df[column].min()
    end = df[column].max()
    diff = end - start
    lst = [start, start + int(diff * 0.35), start + int(diff * 0.7), end]
    return pd.Series(lst).drop_duplicates().to_list()


def create_vocabulary_list(df, column):
    return list(df[column].unique())
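A quick worked example of the two column helpers, with toy values assumed for illustration:

toy = pd.DataFrame({"active_days": [0, 10, 50, 100]})
create_boundaries(toy, "active_days")       # [0, 35, 70, 100]
create_vocabulary_list(toy, "active_days")  # [0, 10, 50, 100]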