Commit 430956d8 authored by 赵威

add prediction

parent aeb4ca60
import time
from datetime import datetime
from pathlib import Path
import tensorflow as tf
......@@ -6,12 +8,13 @@ from sklearn.model_selection import train_test_split
from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, join_features,
read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
from models.esmm.model import esmm_model_fn, model_export, model_predict
tf.compat.v1.enable_eager_execution()
def main():
time_begin = time.time()
device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
device_df = device_feature_engineering(device_df)
diary_df = diary_feature_engineering(diary_df)
......@@ -27,12 +30,18 @@ def main():
model_path = str(Path("~/Desktop/models/").expanduser())
model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
model_export(model, all_features, model_path)
print("train")
model.train(input_fn=lambda: esmm_input_fn(train_df.sample(100000), shuffle=True), steps=5000)
model.evaluate(input_fn=lambda: esmm_input_fn(val_df.sample(100000), False), steps=5000)
save_path = model_export(model, all_features, model_path)
predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
print(next(iter(predictions)))
# predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
# print(next(iter(predictions)))
model_predict(test_df.sample(300), save_path)
total_time = (time.time() - time_begin) / 60
print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
if __name__ == "__main__":
......
......@@ -126,8 +126,8 @@ def join_features(device_df, diary_df, cc_df):
df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
......@@ -154,5 +154,8 @@ def join_features(device_df, diary_df, cc_df):
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
]
# for col in drop_columns:
# if col in df.columns:
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1)
return df
......@@ -13,21 +13,26 @@ def build_features(df):
categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
"price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
"device_fd", "content_fd", "fd1", "fd2", "fd3"
"device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
"fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
"device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
]
categorical_ignore_columns = []
categorical_features = []
for col in categorical_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
if col not in categorical_ignore_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
all_features = (numeric_features + categorical_features)
return all_features
......
......@@ -70,12 +70,45 @@ def esmm_model_fn(features, labels, mode, params):
return res
def model_export(model, feature_columns, save_path):
    """Export *model* as a SavedModel that parses serialized `tf.train.Example`s.

    Args:
        model: a trained `tf.estimator.Estimator`.
        feature_columns: the feature columns describing the serving input.
            Label columns ("click_label", "conversion_label") must NOT be
            included — the serving graph only parses features.
        save_path: directory under which the timestamped SavedModel is written.

    Returns:
        str: the filesystem path of the exported SavedModel directory,
        decoded to text so callers (e.g. `model_predict`) can use it directly.
    """
    # Derive the parsing spec from the feature columns and build a serving
    # input receiver that accepts serialized Example protos under "examples".
    feature_spec = fc.make_parse_example_spec(feature_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    # export_saved_model returns the export path as bytes; decode to str.
    path = str(model.export_saved_model(save_path, serving_input_fn), encoding="utf-8")
    return path
def _int64_feature(value):
    """Wrap a scalar in a single-element int64 `tf.train.Feature`."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def _float_feature(value):
    """Wrap a scalar in a single-element float `tf.train.Feature`."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)
def _bytes_feature(value):
    """Wrap a bytes object in a single-element bytes `tf.train.Feature`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
def model_predict(inputs, model_path):
    """Run the exported SavedModel on a DataFrame of raw feature rows.

    Each row of *inputs* is serialized into a `tf.train.Example` proto and
    fed to the SavedModel's parsing serving signature; the resulting
    prediction dict is printed.

    Args:
        inputs: pandas DataFrame whose columns match the serving features
            (label columns are dropped before serialization).
        model_path: path to the exported SavedModel directory, as returned
            by `model_export`.
    """
    predict_fn = tf.contrib.predictor.from_saved_model(model_path)

    # Columns serialized as int64 / float features; every other column
    # (except the labels, which are skipped) is serialized as UTF-8 bytes.
    int_columns = [
        "active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "topic_num", "favor_num", "vote_num"
    ]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]

    examples = []
    for _, row in inputs.iterrows():
        features = {}
        # Series.items() replaces iteritems(), which is removed in pandas >= 2.0.
        for col, value in row.items():
            if col in ("click_label", "conversion_label"):
                continue  # labels are not part of the serving signature
            if col in int_columns:
                features[col] = _int64_feature(value)
            elif col in float_columns:
                features[col] = _float_feature(value)
            else:
                features[col] = _bytes_feature(str(value).encode(encoding="utf-8"))
        example = tf.train.Example(features=tf.train.Features(feature=features))
        examples.append(example.SerializeToString())

    # The parsing serving input receiver expects serialized Examples under
    # the "examples" key.
    predictions = predict_fn({"examples": examples})
    print(predictions)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment