Commit 430956d8 authored by 赵威

add prediction

parent aeb4ca60
import time
from datetime import datetime
from pathlib import Path
import tensorflow as tf
...@@ -6,12 +8,13 @@ from sklearn.model_selection import train_test_split
from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, join_features,
                            read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict
tf.compat.v1.enable_eager_execution()


def main():
    time_begin = time.time()
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
    device_df = device_feature_engineering(device_df)
    diary_df = diary_feature_engineering(diary_df)
...@@ -27,12 +30,18 @@ def main():
    model_path = str(Path("~/Desktop/models/").expanduser())
    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    print("train")
    model.train(input_fn=lambda: esmm_input_fn(train_df.sample(100000), shuffle=True), steps=5000)
    model.evaluate(input_fn=lambda: esmm_input_fn(val_df.sample(100000), False), steps=5000)
    save_path = model_export(model, all_features, model_path)
    # predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
    # print(next(iter(predictions)))
    model_predict(test_df.sample(300), save_path)
    total_time = (time.time() - time_begin) / 60
    print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
if __name__ == "__main__":
...
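For context, esmm_input_fn is imported from models/esmm/input_fn.py but its body is outside this diff; the call sites above only assume a (df, shuffle) signature returning batched (features, labels) data. A minimal sketch of that contract, with the batch size and label layout as guesses:

# Hypothetical sketch -- the real esmm_input_fn is not part of this diff.
# Assumes the two label columns used elsewhere in this commit and an
# arbitrary batch size of 256.
def esmm_input_fn(df, shuffle):
    labels = {"click_label": df["click_label"], "conversion_label": df["conversion_label"]}
    features = dict(df.drop(["click_label", "conversion_label"], axis=1))
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    return dataset.batch(256)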
...@@ -126,8 +126,8 @@ def join_features(device_df, diary_df, cc_df): ...@@ -126,8 +126,8 @@ def join_features(device_df, diary_df, cc_df):
df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1)) df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2)) df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0)) df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 1)) df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 2)) df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0)) df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1)) df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2)) df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
...@@ -154,5 +154,8 @@ def join_features(device_df, diary_df, cc_df):
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x", "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects" "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
] ]
# for col in drop_columns:
# if col in df.columns:
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1) df.drop(drop_columns, inplace=True, axis=1)
return df return df
models/esmm/input_fn.py
...@@ -13,21 +13,26 @@ def build_features(df):
    categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_ignore_columns = []
    categorical_features = []
    for col in categorical_columns:
        if col not in categorical_ignore_columns:
            if col == "card_id":
                categorical_features.append(
                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                        dimension=int(df[col].size**0.25)))
            elif col == "device_id":
                categorical_features.append(
                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
            else:
                categorical_features.append(
                    fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    all_features = (numeric_features + categorical_features)
    return all_features
...
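create_vocabulary_list is also referenced but not shown in this diff; build_features only needs it to return the distinct values of a column to seed categorical_column_with_vocabulary_list. A plausible sketch under that assumption:

# Hypothetical sketch -- the real create_vocabulary_list is not shown here.
# categorical_column_with_vocabulary_list only needs the distinct values.
def create_vocabulary_list(df, col):
    return df[col].astype(str).unique().tolist()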
models/esmm/model.py
...@@ -70,12 +70,45 @@ def esmm_model_fn(features, labels, mode, params):
    return res
def model_export(model, feature_columns, save_path):
    feature_spec = fc.make_parse_example_spec(feature_columns)
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    # export_saved_model returns the timestamped export directory as bytes
    path = str(model.export_saved_model(save_path, serving_input_fn), encoding="utf-8")
    return path
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def model_predict(inputs, model_path):
    predict_fn = tf.contrib.predictor.from_saved_model(model_path)
    int_columns = [
        "active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "topic_num", "favor_num", "vote_num"
    ]
    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    examples = []
    for index, row in inputs.iterrows():
        features = {}
        for col, value in row.iteritems():
            if col in ["click_label", "conversion_label"]:
                # labels are not part of the serving feature spec
                pass
            elif col in int_columns:
                features[col] = _int64_feature(value)
            elif col in float_columns:
                features[col] = _float_feature(value)
            else:
                features[col] = _bytes_feature(str(value).encode(encoding="utf-8"))
        example = tf.train.Example(features=tf.train.Features(feature=features))
        examples.append(example.SerializeToString())
    predictions = predict_fn({"examples": examples})
    print(predictions)
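predict_fn returns a dict of NumPy arrays, one entry per export output of esmm_model_fn (not shown in this diff), each batched over examples. A hedged sketch of consuming the result instead of printing it, with "pctcvr" as an assumed output key:

# Hypothetical: "pctcvr" is an assumed key; the real names come from the
# export_outputs defined in esmm_model_fn, which this diff does not show.
scores = predictions["pctcvr"]
for (_, row), score in zip(inputs.iterrows(), scores):
    print(row.get("card_id"), float(score))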