Commit 78361361 authored by 赵威

diary prediction

parent 2801c03a
......@@ -5,35 +5,38 @@ import tensorflow as tf
from .fe.diary_fe import device_diary_fe
from .model import _bytes_feature, _float_feature, _int64_feature
# Feature-column definitions for building diary prediction requests
# (serialized as tf.train.Example features elsewhere in this module:
# ints -> _int64_feature, floats -> _float_feature, strings -> _bytes_feature).
# NOTE(review): extracted from a commit-diff view; the diff renderer stripped
# the original leading indentation of these lines.
_int_columns = [
"active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
"topic_num", "favor_num", "vote_num"
]
# Float-valued features; presumably click-through rates over 1/3/7/15-day
# windows, judging by the names — TODO confirm against feature engineering.
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
# Categorical (string-encoded) features: device consume-ability/price history,
# device/content tag fields, and the fd/sd/fs/ss/fp/sp/p 1-3 tag triples.
_categorical_columns = [
"device_id", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history", "device_fd",
"device_sd", "device_fs", "device_ss", "device_fp", "device_sp", "device_p", "content_fd", "content_sd", "content_fs",
"content_ss", "content_fp", "content_sp", "content_p", "fd1", "fd2", "fd3", "sd1", "sd2", "sd3", "fs1", "fs2", "fs3", "ss1",
"ss2", "ss3", "fp1", "fp2", "fp3", "sp1", "sp2", "sp3", "p1", "p2", "p3"
]
# Concatenation of all prediction-time feature names. Exported so callers can
# assert parity with the training-time column set (see the set-difference
# check in the training script's main()).
PREDICTION_ALL_COLUMNS = _int_columns + _float_columns + _categorical_columns
def model_predict_diary(device_id, diary_ids, device_dict, diary_dict, predict_fn):
try:
time_1 = timeit.default_timer()
device_info, diary_lst, diary_ids_res = device_diary_fe(device_id, diary_ids, device_dict, diary_dict)
print("predict check: " + str(len(diary_lst)) + " " + str(len(diary_ids_res)))
# TODO
int_columns = [
"active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
"topic_num", "favor_num", "vote_num"
]
float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
str_columns = [
"device_id", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"device_fd", "device_sd", "device_fs", "device_ss", "device_fp", "device_sp", "device_p", "content_fd", "content_sd",
"content_fs", "content_ss", "content_fp", "content_sp", "content_p", "fd1", "fd2", "fd3", "sd1", "sd2", "sd3", "fs1",
"fs2", "fs3", "ss1", "ss2", "ss3", "fp1", "fp2", "fp3", "sp1", "sp2", "sp3", "p1", "p2", "p3"
]
examples = []
for diary_info in diary_lst:
tmp = {}
tmp.update(device_info)
tmp.update(diary_info)
features = {}
for col in int_columns:
for col in _int_columns:
features[col] = _int64_feature(int(tmp[col]))
for col in float_columns:
for col in _float_columns:
features[col] = _float_feature(float(tmp[col]))
for col in str_columns:
for col in _categorical_columns:
features[col] = _bytes_feature(str(tmp[col]).encode(encoding="utf-8"))
example = tf.train.Example(features=tf.train.Features(feature=features))
examples.append(example.SerializeToString())
......
......@@ -9,7 +9,7 @@ from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm.diary_model import model_predict_diary
from models.esmm.diary_model import PREDICTION_ALL_COLUMNS, model_predict_diary
from models.esmm.fe import click_fe, device_fe, diary_fe, fe
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
......@@ -22,58 +22,59 @@ def main():
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# data_path = Path("~/data/cvr_data").expanduser() # local
data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
diary_df, diary_click_df, diary_conversion_df = diary_fe.read_csv_data(data_path)
# print(diary_df.sample(1))
diary_df = diary_fe.diary_feature_engineering(diary_df)
# print(diary_df.sample(1))
device_df = device_fe.read_csv_data(data_path)
# print(diary_df.sample(1))
device_df = device_fe.device_feature_engineering(device_df, "diary")
# print(device_df.sample(1))
cc_df = click_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
# print(cc_df.sample(1))
df = diary_fe.join_features(device_df, diary_df, cc_df)
# print(df.sample(1))
# print(df.dtypes)
train_df, test_df = train_test_split(df, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)
all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
model_path = str(Path("~/data/model_tmp/").expanduser())
# if os.path.exists(model_path):
# shutil.rmtree(model_path)
session_config = tf.compat.v1.ConfigProto()
session_config.gpu_options.allow_growth = True
session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
estimator_config = tf.estimator.RunConfig(session_config=session_config)
model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path, config=estimator_config)
train_spec = tf.estimator.TrainSpec(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), max_steps=50000)
eval_spec = tf.estimator.EvalSpec(input_fn=lambda: esmm_input_fn(val_df, shuffle=False))
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
model_export_path = str(Path("~/data/models/diary").expanduser())
save_path = model_export(model, all_features, model_export_path)
print("save to: " + save_path)
# # data_path = Path("~/data/cvr_data").expanduser() # local
# data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
# diary_df, diary_click_df, diary_conversion_df = diary_fe.read_csv_data(data_path)
# # print(diary_df.sample(1))
# diary_df = diary_fe.diary_feature_engineering(diary_df)
# # print(diary_df.sample(1))
# device_df = device_fe.read_csv_data(data_path)
# # print(diary_df.sample(1))
# device_df = device_fe.device_feature_engineering(device_df, "diary")
# # print(device_df.sample(1))
# cc_df = click_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
# # print(cc_df.sample(1))
# df = diary_fe.join_features(device_df, diary_df, cc_df)
# # print(df.sample(1))
# # print(df.dtypes)
# train_df, test_df = train_test_split(df, test_size=0.2)
# train_df, val_df = train_test_split(train_df, test_size=0.2)
# all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
# params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
# model_path = str(Path("~/data/model_tmp/").expanduser())
# # if os.path.exists(model_path):
# # shutil.rmtree(model_path)
# session_config = tf.compat.v1.ConfigProto()
# session_config.gpu_options.allow_growth = True
# session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
# estimator_config = tf.estimator.RunConfig(session_config=session_config)
# model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path, config=estimator_config)
# train_spec = tf.estimator.TrainSpec(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), max_steps=50000)
# eval_spec = tf.estimator.EvalSpec(input_fn=lambda: esmm_input_fn(val_df, shuffle=False))
# tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
# model_export_path = str(Path("~/data/models/diary").expanduser())
# save_path = model_export(model, all_features, model_export_path)
# print("save to: " + save_path)
diary_train_columns = set(diary_fe.INT_COLUMNS + diary_fe.FLOAT_COLUMNS + diary_fe.CATEGORICAL_COLUMNS)
diary_predict_columns = set(PREDICTION_ALL_COLUMNS)
print(diary_predict_columns.difference(diary_train_columns))
print(diary_train_columns.difference(diary_predict_columns))
assert diary_predict_columns == diary_train_columns
print("============================================================")
# save_path = str(Path("~/Desktop/models/1596012827").expanduser()) # local
# save_path = "/home/gmuser/data/models/diary/1596083349" # server
save_path = "/home/gmuser/data/models/diary/1596083349" # server
# tf.saved_model.load
predict_fn = tf.contrib.predictor.from_saved_model(save_path)
print("============================================================")
# device_id = "861601036552944"
# diary_ids = [
# "16195283", "16838351", "17161073", "17297878", "17307484", "17396235", "16418737", "16995481", "17312201", "12237988"
# ]
device_dict = device_fe.get_device_dict_from_redis()
diary_dict = diary_fe.get_diary_dict_from_redis()
......
......@@ -76,9 +76,9 @@ def main():
device_ids = list(device_dict.keys())[:20]
tractate_ids = list(tractate_dict.keys())
print(len(device_dict), len(tractate_dict), "\n")
print(device_dict[device_ids[0]], "\n")
print(tractate_dict[tractate_ids[0]], "\n")
# print(len(device_dict), len(tractate_dict), "\n")
# print(device_dict[device_ids[0]], "\n")
# print(tractate_dict[tractate_ids[0]], "\n")
for i in range(5):
time_1 = timeit.default_timer()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment