Commit a01adb19 authored by 赵威

update path

parent 430956d8
......@@ -10,12 +10,12 @@ from models.esmm.fe import (click_feature_engineering, device_feature_engineerin
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict
tf.compat.v1.enable_eager_execution()
# tf.compat.v1.enable_eager_execution()
def main():
time_begin = time.time()
device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/"))
device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/"))
device_df = device_feature_engineering(device_df)
diary_df = diary_feature_engineering(diary_df)
cc_df = click_feature_engineering(click_df, conversion_df)
......@@ -27,18 +27,24 @@ def main():
all_features = build_features(df)
params = {"feature_columns": all_features, "hidden_units": [32], "learning_rate": 0.1}
model_path = str(Path("~/Desktop/models/").expanduser())
model_path = str(Path("~/data/model_tmp/").expanduser())
model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
print("train")
model.train(input_fn=lambda: esmm_input_fn(train_df.sample(100000), shuffle=True), steps=5000)
model.evaluate(input_fn=lambda: esmm_input_fn(val_df.sample(100000), False), steps=5000)
save_path = model_export(model, all_features, model_path)
model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
print("metrics: " + str(metrics))
model_export_path = str(Path("~/data/models/").expanduser())
save_path = model_export(model, all_features, model_export_path)
print("save to: " + save_path)
# predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
# print(next(iter(predictions)))
time_1 = time.time()
model_predict(test_df.sample(300), save_path)
total_1 = (time.time() - time_1)
print("prediction cost {:.5f} s at {}".format(total_1, datetime.now()))
total_time = (time.time() - time_begin) / 60
print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
......
......@@ -6,10 +6,11 @@ from .utils import common_elements, nth_element
def read_csv_data(dataset_path):
device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
diary_df = pd.read_csv(dataset_path.joinpath("diary_card.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("diary_click_ctr.csv"))
conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"))
return device_df, diary_df, click_df, conversion_df
diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|")
# TODO remove sample
return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
def device_feature_engineering(df):
......@@ -34,6 +35,7 @@ def device_feature_engineering(df):
nullseries = device_df.isnull().sum()
print("device:")
print(nullseries[nullseries > 0])
# print(device_df.size)
device_columns = [
"device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
......@@ -68,6 +70,7 @@ def diary_feature_engineering(df):
print("diary:")
nullseries = diary_df.isnull().sum()
print(nullseries[nullseries > 0])
# print(diary_df.size)
diary_columns = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
......@@ -90,6 +93,7 @@ def click_feature_engineering(click_df, conversion_df):
print("click:")
nullseries = cc_df.isnull().sum()
print(nullseries[nullseries > 0])
# print(cc_df.size)
return cc_df
......@@ -147,6 +151,7 @@ def join_features(device_df, diary_df, cc_df):
print("df:")
nullseries = df.isnull().sum()
print(nullseries[nullseries > 0])
# print(df.size)
drop_columns = [
"cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
......
......@@ -5,10 +5,14 @@ from .utils import create_boundaries, create_vocabulary_list
def build_features(df):
numeric_columns = ["active_days", "topic_num", "favor_num", "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
numeric_features = []
for col in numeric_columns:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
for col in (int_columns + float_columns):
if col in int_columns:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
else:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment