Commit a01adb19 authored by 赵威

update path

parent 430956d8
...@@ -10,12 +10,12 @@ from models.esmm.fe import (click_feature_engineering, device_feature_engineerin ...@@ -10,12 +10,12 @@ from models.esmm.fe import (click_feature_engineering, device_feature_engineerin
from models.esmm.input_fn import build_features, esmm_input_fn from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict from models.esmm.model import esmm_model_fn, model_export, model_predict
tf.compat.v1.enable_eager_execution() # tf.compat.v1.enable_eager_execution()
def main(): def main():
time_begin = time.time() time_begin = time.time()
device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/Desktop/cvr_data/")) device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/"))
device_df = device_feature_engineering(device_df) device_df = device_feature_engineering(device_df)
diary_df = diary_feature_engineering(diary_df) diary_df = diary_feature_engineering(diary_df)
cc_df = click_feature_engineering(click_df, conversion_df) cc_df = click_feature_engineering(click_df, conversion_df)
...@@ -27,18 +27,24 @@ def main(): ...@@ -27,18 +27,24 @@ def main():
all_features = build_features(df) all_features = build_features(df)
params = {"feature_columns": all_features, "hidden_units": [32], "learning_rate": 0.1} params = {"feature_columns": all_features, "hidden_units": [32], "learning_rate": 0.1}
model_path = str(Path("~/Desktop/models/").expanduser()) model_path = str(Path("~/data/model_tmp/").expanduser())
model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path) model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
print("train") print("train")
model.train(input_fn=lambda: esmm_input_fn(train_df.sample(100000), shuffle=True), steps=5000) model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
model.evaluate(input_fn=lambda: esmm_input_fn(val_df.sample(100000), False), steps=5000) metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
save_path = model_export(model, all_features, model_path) print("metrics: " + str(metrics))
model_export_path = str(Path("~/data/models/").expanduser())
save_path = model_export(model, all_features, model_export_path)
print("save to: " + save_path)
# predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False)) # predictions = model.predict(input_fn=lambda: esmm_input_fn(test_df, False))
# print(next(iter(predictions))) # print(next(iter(predictions)))
time_1 = time.time()
model_predict(test_df.sample(300), save_path) model_predict(test_df.sample(300), save_path)
total_1 = (time.time() - time_1)
print("prediction cost {:.5f} s at {}".format(total_1, datetime.now()))
total_time = (time.time() - time_begin) / 60 total_time = (time.time() - time_begin) / 60
print("cost {:.2f} mins at {}".format(total_time, datetime.now())) print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
......
...@@ -6,10 +6,11 @@ from .utils import common_elements, nth_element ...@@ -6,10 +6,11 @@ from .utils import common_elements, nth_element
def read_csv_data(dataset_path): def read_csv_data(dataset_path):
device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|") device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
diary_df = pd.read_csv(dataset_path.joinpath("diary_card.csv"), sep="|") diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("diary_click_ctr.csv")) click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv")) conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|")
return device_df, diary_df, click_df, conversion_df # TODO remove sample
return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
def device_feature_engineering(df): def device_feature_engineering(df):
...@@ -34,6 +35,7 @@ def device_feature_engineering(df): ...@@ -34,6 +35,7 @@ def device_feature_engineering(df):
nullseries = device_df.isnull().sum() nullseries = device_df.isnull().sum()
print("device:") print("device:")
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(device_df.size)
device_columns = [ device_columns = [
"device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history", "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
...@@ -68,6 +70,7 @@ def diary_feature_engineering(df): ...@@ -68,6 +70,7 @@ def diary_feature_engineering(df):
print("diary:") print("diary:")
nullseries = diary_df.isnull().sum() nullseries = diary_df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(diary_df.size)
diary_columns = [ diary_columns = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
...@@ -90,6 +93,7 @@ def click_feature_engineering(click_df, conversion_df): ...@@ -90,6 +93,7 @@ def click_feature_engineering(click_df, conversion_df):
print("click:") print("click:")
nullseries = cc_df.isnull().sum() nullseries = cc_df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(cc_df.size)
return cc_df return cc_df
...@@ -147,6 +151,7 @@ def join_features(device_df, diary_df, cc_df): ...@@ -147,6 +151,7 @@ def join_features(device_df, diary_df, cc_df):
print("df:") print("df:")
nullseries = df.isnull().sum() nullseries = df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(df.size)
drop_columns = [ drop_columns = [
"cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands", "cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
......
...@@ -5,10 +5,14 @@ from .utils import create_boundaries, create_vocabulary_list ...@@ -5,10 +5,14 @@ from .utils import create_boundaries, create_vocabulary_list
def build_features(df): def build_features(df):
numeric_columns = ["active_days", "topic_num", "favor_num", "vote_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"] int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
numeric_features = [] numeric_features = []
for col in numeric_columns: for col in (int_columns + float_columns):
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col))) if col in int_columns:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
else:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
categorical_columns = [ categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment