Commit d9f5cd31 authored by 赵威

train tractate

parent 2c4ec6cd
import pandas as pd
from utils.cache import redis_db_client
# "channel_first", "city_first", "model_first",
# Device columns kept when building features for diary training.
DIARY_DEVICE_COLUMNS = [
    "device_id",
    "active_type",
    "active_days",
    "past_consume_ability_history",
    "potential_consume_ability_history",
    "price_sensitive_history",
    "first_demands",
    "second_demands",
    "first_solutions",
    "second_solutions",
    "first_positions",
    "second_positions",
    "projects",
]

# Former name of the diary column list; kept as an alias so that existing
# references to DEVICE_COLUMNS still resolve to the same list.
DEVICE_COLUMNS = DIARY_DEVICE_COLUMNS
# Device columns kept when building features for tractate training: the
# profile columns followed by the five numbered click_tractate_id columns.
TRACTATE_DEVICE_COLUMNS = [
    "device_id",
    "active_type",
    "active_days",
    "channel_first",
    "city_first",
    "model_first",
    "past_consume_ability_history",
    "potential_consume_ability_history",
    "price_sensitive_history",
    "first_demands",
    "second_demands",
    "first_solutions",
    "second_solutions",
    "first_positions",
    "second_positions",
    "projects",
] + ["click_tractate_id{}".format(slot) for slot in range(1, 6)]
def read_csv_data(dataset_path):
device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
......@@ -41,30 +47,46 @@ def get_device_dict_from_redis():
return res
def device_feature_engineering(device_df, content_type):
    """Clean the raw device table and keep the columns for one content type.

    All transformations are applied to a copy, so the caller's frame is left
    untouched.

    Args:
        device_df: Raw device DataFrame (callers pass the result of
            ``read_csv_data``).
        content_type: ``"tractate"`` selects ``TRACTATE_DEVICE_COLUMNS``; any
            other value selects ``DIARY_DEVICE_COLUMNS``.

    Returns:
        The cleaned DataFrame restricted to the selected column list.

    Raises:
        KeyError: If a column this function transforms or selects is missing.
    """
    df = device_df.copy()

    # Comma-separated tag strings become lists; any value that is not a list
    # after splitting (e.g. a missing cell) is replaced by an empty list.
    list_columns = (
        "first_demands",
        "second_demands",
        "first_solutions",
        "second_solutions",
        "first_positions",
        "second_positions",
        "projects",
    )
    for col in list_columns:
        df[col] = df[col].str.split(",")
        df[col] = df[col].apply(lambda d: d if isinstance(d, list) else [])

    # Missing values in these columns are replaced by an empty string.
    for col in ("city_first", "model_first"):
        df[col] = df[col].fillna("")

    # The numbered click-id columns are cast to strings.
    for prefix in ("click_diary_id", "click_tractate_id"):
        for slot in range(1, 6):
            col = "{}{}".format(prefix, slot)
            df[col] = df[col].astype(str)

    if content_type == "tractate":
        columns = TRACTATE_DEVICE_COLUMNS
    else:
        columns = DIARY_DEVICE_COLUMNS

    # Report the columns that still contain nulls, then the frame shape.
    nullseries = df.isnull().sum()
    print("device:")
    print(nullseries[nullseries > 0])
    print(df.shape)

    return df[columns]
......@@ -22,9 +22,12 @@ def build_features(df, int_columns, float_columns, categorical_columns):
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 400000), dimension=int(df[col].size**0.25)))
elif col == "show_tag_id":
elif col in [
"show_tag_id", "click_tractate_id1", "click_tractate_id2", "click_tractate_id3", "click_tractate_id4",
"click_tractate_id5"
]:
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 100000), dimension=int(df[col].size**0.25)))
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
......
......@@ -16,7 +16,8 @@ CATEGORICAL_COLUMNS = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "show_tag_id", "device_fd", "content_fd",
"fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3",
"device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3", "device_sp", "content_sp",
"sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
"sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3", "click_tractate_id1", "click_tractate_id2",
"click_tractate_id3", "click_tractate_id4", "click_tractate_id5"
]
......@@ -57,6 +58,7 @@ def tractate_feature_engineering(tractate_df):
df["is_pure_author"] = df["is_pure_author"].astype(int)
df["is_have_pure_reply"] = df["is_have_pure_reply"].astype(int)
df["is_have_reply"] = df["is_have_reply"].astype(int)
df["show_tag_id"] = df["show_tag_id"].astype(str)
df = df[TRACTATE_COLUMNS]
......
......@@ -30,7 +30,7 @@ def main():
# print(diary_df.sample(1))
device_df = device_fe.read_csv_data(data_path)
# print(diary_df.sample(1))
device_df = device_fe.device_feature_engineering(device_df)
device_df = device_fe.device_feature_engineering(device_df, "diary")
# print(device_df.sample(1))
cc_df = click_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
# print(cc_df.sample(1))
......
......@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split
from models.esmm.fe import click_fe, device_fe, fe, tractate_fe
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
def main():
......@@ -16,12 +17,14 @@ def main():
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
data_path = Path("~/data/cvr_data").expanduser() # local
# data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
# data_path = Path("~/data/cvr_data").expanduser() # local
data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
tractate_df, tractate_click_df, tractate_conversion_df = tractate_fe.read_csv_data(data_path)
tractate_df = tractate_fe.tractate_feature_engineering(tractate_df)
device_df = device_fe.read_csv_data(data_path)
device_df = device_fe.device_feature_engineering(device_df)
device_df = device_fe.device_feature_engineering(device_df, "tractate")
# print(device_df.columns)
# print(device_df.dtypes, "\n")
cc_df = click_fe.click_feature_engineering(tractate_click_df, tractate_conversion_df)
df = tractate_fe.join_features(device_df, tractate_df, cc_df)
......@@ -55,6 +58,12 @@ def main():
total_time = (time.time() - time_begin) / 60
print("total cost {:.2f} mins at {}".format(total_time, datetime.now()))
# save_path = str(Path("~/data/models/tractate/1596089465").expanduser()) # local
# save_path = "/home/gmuser/data/models/tractate/" # server
predict_fn = tf.contrib.predictor.from_saved_model(save_path)
print("============================================================")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment