Commit ef92df4d authored by 赵威

update diary fe

parent be82b754
import timeit
import pandas as pd
-import tensorflow as tf
-from tensorflow import feature_column as fc
from utils.cache import redis_db_client
-from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
+from ..utils import common_elements, nth_element

DIARY_COLUMNS = [
    "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
    "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions", "second_solutions",
    "first_positions", "second_positions", "projects"
]
-_int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
-_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
-_categorical_columns = [
+INT_COLUMNS = ["active_days", "topic_num", "favor_num", "vote_num"]
+FLOAT_COLUMNS = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
+CATEGORICAL_COLUMNS = [
    "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
    "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "device_fd", "content_fd", "fd1", "fd2",
    "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3", "device_ss",
@@ -160,32 +158,6 @@ def join_features(device_df, diary_df, cc_df):
    return df

-def build_features(df):
-    numeric_features = []
-    for col in (_int_columns + _float_columns):
-        if col in _int_columns:
-            numeric_features.append(
-                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
-        else:
-            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
-    categorical_features = []
-    for col in _categorical_columns:
-        if col == "card_id":
-            categorical_features.append(
-                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
-                                    dimension=int(df[col].size**0.25)))
-        elif col == "device_id":
-            categorical_features.append(
-                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
-        else:
-            categorical_features.append(
-                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
-    all_features = (numeric_features + categorical_features)
-    return all_features
def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
    time_1 = timeit.default_timer()
    device_info = device_dict.get(device_id, {}).copy()
...
+import tensorflow as tf
+from tensorflow import feature_column as fc
+
+from ..utils import create_boundaries, create_vocabulary_list
+
+
+def build_features(df, int_columns, float_columns, categorical_columns):
+    numeric_features = []
+    for col in (int_columns + float_columns):
+        if col in int_columns:
+            numeric_features.append(
+                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
+        else:
+            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
+    categorical_features = []
+    for col in categorical_columns:
+        if col == "card_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
+                                    dimension=int(df[col].size**0.25)))
+        elif col == "device_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 400000), dimension=int(df[col].size**0.25)))
+        elif col == "show_tag_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 100000), dimension=int(df[col].size**0.25)))
+        else:
+            categorical_features.append(
+                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
+    all_features = (numeric_features + categorical_features)
+    return all_features
import pandas as pd
-import tensorflow as tf
-from tensorflow import feature_column as fc
from utils.cache import redis_db_client
-from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
+from ..utils import common_elements, nth_element

TRACTATE_COLUMNS = [
    "card_id", "is_pure_author", "is_have_pure_reply", "is_have_reply", "content_level", "show_tag_id", "reply_num",

@@ -134,39 +132,5 @@ def join_features(device_df, tractate_df, cc_df):
    return df
-def build_features(df):
-    numeric_features = []
-    for col in (_int_columns + _float_columns):
-        if col in _int_columns:
-            numeric_features.append(
-                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
-        else:
-            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
-    _categorical_columns = [
-        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
-        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
-        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
-        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
-        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
-    ]
-    categorical_features = []
-    for col in _categorical_columns:
-        if col == "card_id":
-            categorical_features.append(
-                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
-                                    dimension=int(df[col].size**0.25)))
-        elif col == "device_id":
-            categorical_features.append(
-                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
-        else:
-            categorical_features.append(
-                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
-    all_features = (numeric_features + categorical_features)
-    return all_features
def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
    pass
@@ -9,10 +9,11 @@ from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split

+from models.esmm.diary_model import model_predict_diary
+from models.esmm.fe import click_fe as click_fe
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
-from models.esmm.fe import click_fe as click_fe
+from models.esmm.fe import fe as fe
-from models.esmm.diary_model import model_predict_diary
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
@@ -43,7 +44,7 @@ def main():
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)

-    all_features = diary_fe.build_features(df)
+    all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    model_path = str(Path("~/data/model_tmp/").expanduser())
    # if os.path.exists(model_path):
...
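For context, a rough sketch of how the feature columns built above typically plug into an estimator; the actual wiring inside main() is collapsed in this diff, so the function below is an illustrative assumption, not the author's code.

import tensorflow as tf

from models.esmm.model import esmm_model_fn


def make_estimator(all_features, model_path):
    # Mirrors the params dict built in main() above.
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    # Assumption: esmm_model_fn follows the standard tf.estimator model_fn
    # contract (features, labels, mode, params) and reads params["feature_columns"].
    return tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)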