Commit ef92df4d authored by 赵威

update diary fe

parent be82b754
import timeit
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
from ..utils import common_elements, nth_element
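# Raw diary-side fields pulled for feature engineering: card id and quality flags,
# engagement counters, windowed CTRs, and demand/solution/position/project tags.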
DIARY_COLUMNS = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
"one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions", "second_solutions",
"first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
_categorical_columns = [
INT_COLUMNS = ["active_days", "topic_num", "favor_num", "vote_num"]
FLOAT_COLUMNS = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
CATEGORICAL_COLUMNS = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "device_fd", "content_fd", "fd1", "fd2",
"fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3", "device_ss",
@@ -160,32 +158,6 @@ def join_features(device_df, diary_df, cc_df):
    return df


def build_features(df):
    numeric_features = []
    for col in (_int_columns + _float_columns):
        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    categorical_features = []
    for col in _categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features


def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
    time_1 = timeit.default_timer()
    device_info = device_dict.get(device_id, {}).copy()
......
import tensorflow as tf
from tensorflow import feature_column as fc

from ..utils import create_boundaries, create_vocabulary_list


def build_features(df, int_columns, float_columns, categorical_columns):
    # Numeric columns are bucketized on boundaries derived from the data frame;
    # integer counters keep an explicit int64 dtype.
    numeric_features = []
    for col in (int_columns + float_columns):
        if col in int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    # High-cardinality id columns are hashed and embedded (embedding dimension is
    # roughly the fourth root of the row count); all other categorical columns are
    # one-hot encoded from the vocabulary observed in df.
    categorical_features = []
    for col in categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 400000), dimension=int(df[col].size**0.25)))
        elif col == "show_tag_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 100000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features
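A minimal usage sketch of the shared builder, mirroring the diary training path later in this commit: the column constants are the ones defined above in diary_fe, and df is assumed to be the joined diary feature DataFrame. Because the embedding dimension is the fourth root of the column length, it grows slowly with the size of the training frame.

# Usage sketch (assumes df is the joined diary feature DataFrame).
from models.esmm.fe import diary_fe, fe

all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
# all_features is a flat list of feature columns, handed to the model as params["feature_columns"].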
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column as fc
from utils.cache import redis_db_client
from ..utils import (common_elements, create_boundaries, create_vocabulary_list, nth_element)
from ..utils import common_elements, nth_element
TRACTATE_COLUMNS = [
"card_id", "is_pure_author", "is_have_pure_reply", "is_have_reply", "content_level", "show_tag_id", "reply_num",
@@ -134,39 +132,5 @@ def join_features(device_df, tractate_df, cc_df):
    return df


def build_features(df):
    numeric_features = []
    for col in (_int_columns + _float_columns):
        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))

    _categorical_columns = [
        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
    ]
    categorical_features = []
    for col in _categorical_columns:
        if col == "card_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
                                    dimension=int(df[col].size**0.25)))
        elif col == "device_id":
            categorical_features.append(
                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
        else:
            categorical_features.append(
                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))

    all_features = (numeric_features + categorical_features)
    return all_features


def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
    pass
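device_tractate_fe is still a stub in this commit. Purely as an illustration of where the tractate side is headed after this refactor, the sketch below assumes tractate_fe grows the same INT_COLUMNS / FLOAT_COLUMNS / CATEGORICAL_COLUMNS constants as diary_fe (they are not part of this diff) and feeds them to the shared builder:

# Hypothetical: mirrors the diary path; tractate_fe.INT_COLUMNS etc. are assumed, not in this commit.
from models.esmm.fe import fe, tractate_fe

all_features = fe.build_features(df, tractate_fe.INT_COLUMNS, tractate_fe.FLOAT_COLUMNS, tractate_fe.CATEGORICAL_COLUMNS)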
@@ -9,10 +9,11 @@ from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm.diary_model import model_predict_diary
from models.esmm.fe import click_fe as click_fe
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
from models.esmm.fe import click_fe as click_fe
from models.esmm.diary_model import model_predict_diary
from models.esmm.fe import fe as fe
from models.esmm.input_fn import esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
@@ -43,7 +44,7 @@ def main():
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    all_features = diary_fe.build_features(df)
    all_features = fe.build_features(df, diary_fe.INT_COLUMNS, diary_fe.FLOAT_COLUMNS, diary_fe.CATEGORICAL_COLUMNS)
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    model_path = str(Path("~/data/model_tmp/").expanduser())
    # if os.path.exists(model_path):
......
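The remainder of main() is elided above. As a rough sketch of how the imported pieces (esmm_model_fn, esmm_input_fn, model_export) would typically be wired together in the rest of the function; the argument lists of the project helpers are assumptions, not taken from this diff:

    # Sketch only: signatures of esmm_input_fn and model_export below are assumed.
    estimator = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    estimator.train(input_fn=lambda: esmm_input_fn(train_df))    # assumed signature
    estimator.evaluate(input_fn=lambda: esmm_input_fn(val_df))   # assumed signature
    model_export(estimator, all_features, model_path)            # assumed signature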