Commit be82b754 authored by 赵威

update columns

parent a23369bd
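The diff below hoists the column lists used by build_features into module-level constants (_int_columns, _float_columns, _categorical_columns) in both the diary and tractate feature modules. build_features also relies on two helpers, create_boundaries and create_vocabulary_list, which are defined elsewhere in the repository and untouched by this commit. For readers following the feature-column calls, here is a hypothetical sketch of what such helpers might compute (the names come from the diff, the bodies are assumed, not the repository's actual implementations):

# Hypothetical sketch only: the real create_boundaries / create_vocabulary_list
# are defined elsewhere in this repository and are not part of this diff.
import numpy as np


def create_boundaries(df, col, num_buckets=10):
    # Quantile-based boundaries for fc.bucketized_column: interior quantiles
    # of the column, deduplicated and sorted as the API requires.
    qs = np.linspace(0, 1, num_buckets + 1)[1:-1]
    return sorted(set(float(v) for v in df[col].quantile(qs)))


def create_vocabulary_list(df, col):
    # Distinct values of a categorical column, used as the vocabulary list.
    return df[col].dropna().unique().tolist()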
@@ -12,6 +12,15 @@ DIARY_COLUMNS = [
"one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "first_demands", "second_demands", "first_solutions", "second_solutions",
"first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "device_fd", "content_fd", "fd1", "fd2",
"fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3", "device_ss",
"content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3", "device_sp", "content_sp", "sp1", "sp2",
"sp3", "device_p", "content_p", "p1", "p2", "p3"
]
def read_csv_data(dataset_path):
@@ -152,41 +161,26 @@ def join_features(device_df, diary_df, cc_df):
def build_features(df):
    # TODO
-    int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
-    float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
    numeric_features = []
-    for col in (int_columns + float_columns):
-        if col in int_columns:
+    for col in (_int_columns + _float_columns):
+        if col in _int_columns:
            numeric_features.append(
                fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
        else:
            numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
    # TODO
-    categorical_columns = [
-        "device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
-        "price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
-        "device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
-        "fs1", "fs2", "fs3", "device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3",
-        "device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
-    ]
-    categorical_ignore_columns = []
    categorical_features = []
-    for col in categorical_columns:
-        if col not in categorical_ignore_columns:
-            if col == "card_id":
-                categorical_features.append(
-                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
-                                        dimension=int(df[col].size**0.25)))
-            elif col == "device_id":
-                categorical_features.append(
-                    fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
-            else:
-                categorical_features.append(
-                    fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
+    for col in _categorical_columns:
+        if col == "card_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
+                                    dimension=int(df[col].size**0.25)))
+        elif col == "device_id":
+            categorical_features.append(
+                fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
+        else:
+            categorical_features.append(
+                fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
    all_features = (numeric_features + categorical_features)
    return all_features
......
@@ -11,6 +11,16 @@ TRACTATE_COLUMNS = [
"first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions", "projects"
]
_int_columns = ["active_days", "reply_num", "reply_pure_num"]
_float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "thirty_ctr", "sixty_ctr", "ninety_ctr", "history_ctr"]
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "show_tag_id", "device_fd", "content_fd",
"fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs", "fs1", "fs2", "fs3",
"device_ss", "content_ss", "ss1", "ss2", "ss3", "device_fp", "content_fp", "fp1", "fp2", "fp3", "device_sp", "content_sp",
"sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
]
def read_csv_data(dataset_path):
    tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
@@ -120,27 +130,20 @@ def join_features(device_df, tractate_df, cc_df):
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
]
# for col in drop_columns:
# if col in df.columns:
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1)
return df
def build_features(df):
# TODO
int_columns = ["active_days", "topic_num", "favor_num", "vote_num"]
float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
numeric_features = []
for col in (int_columns + float_columns):
if col in int_columns:
for col in (_int_columns + _float_columns):
if col in _int_columns:
numeric_features.append(
fc.bucketized_column(fc.numeric_column(col, dtype=tf.int64), boundaries=create_boundaries(df, col)))
else:
numeric_features.append(fc.bucketized_column(fc.numeric_column(col), boundaries=create_boundaries(df, col)))
# TODO
categorical_columns = [
_categorical_columns = [
"device_id", "active_type", "past_consume_ability_history", "potential_consume_ability_history",
"price_sensitive_history", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
"device_fd", "content_fd", "fd1", "fd2", "fd3", "device_sd", "content_sd", "sd1", "sd2", "sd3", "device_fs", "content_fs",
@@ -148,21 +151,18 @@ def build_features(df):
"device_sp", "content_sp", "sp1", "sp2", "sp3", "device_p", "content_p", "p1", "p2", "p3"
]
categorical_ignore_columns = []
categorical_features = []
for col in categorical_columns:
if col not in categorical_ignore_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
for col in _categorical_columns:
if col == "card_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 20000, dtype=tf.int64),
dimension=int(df[col].size**0.25)))
elif col == "device_id":
categorical_features.append(
fc.embedding_column(fc.categorical_column_with_hash_bucket(col, 200000), dimension=int(df[col].size**0.25)))
else:
categorical_features.append(
fc.indicator_column(fc.categorical_column_with_vocabulary_list(col, create_vocabulary_list(df, col))))
all_features = (numeric_features + categorical_features)
return all_features
......
@@ -25,6 +25,8 @@ def main():
    cc_df = click_fe.click_feature_engineering(tractate_click_df, tractate_conversion_df)
    df = tractate_fe.join_features(device_df, tractate_df, cc_df)
+    for i in df.columns:
+        print(i)
    # print(df.dtypes)
    train_df, test_df = train_test_split(df, test_size=0.2)
......
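For context only: the feature_column objects returned by build_features are typically consumed by a Keras input layer. A minimal sketch, assuming TensorFlow 2.x, a pandas DataFrame with a hypothetical binary label column named "label", and training code that is not part of this commit:

import tensorflow as tf


def df_to_dataset(df, label_col="label", batch_size=256, shuffle=True):
    # Turn a pandas DataFrame into a tf.data.Dataset of (feature dict, label) pairs.
    df = df.copy()
    labels = df.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    return ds.batch(batch_size)


def build_model(all_features):
    # DenseFeatures materializes the feature_column definitions (bucketized,
    # embedding and indicator columns) into a single dense input tensor.
    model = tf.keras.Sequential([
        tf.keras.layers.DenseFeatures(all_features),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])
    return model

Under those assumptions, main() would continue with something like model.fit(df_to_dataset(train_df), validation_data=df_to_dataset(test_df, shuffle=False)).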