get dataframe

44fbe3c4 · 赵威 · c8b0b7c0 · 44fbe3c4 · 44fbe3c4 · 44fbe3c4
Commit 44fbe3c4 authored Nov 05, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 264 additions and 3 deletions

.gitignore .gitignore +3 -0

dssm_model.py dssm/dssm_model.py +258 -0

get_tractate_data.py dssm/get_tractate_data.py +3 -3

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -108,3 +108,5 @@ diary_before_cover_vec.txt

 _index/*
 ! _index/.gitkeep
+
+_data/*
\ No newline at end of file
--- a/dssm/dssm_model.py
+++ b/dssm/dssm_model.py
+import os
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from tensorflow.keras import activations, layers, losses, metrics, optimizers
+
+# Root directory for data and model artifacts: the process's current working
+# directory at import time.
+base_dir = os.getcwd()
+# base_dir = "/Users/offic/work/GM/strategy_embedding/"  # TODO remove
+# Directory the CSV inputs are read from (see get_df).
+DATA_PATH = os.path.join(base_dir, "_data")
+# Directory for models; not referenced in the visible part of this file.
+MODEL_PATH = os.path.join(base_dir, "_models")
+
+# Device-side column names: the id plus the first-tag ("device_fd" ...
+# "device_p") and second-tag ("device_fd2" ... "device_p2") columns that
+# device_tractae_fe derives from the comma-separated tag lists.
+DEVICE_COLUMNS = [
+    "device_id",
+    "device_fd",
+    "device_sd",
+    "device_fs",
+    "device_ss",
+    "device_fp",
+    "device_sp",
+    "device_p",
+    "device_fd2",
+    "device_sd2",
+    "device_fs2",
+    "device_ss2",
+    "device_fp2",
+    "device_sp2",
+    "device_p2",
+]
+
+# Name of the label column that device_tractae_fe adds to the merged frame.
+LABEL_COLUMNS = "label"
+
+# Tractate (card) side column names. The trailing "card_fd" ... "card_p2"
+# entries are the tag columns derived in device_tractae_fe; the remaining
+# names presumably match columns of tractate_feature.csv -- TODO confirm.
+TRACTATE_COLUMNS = [
+    "card_id",
+    "is_pure_author",
+    "is_have_pure_reply",
+    "is_have_reply",
+    "content_level",
+    "topic_seven_click_num",
+    "topic_thirty_click_num",
+    "topic_num",
+    "seven_transform_num",
+    "thirty_transform_num",
+    "favor_num",
+    "favor_pure_num",
+    "vote_num",
+    "vote_display_num",
+    "reply_num",
+    "reply_pure_num",
+    "one_click_num",
+    "three_click_num",
+    "seven_click_num",
+    "fifteen_click_num",
+    "thirty_click_num",
+    "sixty_click_num",
+    "ninety_click_num",
+    "history_click_num",
+    "one_precise_exposure_num",
+    "three_precise_exposure_num",
+    "seven_precise_exposure_num",
+    "fifteen_precise_exposure_num",
+    "thirty_precise_exposure_num",
+    "sixty_precise_exposure_num",
+    "ninety_precise_exposure_num",
+    "history_precise_exposure_num",
+    "one_vote_user_num",
+    "three_vote_user_num",
+    "seven_vote_user_num",
+    "fifteen_vote_user_num",
+    "thirty_vote_user_num",
+    "sixty_vote_user_num",
+    "ninety_vote_user_num",
+    "history_vote_user_num",
+    "one_reply_user_num",
+    "three_reply_user_num",
+    "seven_reply_user_num",
+    "fifteen_reply_user_num",
+    "thirty_reply_user_num",
+    "sixty_reply_user_num",
+    "ninety_reply_user_num",
+    "history_reply_user_num",
+    "one_browse_user_num",
+    "three_browse_user_num",
+    "seven_browse_user_num",
+    "fifteen_browse_user_num",
+    "thirty_browse_user_num",
+    "sixty_browse_user_num",
+    "ninety_browse_user_num",
+    "history_browse_user_num",
+    "one_reply_num",
+    "three_reply_num",
+    "seven_reply_num",
+    "fifteen_reply_num",
+    "thirty_reply_num",
+    "sixty_reply_num",
+    "ninety_reply_num",
+    "history_reply_num",
+    "one_ctr",
+    "three_ctr",
+    "seven_ctr",
+    "fifteen_ctr",
+    "thirty_ctr",
+    "sixty_ctr",
+    "ninety_ctr",
+    "history_ctr",
+    "one_vote_pure_rate",
+    "three_vote_pure_rate",
+    "seven_vote_pure_rate",
+    "fifteen_vote_pure_rate",
+    "thirty_vote_pure_rate",
+    "sixty_vote_pure_rate",
+    "ninety_vote_pure_rate",
+    "history_vote_pure_rate",
+    "one_reply_pure_rate",
+    "three_reply_pure_rate",
+    "seven_reply_pure_rate",
+    "fifteen_reply_pure_rate",
+    "thirty_reply_pure_rate",
+    "sixty_reply_pure_rate",
+    "ninety_reply_pure_rate",
+    "history_reply_pure_rate",
+    "card_fd",
+    "card_sd",
+    "card_fs",
+    "card_ss",
+    "card_fp",
+    "card_sp",
+    "card_p",
+    "card_fd2",
+    "card_sd2",
+    "card_fs2",
+    "card_ss2",
+    "card_fp2",
+    "card_sp2",
+    "card_p2",
+]
+
+
+def nth_element(lst, n):
+    """Return ``lst[n]``, or an empty string when ``n`` is past the end.
+
+    Args:
+        lst: Sequence to index into.
+        n: Position to read.
+
+    Returns:
+        The element at position ``n``, or ``""`` if ``n >= len(lst)``.
+    """
+    return lst[n] if n < len(lst) else ""
+
+
+def get_df(file):
+    """Load a pipe-separated CSV from the data directory.
+
+    Args:
+        file: File name relative to ``DATA_PATH``.
+
+    Returns:
+        The ``pandas.DataFrame`` parsed from that file with ``sep="|"``.
+    """
+    return pd.read_csv(os.path.join(DATA_PATH, file), sep="|")
+
+
+# Raw comma-separated tag columns paired with the short name used for the
+# columns derived from them. Order matters: it fixes the column order of the
+# frames produced by _expand_tag_columns.
+_TAG_COLUMNS = (
+    ("first_demands", "fd"),
+    ("second_demands", "sd"),
+    ("first_solutions", "fs"),
+    ("second_solutions", "ss"),
+    ("first_positions", "fp"),
+    ("second_positions", "sp"),
+    ("projects", "p"),
+)
+
+
+def _expand_tag_columns(df, source_prefix, target_prefix):
+    """Replace each comma-separated tag column of ``df`` with its first two tags.
+
+    For every ``(name, short)`` pair in ``_TAG_COLUMNS`` the column
+    ``source_prefix + name`` is split on "," and two columns are added:
+    ``target_prefix + short`` (first tag) and ``target_prefix + short + "2"``
+    (second tag). A missing tag becomes "". All first-tag columns are added
+    before any second-tag column, then the raw columns are dropped.
+
+    Args:
+        df: Frame to modify in place.
+        source_prefix: Prefix of the raw tag column names ("" or "card_").
+        target_prefix: Prefix of the derived column names.
+    """
+    sources = [source_prefix + name for name, _ in _TAG_COLUMNS]
+    for column in sources:
+        # Fill by assignment: fillna(inplace=True) on a column selection only
+        # touches a temporary copy, and an all-NaN (float) column would make
+        # the .str accessor raise.
+        tags = df[column].fillna("").str.split(",")
+        # Anything that did not split into a list is treated as "no tags".
+        df[column] = tags.apply(lambda d: d if isinstance(d, list) else [])
+
+    for position, suffix in ((0, ""), (1, "2")):
+        for column, (_, short) in zip(sources, _TAG_COLUMNS):
+            df[target_prefix + short + suffix] = df[column].apply(
+                lambda tags, i=position: nth_element(tags, i))
+
+    df.drop(columns=sources, inplace=True)
+
+
+def device_tractae_fe():
+    """Build the merged click/exposure + device + tractate feature frame.
+
+    Reads ``tractate_click.csv``, ``tractate_exposure.csv``,
+    ``device_feature.csv`` and ``tractate_feature.csv`` through ``get_df``.
+
+    Returns:
+        A ``pandas.DataFrame`` joining the labelled click/exposure rows with
+        the device features and the tractate features.
+
+    Raises:
+        ValueError: If the merged frame contains any null value. The per-column
+            null counts are printed first.
+    """
+    click_df = get_df("tractate_click.csv")
+    exposure_df = get_df("tractate_exposure.csv")
+    device_fe_df = get_df("device_feature.csv")
+    tractate_fe_df = get_df("tractate_feature.csv")
+
+    # Outer-join clicks with exposures on their shared columns. Rows found
+    # only in the exposure data get label 0.5, every other row gets 1.0.
+    base_df = pd.merge(click_df, exposure_df, how="outer", indicator="Exist")
+    base_df["label"] = np.where(base_df["Exist"] == "right_only", 0.5, 1.0)
+    base_df.drop("Exist", inplace=True, axis=1)
+
+    # Device features: blank out every missing value, expose cl_id under the
+    # name device_id, then expand the tag lists into device_* columns.
+    device_fe_df.fillna("", inplace=True)
+    device_fe_df.rename(columns={"cl_id": "device_id"}, inplace=True)
+    _expand_tag_columns(device_fe_df, "", "device_")
+
+    # Tractate features: only the card_* tag columns are filled and expanded.
+    _expand_tag_columns(tractate_fe_df, "card_", "card_")
+
+    df = pd.merge(pd.merge(base_df, device_fe_df), tractate_fe_df)
+
+    # Refuse to return a frame with nulls; report which columns are affected.
+    null_counts = df.isnull().sum()
+    null_counts = null_counts[null_counts > 0]
+    if not null_counts.empty:
+        print(null_counts)
+        raise ValueError("dataframe nulls")
+    return df
+
+
+# Script entry point: build the merged feature frame and print its first rows.
+if __name__ == "__main__":
+    df = device_tractae_fe()
+    print(df.head(3))
--- a/dssm/get_tractate_data.py
+++ b/dssm/get_tractate_data.py
@@ -406,13 +406,13 @@ if __name__ == "__main__":
    start, end = get_ndays_before_no_minus(days), get_ndays_before_no_minus(1)

    click_df = get_click_data(spark, card_type, start, end)
-    save_df_to_csv(click_df, "tracate_click.csv")
+    save_df_to_csv(click_df, "tractate_click.csv")

    exposure_df = get_exposure_data(spark, card_type, start, end)
-    save_df_to_csv(exposure_df, "tracate_exposure.csv")
+    save_df_to_csv(exposure_df, "tractate_exposure.csv")

    tractate_feature_df = get_card_feature_df(spark, card_type, end)
-    save_df_to_csv(tractate_feature_df, "tracate_feature.csv")
+    save_df_to_csv(tractate_feature_df, "tractate_feature.csv")

    device_feature_df = get_device_tags(spark)
    save_df_to_csv(device_feature_df, "device_feature.csv")