get data from file

2217a2cd · 赵威 · 77fd440a · 2217a2cd · 2217a2cd · 2217a2cd
Commit 2217a2cd authored Nov 16, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 106 deletions

get_data.py personas_vector/get_data.py +0 -106

personas_dssm_model.py personas_vector/personas_dssm_model.py +34 -0

utils.py utils/utils.py +4 -0

No files found.
--- a/personas_vector/get_data.py
+++ b/personas_vector/get_data.py
@@ -12,112 +12,6 @@ from utils.spark import (get_click_data, get_device_tags, get_exposure_data, get
 base_dir = os.getcwd()
 DATA_PATH = os.path.join(base_dir, "_data")

-# def device_tractae_fe():
-#     click_df = get_df("tractate_click.csv")
-#     exposure_df = get_df("tractate_exposure.csv")
-#     device_fe_df = get_df("device_feature.csv")
-#     tractate_fe_df = get_df("tractate_feature.csv")
-#     print(click_df.shape)
-#     print(exposure_df.shape)
-#     print(device_fe_df.shape)
-#     print(tractate_fe_df.shape)
-
-#     #
-#     click_df.drop("partition_date", inplace=True, axis=1)
-#     exposure_df.drop("partition_date", inplace=True, axis=1)
-#     base_df = pd.merge(click_df, exposure_df, how="outer", indicator="Exist")
-#     base_df["label"] = np.where(base_df["Exist"] == "right_only", 0.75, 1.0)
-#     base_df.drop("Exist", inplace=True, axis=1)
-
-#     #
-#     device_fe_df.fillna("", inplace=True)
-#     device_fe_df.rename(columns={"cl_id": "device_id"}, inplace=True)
-
-#     device_fe_df["first_demands"] = device_fe_df["first_demands"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["second_demands"] = device_fe_df["second_demands"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["first_solutions"] = device_fe_df["first_solutions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["second_solutions"] = device_fe_df["second_solutions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["first_positions"] = device_fe_df["first_positions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["second_positions"] = device_fe_df["second_positions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     device_fe_df["projects"] = device_fe_df["projects"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-
-#     device_fe_df["device_fd"] = device_fe_df["first_demands"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_sd"] = device_fe_df["second_demands"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_fs"] = device_fe_df["first_solutions"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_ss"] = device_fe_df["second_solutions"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_fp"] = device_fe_df["first_positions"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_sp"] = device_fe_df["second_positions"].apply(lambda x: nth_element(x, 0))
-#     device_fe_df["device_p"] = device_fe_df["projects"].apply(lambda x: nth_element(x, 0))
-
-#     device_fe_df["device_fd2"] = device_fe_df["first_demands"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_sd2"] = device_fe_df["second_demands"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_fs2"] = device_fe_df["first_solutions"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_ss2"] = device_fe_df["second_solutions"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_fp2"] = device_fe_df["first_positions"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_sp2"] = device_fe_df["second_positions"].apply(lambda x: nth_element(x, 1))
-#     device_fe_df["device_p2"] = device_fe_df["projects"].apply(lambda x: nth_element(x, 1))
-#     _drop_columns = [
-#         "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions",
-#         "projects"
-#     ]
-#     device_fe_df.drop(columns=_drop_columns, axis=1, inplace=True)
-
-#     #
-#     _card_drop_columns = [
-#         "card_first_demands", "card_second_demands", "card_first_solutions", "card_second_solutions", "card_first_positions",
-#         "card_second_positions", "card_projects"
-#     ]
-#     tractate_fe_df[_card_drop_columns].fillna("", inplace=True)
-
-#     tractate_fe_df["card_first_demands"] = tractate_fe_df["card_first_demands"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_second_demands"] = tractate_fe_df["card_second_demands"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_first_solutions"] = tractate_fe_df["card_first_solutions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_second_solutions"] = tractate_fe_df["card_second_solutions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_first_positions"] = tractate_fe_df["card_first_positions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_second_positions"] = tractate_fe_df["card_second_positions"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-#     tractate_fe_df["card_projects"] = tractate_fe_df["card_projects"].str.split(",").\
-#         apply(lambda d: d if isinstance(d, list) else [])
-
-#     tractate_fe_df["card_fd"] = tractate_fe_df["card_first_demands"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_sd"] = tractate_fe_df["card_second_demands"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_fs"] = tractate_fe_df["card_first_solutions"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_ss"] = tractate_fe_df["card_second_solutions"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_fp"] = tractate_fe_df["card_first_positions"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_sp"] = tractate_fe_df["card_second_positions"].apply(lambda x: nth_element(x, 0))
-#     tractate_fe_df["card_p"] = tractate_fe_df["card_projects"].apply(lambda x: nth_element(x, 0))
-
-#     tractate_fe_df["card_fd2"] = tractate_fe_df["card_first_demands"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_sd2"] = tractate_fe_df["card_second_demands"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_fs2"] = tractate_fe_df["card_first_solutions"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_ss2"] = tractate_fe_df["card_second_solutions"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_fp2"] = tractate_fe_df["card_first_positions"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_sp2"] = tractate_fe_df["card_second_positions"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df["card_p2"] = tractate_fe_df["card_projects"].apply(lambda x: nth_element(x, 1))
-#     tractate_fe_df.drop(columns=_card_drop_columns, axis=1, inplace=True)
-
-#     #
-#     df = pd.merge(pd.merge(base_df, device_fe_df), tractate_fe_df)
-
-#     nullseries = df.isnull().sum()
-#     nulls = nullseries[nullseries > 0]
-#     if nulls.any():
-#         print(nulls)
-#         raise Exception("dataframe nulls")
-#     return df
-
 if __name__ == "__main__":
    spark = get_spark("personas_vector_data")
    card_type = "user_post"

--- a/personas_vector/personas_dssm_model.py
+++ b/personas_vector/personas_dssm_model.py
+import pandas as pd
+from utils.files import get_df
+from utils.utils import nth_element
+
+# Device-side column names; only the id column is listed so far.
+# NOTE(review): not referenced by the code visible in this diff - confirm intended use.
+DEVICE_COLUMNS = [
+    "device_id",
+]
+
+# Tractate (card) side column names; only the id column is listed so far.
+# NOTE(review): not referenced by the code visible in this diff - confirm intended use.
+TRACTATE_COLUMNS = [
+    "card_id",
+]
+
+
+def device_tractae_fe():
+    """Load the four personas CSV files and print the shape of each one.
+
+    The files are read through ``get_df`` in a fixed order (click, exposure,
+    device feature, tractate tags). All four are loaded before anything is
+    printed, so a failing load produces no partial output. Nothing is returned.
+    """
+    csv_names = (
+        "personas_tractate_click.csv",
+        "personas_tractate_exposure.csv",
+        "personas_device_feature.csv",
+        "personas_tractate_tags.csv",
+    )
+    # Materialize every frame first; printing only starts once all loads succeeded.
+    frames = [get_df(csv_name) for csv_name in csv_names]
+    for frame in frames:
+        print(frame.shape)
+
+
+if __name__ == "__main__":
+    # Script entry: load every personas CSV up front, report each shape in
+    # load order, then preview the first three rows of the tractate tags data.
+    loaded = {
+        csv_name: get_df(csv_name)
+        for csv_name in (
+            "personas_tractate_click.csv",
+            "personas_tractate_exposure.csv",
+            "personas_device_feature.csv",
+            "personas_tractate_tags.csv",
+        )
+    }
+    for frame in loaded.values():
+        print(frame.shape)
+    print(loaded["personas_tractate_tags.csv"].head(3))
--- a/utils/utils.py
+++ b/utils/utils.py
+def nth_element(lst, n):
+    """Return ``lst[n]``, or ``""`` when position ``n`` does not exist.
+
+    Args:
+        lst: A sized, indexable sequence (for example a list of tag strings).
+        n: Integer position. Negative values index from the end, as usual
+            for Python sequences.
+
+    Returns:
+        The element at position ``n``, or the empty string when ``n`` falls
+        outside ``[-len(lst), len(lst))``.
+    """
+    size = len(lst)
+    # Guard both ends: checking only ``n >= size`` lets a too-negative index
+    # through and raises IndexError instead of yielding the "" fallback.
+    if not -size <= n < size:
+        return ""
+    return lst[n]