Commit 566d56d6 authored by 赵威's avatar 赵威

get shape

parent 44fbe3c4
......@@ -153,12 +153,14 @@ def device_tractae_fe():
exposure_df = get_df("tractate_exposure.csv")
device_fe_df = get_df("device_feature.csv")
tractate_fe_df = get_df("tractate_feature.csv")
# print(click_df.head(3))
# print(exposure_df.head(3))
# print(device_fe_df.head(3))
# print(tractate_fe_df.head(3))
print(click_df.shape)
print(exposure_df.shape)
print(device_fe_df.shape)
print(tractate_fe_df.shape)
#
click_df.drop("partition_date", inplace=True, axis=1)
exposure_df.drop("partition_date", inplace=True, axis=1)
base_df = pd.merge(click_df, exposure_df, how="outer", indicator="Exist")
base_df["label"] = np.where(base_df["Exist"] == "right_only", 0.5, 1.0)
base_df.drop("Exist", inplace=True, axis=1)
......@@ -245,6 +247,8 @@ def device_tractae_fe():
#
df = pd.merge(pd.merge(base_df, device_fe_df), tractate_fe_df)
# a = pd.merge(base_df, tractate_fe_df, how="left", left_on="card_id", right_on="card_id")
nullseries = df.isnull().sum()
nulls = nullseries[nullseries > 0]
if nulls.any():
......@@ -255,4 +259,4 @@ def device_tractae_fe():
if __name__ == "__main__":
df = device_tractae_fe()
print(df.head(3))
print(df.head(3), df.shape)
......@@ -407,14 +407,18 @@ if __name__ == "__main__":
click_df = get_click_data(spark, card_type, start, end)
save_df_to_csv(click_df, "tractate_click.csv")
print(click_df.shape)
exposure_df = get_exposure_data(spark, card_type, start, end)
save_df_to_csv(exposure_df, "tractate_exposure.csv")
print(exposure_df.shape)
tractate_feature_df = get_card_feature_df(spark, card_type, end)
save_df_to_csv(tractate_feature_df, "tractate_feature.csv")
print(tractate_feature_df.shape)
device_feature_df = get_device_tags(spark)
save_df_to_csv(device_feature_df, "device_feature.csv")
print(device_feature_df.shape)
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/dssm/get_tractate_data.py
Markdown is supported
Attach a file by drag &amp; drop or click to upload
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment