Commit 42f6ddfb authored by 张彦钊

把top特征和l1、l2特征合并

parent 952ee419
...@@ -50,18 +50,19 @@ def get_data(): ...@@ -50,18 +50,19 @@ def get_data():
unique_values = [] unique_values = []
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date"] "channel", "top", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
# 下面这行代码是为了区分不同的列中有相同的值 # 下面这行代码是为了区分不同的列中有相同的值
df[i] = df[i] + i df[i] = df[i] + i
unique_values.extend(list(df[i].unique())) unique_values.extend(list(df[i].unique()))
df["l2"] = df["l2"].astype("str") for i in ["l1","l2"]:
df["l2"] = df["l2"].fillna("lost") df[i] = df[i].astype("str")
# l1和l2中的值允许相同 df[i] = df[i].fillna("lost")
df["l2"] = df["l2"]+"l1" # l1和l2中的值与top类别是一个类别
unique_values.extend(list(df["l2"].unique())) df[i] = df[i]+"top"
unique_values.extend(list(df[i].unique()))
print("features:") print("features:")
print(len(unique_values)) print(len(unique_values))
...@@ -78,7 +79,6 @@ def get_data(): ...@@ -78,7 +79,6 @@ def get_data():
train[i] = train[i].map(value_map) train[i] = train[i].map(value_map)
test[i] = test[i].map(value_map) test[i] = test[i].map(value_map)
print("train shape") print("train shape")
print(train.shape) print(train.shape)
print("test shape") print("test shape")
...@@ -120,14 +120,16 @@ def get_predict(date,value_map): ...@@ -120,14 +120,16 @@ def get_predict(date,value_map):
print(df.shape) print(df.shape)
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date"] "channel", "top", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
df[i] = df[i] + i df[i] = df[i] + i
df["l2"] = df["l2"].astype("str") for i in ["l1","l2"]:
df["l2"] = df["l2"].fillna("lost") df[i] = df[i].astype("str")
df["l2"] = df["l2"] + "l1" df[i] = df[i].fillna("lost")
# l1和l2中的值与top类别是一个类别
df[i] = df[i]+"top"
native_pre = df[df["label"] == 0] native_pre = df[df["label"] == 0]
native_pre = native_pre.drop("label", axis=1) native_pre = native_pre.drop("label", axis=1)
...@@ -147,10 +149,12 @@ def get_predict(date,value_map): ...@@ -147,10 +149,12 @@ def get_predict(date,value_map):
print("native") print("native")
print(native_pre.shape) print(native_pre.shape)
print(native_pre.head())
write_csv(native_pre, "native",200000) write_csv(native_pre, "native",200000)
print("nearby") print("nearby")
print(nearby_pre.shape) print(nearby_pre.shape)
print(nearby_pre.head())
write_csv(nearby_pre, "nearby", 160000) write_csv(nearby_pre, "nearby", 160000)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment