Commit d8a64a73 authored by 张彦钊

修改测试文件

parent 9350f8f1
......@@ -18,9 +18,7 @@ def app_list_func(x,l):
return ",".join([str(j) for j in e])
def multi_hot(df,column,n):
df = df.select(column).fillna("lost_na")
df.show(6)
app_list_value = [i.split(",") for i in df.select(column).unique()]
app_list_value = [i.split(",") for i in df.select(column).collect().unique()]
app_list_unique = []
for i in app_list_value:
app_list_unique.extend(i)
......@@ -80,8 +78,10 @@ def feature_engineer():
df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
multi_hot(df, "app_list", 1)
df = df.fillna("na")
df = df.toPandas()
print(df.shape)
print(df.head(6))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment