Commit a2049b7f authored by 张彦钊

修改测试文件

parent 6122a37d
......@@ -18,15 +18,15 @@ def app_list_func(x,l):
return ",".join([str(j) for j in e])
# multi_hot: gather the distinct comma-separated tokens that occur in `column`,
# give each one an integer id starting at `n`, and re-encode the column through
# app_list_func using that mapping.
# Returns a tuple: (count of distinct tokens, dict mapping token -> id).
# NOTE(review): this span is a rendered diff whose +/- markers and indentation
# were lost. Each `df[column]` statement is directly followed by a
# `df.select[column]` variant -- presumably the removed/added pair of the
# commit; confirm against the repository before treating both as live code.
def multi_hot(df,column,n):
# Replace missing values with the placeholder "lost_na" before splitting.
df[column] = df[column].fillna("lost_na")
# One list of tokens per distinct raw value of the column.
app_list_value = [i.split(",") for i in df[column].unique()]
# NOTE(review): `df.select[column]` subscripts the `select` attribute instead
# of calling it. If df is a Spark DataFrame (the caller builds it with
# spark.sql(...) and .join(...)), `select` is a method and subscripting it
# would raise TypeError -- presumably `df.select(column)` was intended; verify.
# This form also rebinds only the local name `df`, whereas the `df[column] = ...`
# form above assigns into the object that was passed in.
df = df.select[column].fillna("lost_na")
# NOTE(review): `.unique()` looks like the pandas Series API; confirm it exists
# on whatever `select` returns here.
app_list_value = [i.split(",") for i in df.select[column].unique()]
# Flatten the per-value token lists into a single list.
app_list_unique = []
for i in app_list_value:
app_list_unique.extend(i)
# De-duplicate; set iteration order is arbitrary, so which token receives which
# id below is not guaranteed to be the same from run to run.
app_list_unique = list(set(app_list_unique))
number = len(app_list_unique)
# Ids are the consecutive integers n, n+1, ..., n+number-1.
app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
# Re-encode every cell via app_list_func, passing the mapping as its extra argument.
df[column] = df[column].apply(app_list_func, args=(app_list_map,))
# NOTE(review): same `select[...]` subscripting concern as above; the result is
# bound to the local `df` and is not returned, so the caller would not see it.
df = df.select[column].apply(app_list_func, args=(app_list_map,))
return number,app_list_map
def feature_engineer():
......@@ -75,11 +75,13 @@ def feature_engineer():
hospital = spark.sql(sql)
df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
print(df.count())
df = df.drop("level2").drop("diary_service_id")
df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
print(df.count())
multi_hot(df, "app_list", 1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment