Commit a704e2fd authored by 张彦钊

add distinct

parent 42a5684a
...@@ -19,7 +19,7 @@ def app_list_func(x,l): ...@@ -19,7 +19,7 @@ def app_list_func(x,l):
def multi_hot(df,column,n): def multi_hot(df,column,n):
v = set(df.select(column).rdd.map(lambda x: x[0]).collect()) v = df.select(column).distinct().rdd.map(lambda x: x[0]).collect()
app_list_value = [i.split(",") for i in v] app_list_value = [i.split(",") for i in v]
app_list_unique = [] app_list_unique = []
for i in app_list_value: for i in app_list_value:
...@@ -81,7 +81,7 @@ def feature_engineer(): ...@@ -81,7 +81,7 @@ def feature_engineer():
unique_values = [] unique_values = []
for i in features: for i in features:
unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect()))) unique_values.extend(df.select(i).distinct().rdd.map(lambda x: x[0]).collect())
temp = list(range(2 + apps_number + level2_number + level3_number, temp = list(range(2 + apps_number + level2_number + level3_number,
2 + apps_number + level2_number + level3_number + len(unique_values))) 2 + apps_number + level2_number + level3_number + len(unique_values)))
value_map = dict(zip(unique_values, temp)) value_map = dict(zip(unique_values, temp))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment