Commit a704e2fd authored by 张彦钊

add distinct

parent 42a5684a
......@@ -19,7 +19,7 @@ def app_list_func(x,l):
def multi_hot(df,column,n):
v = set(df.select(column).rdd.map(lambda x: x[0]).collect())
v = df.select(column).distinct().rdd.map(lambda x: x[0]).collect()
app_list_value = [i.split(",") for i in v]
app_list_unique = []
for i in app_list_value:
......@@ -81,7 +81,7 @@ def feature_engineer():
unique_values = []
for i in features:
unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect())))
unique_values.extend(df.select(i).distinct().rdd.map(lambda x: x[0]).collect())
temp = list(range(2 + apps_number + level2_number + level3_number,
2 + apps_number + level2_number + level3_number + len(unique_values)))
value_map = dict(zip(unique_values, temp))
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment