Commit 2fe3223d authored by 张彦钊

change test file

parent 01fe42f0
...@@ -21,7 +21,7 @@ def app_list_func(x,l): ...@@ -21,7 +21,7 @@ def app_list_func(x,l):
def multi_hot(df,column,n): def multi_hot(df,column,n):
a = time.time() a = time.time()
v = df.select(column).distinct().rdd.map(lambda x: x[0]).collect() v = list(set(df.select(column).rdd.map(lambda x: x[0]).collect()))
b = time.time() b = time.time()
print(column) print(column)
print("cost time 分钟") print("cost time 分钟")
...@@ -86,7 +86,7 @@ def feature_engineer(): ...@@ -86,7 +86,7 @@ def feature_engineer():
unique_values = [] unique_values = []
for i in features: for i in features:
a = time.time() a = time.time()
unique_values.extend(df.select(i).distinct().rdd.map(lambda x: x[0]).collect()) unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect())))
b = time.time() b = time.time()
print(i) print(i)
print((b-a)/60) print((b-a)/60)
......
...@@ -150,7 +150,14 @@ if __name__ == '__main__': ...@@ -150,7 +150,14 @@ if __name__ == '__main__':
# [path + "tr/part-r-00000"] # [path + "tr/part-r-00000"]
import subprocess import subprocess
# Scratch check: build a tiny DataFrame containing duplicate ids, register it
# as a temp view, and print the values of its "id" column.
spark = SparkSession.builder.getOrCreate()
b = [("a", 1), ("a", 1), ("b", 3), ("a", 2)]
rdd = spark.sparkContext.parallelize(b)
df = spark.createDataFrame(rdd).toDF("id", "n")
df.show()
df.createOrReplaceTempView("df")
# A PySpark DataFrame has no .map(); go through the underlying RDD and pass the
# row mapper explicitly (same pattern as the other collectors in this file),
# then collect so that print() shows the actual values.
t = spark.sql("select id from df").rdd.map(lambda x: x[0]).collect()
print(t)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment