Commit 2fe3223d authored by 张彦钊

change test file

parent 01fe42f0
......@@ -21,7 +21,7 @@ def app_list_func(x,l):
def multi_hot(df,column,n):
a = time.time()
v = df.select(column).distinct().rdd.map(lambda x: x[0]).collect()
v = list(set(df.select(column).rdd.map(lambda x: x[0]).collect()))
b = time.time()
print(column)
print("cost time 分钟")
......@@ -86,7 +86,7 @@ def feature_engineer():
unique_values = []
for i in features:
a = time.time()
unique_values.extend(df.select(i).distinct().rdd.map(lambda x: x[0]).collect())
unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect())))
b = time.time()
print(i)
print((b-a)/60)
......
......@@ -150,7 +150,14 @@ if __name__ == '__main__':
# [path + "tr/part-r-00000"]
import subprocess
spark = SparkSession.builder.getOrCreate()
b = [("a", 1), ("a", 1), ("b", 3), ("a", 2)]
rdd = spark.sparkContext.parallelize(b)
df = spark.createDataFrame(rdd).toDF("id", "n")
df.show()
df.createOrReplaceTempView("df")
t = spark.sql("select id from df").map()
print(t)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment