change test file

2fe3223d · 张彦钊 · 01fe42f0 · 2fe3223d · 2fe3223d
Commit 2fe3223d authored May 24, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 3 deletions

feature_engineering.py tensnsorflow/feature_engineering.py +2 -2

multi.py tensnsorflow/multi.py +8 -1

No files found.
--- a/tensnsorflow/feature_engineering.py
+++ b/tensnsorflow/feature_engineering.py
@@ -21,7 +21,7 @@ def app_list_func(x,l):
 def multi_hot(df,column,n):
    a = time.time()
-    v = df.select(column).distinct().rdd.map(lambda x: x[0]).collect()
+    v = list(set(df.select(column).rdd.map(lambda x: x[0]).collect()))
    b = time.time()
    print(column)
    print("cost time 分钟")
@@ -86,7 +86,7 @@ def feature_engineer():
    unique_values = []
    for i in features:
        a = time.time()
-        unique_values.extend(df.select(i).distinct().rdd.map(lambda x: x[0]).collect())
+        unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect())))
        b = time.time()
        print(i)
        print((b-a)/60)

--- a/tensnsorflow/multi.py
+++ b/tensnsorflow/multi.py
@@ -150,7 +150,14 @@ if __name__ == '__main__':
    # [path + "tr/part-r-00000"]
    import subprocess
+    spark = SparkSession.builder.getOrCreate()
+    b = [("a", 1), ("a", 1), ("b", 3), ("a", 2)]
+    rdd = spark.sparkContext.parallelize(b)
+    df = spark.createDataFrame(rdd).toDF("id", "n")
+    df.show()
+    df.createOrReplaceTempView("df")
+    t = spark.sql("select id from df").map()
+    print(t)