修改测试文件

ca8dbaa3 · 张彦钊 · 44fd59df · ca8dbaa3
Commit ca8dbaa3 authored Apr 25, 2019 by 张彦钊
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 27 additions and 36 deletions

multi.py tensnsorflow/multi.py +27 -36

No files found.
--- a/tensnsorflow/multi.py
+++ b/tensnsorflow/multi.py
@@ -62,52 +62,43 @@ def feature_engineer():

    df = spark.sql(sql)

-    url = "jdbc:mysql://172.16.30.143:3306/zhengxing"
-    jdbcDF = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
-        .option("dbtable", "api_service").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
-    jdbcDF.createOrReplaceTempView("api_service")
-    jdbc = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
-        .option("dbtable", "api_doctor").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
-    jdbc.createOrReplaceTempView("api_doctor")
-
-    sql = "select s.id as diary_service_id,d.hospital_id " \
-          "from api_service s left join api_doctor d on s.doctor_id = d.id"
-    hospital = spark.sql(sql)
-
-    df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
-    df = df.drop("level2").drop("diary_service_id")
-    df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
-                              "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
-
-    df = df.fillna("na")
-    v = df.select("app_list").rdd.map(lambda x:x[0]).collect()
-    print(type(v))
-    print(v[:2])
-
-    # app_list_value = [i.split(",") for i in v]
+
+
+    # url = "jdbc:mysql://172.16.30.143:3306/zhengxing"
+    # jdbcDF = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
+    #     .option("dbtable", "api_service").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
+    # jdbcDF.createOrReplaceTempView("api_service")
+    # jdbc = spark.read.format("jdbc").option("driver", "com.mysql.jdbc.Driver").option("url", url) \
+    #     .option("dbtable", "api_doctor").option("user", 'work').option("password", 'BJQaT9VzDcuPBqkd').load()
+    # jdbc.createOrReplaceTempView("api_doctor")
    #
-    # app_list_unique = []
-    # for i in app_list_value:
-    #     app_list_unique.extend(i)
-    # app_list_unique = list(set(app_list_unique))
-    # number = len(app_list_unique)
-    # app_list_map = dict(zip(app_list_unique, list(range(1, number + 1))))
+    # sql = "select s.id as diary_service_id,d.hospital_id " \
+    #       "from api_service s left join api_doctor d on s.doctor_id = d.id"
+    # hospital = spark.sql(sql)
    #
-    # df = df.select("app_list","ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer","channel",
-    #                "top", "time", "stat_date", "hospital_id", "level3_ids","y","z",
-    #                "treatment_method","price_min","price_max","treatment_time","maintain_time","recover_time")\
-    #     .map(lambda x :app_list_func(x[0],app_list_map))
+    # df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
+    # df = df.drop("level2").drop("diary_service_id")
+    # df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
+    #                           "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
    #
-    # df.show(6)
-
+    # df = df.fillna("na")

+    v = set(df.select("app_list").rdd.map(lambda x: x[0]).collect())

+    app_list_value = [i.split(",") for i in v]

+    app_list_unique = []
+    for i in app_list_value:
+        app_list_unique.extend(i)
+    app_list_unique = list(set(app_list_unique))
+    number = len(app_list_unique)
+    app_list_map = dict(zip(app_list_unique, list(range(1, number + 1))))

+    print(df.select("app_list","stat_date").rdd.map(lambda x:(app_list_func(x[0],app_list_map),x[1])).first())



-    # app_list_number, app_list_map = multi_hot(df, "app_list", 2)
+# app_list_number, app_list_map = multi_hot(df, "app_list", 2)
    # level2_number, level2_map = multi_hot(df, "clevel2_id", 2 + app_list_number)
    # level3_number, level3_map = multi_hot(df, "level3_ids", 2 + app_list_number + level2_number)
    #