修改测试文件

6122a37d · 张彦钊 · 2f8b47e9 · 6122a37d
Commit 6122a37d authored Apr 25, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 12 deletions

multi.py tensnsorflow/multi.py +23 -12

No files found.
--- a/tensnsorflow/multi.py
+++ b/tensnsorflow/multi.py
@@ -7,6 +7,28 @@ from pyspark.sql import SparkSession
 import datetime
 import pandas as pd
+def app_list_func(x,l):
+    """Translate a comma-separated string of tokens into a comma-separated string of mapped values.
+
+    Args:
+        x: String whose items are separated by ",".
+        l: Mapping from token to replacement value; must support ``.keys()`` and indexing.
+
+    Returns:
+        A string with one entry per token of ``x``, in the original order, joined by ",".
+        Tokens that are not keys of ``l`` are written as 0.
+    """
+    b = x.split(",")
+    e = []
+    for i in b:
+        if i in l.keys():
+            e.append(l[i])
+        else:
+            # Token has no entry in the mapping: emit the placeholder 0.
+            e.append(0)
+    # str() each value because str.join only accepts strings.
+    return ",".join([str(j) for j in e])
+def multi_hot(df,column,n):
+    """Replace the comma-separated tokens in ``df[column]`` with integer ids, in place.
+
+    Args:
+        df: Table whose ``column`` holds comma-separated strings; it is modified in place.
+            NOTE(review): looks like a pandas DataFrame (fillna/unique/apply) - confirm.
+        column: Name of the column to encode.
+        n: First id to hand out; ids run from ``n`` to ``n + number - 1``.
+
+    Returns:
+        A tuple ``(number, app_list_map)``: the count of distinct tokens found and the
+        dict mapping each token to its assigned id.
+    """
+    # Missing values become the literal token "lost_na" so they can be split like any other value.
+    df[column] = df[column].fillna("lost_na")
+    app_list_value = [i.split(",") for i in df[column].unique()]
+    # Flatten the per-row token lists into one list, then de-duplicate it.
+    app_list_unique = []
+    for i in app_list_value:
+        app_list_unique.extend(i)
+    # NOTE: list(set(...)) has no guaranteed order, so which token receives which id is arbitrary.
+    app_list_unique = list(set(app_list_unique))
+    number = len(app_list_unique)
+    app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
+    # Rewrite every cell as a comma-separated string of ids via app_list_func.
+    df[column] = df[column].apply(app_list_func, args=(app_list_map,))
+    return number,app_list_map
 def feature_engineer():
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from esmm_train_data"
@@ -54,7 +76,7 @@ def feature_engineer():
    df = df.join(hospital,"diary_service_id","left_outer").fillna("na")
    print(df.count())
-    df = df.drop(["level2","diary_service_id"])
+    df = df.drop("level2").drop("diary_service_id")
    df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
                              "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
    print(df.count())
@@ -62,18 +84,7 @@ def feature_engineer():
-    # df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
-    #                         6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
-    #                         11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2"})
-    #
-    #
-    # df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
-    #                          "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
-    #
-    # print("after")
-    # print(df.shape)
    # app_list_number, app_list_map = multi_hot(df, "app_list", 2)
    # level2_number, level2_map = multi_hot(df, "clevel2_id", 2 + app_list_number)
    # level3_number, level3_map = multi_hot(df, "level3_ids", 2 + app_list_number + level2_number)