pandas 映射

b0fa9e51 · 张彦钊 · 6908cb9b · b0fa9e51
Commit b0fa9e51 authored Jan 15, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 5 deletions

feature_engineering.py tensnsorflow/feature_engineering.py +51 -5

No files found.
--- a/tensnsorflow/feature_engineering.py
+++ b/tensnsorflow/feature_engineering.py
@@ -57,18 +57,64 @@ def get_data():
    print(df.head(2))

    value_map = {v: k for k, v in enumerate(unique_values)}
+
+    df = df.drop("device_id", axis=1)
+    train = df[df["stat_date"] != validate_date]
+    test = df[df["stat_date"] == validate_date]
    for i in features:
-        df[i] = df[i].map(value_map)
-    print("类别总数")
-    print(len(value_map))
-    print(df.head(2))
+        train[i] = train[i].map(value_map)
+        test[i] = test[i].map(value_map)
+
+    train.to_csv(path + "tr.csv", index=False)
+    test.to_csv(path + "va.csv", index=False)
+
+    return validate_date,value_map
+
+
+def get_predict(date,value_map):
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
+          "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \
+          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
+          "left join cid_type_top c on e.device_id = c.device_id " \
+          "left join diary_feat df on e.cid_id = df.diary_id " \
+          "left join cid_time_cut cut on e.cid_id = cut.cid"
+    df = con_sql(db, sql)
+    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
+                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
+                            11: "device_id", 12: "cid_id", 13: "time"})
+
+    df["stat_date"] = date
+
+    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
+                "channel", "top", "level2_ids", "time", "stat_date"]
+    for i in features:
+        df[i] = df[i].astype("str")
+        df[i] = df[i].fillna("lost")
+        df[i] = df[i] + i

+    native_pre = df[df["label"] == "0"]
+    native_pre = native_pre.drop("label", axis=1)
+    nearby_pre = df[df["label"] == "1"]
+    nearby_pre = nearby_pre.drop("label", axis=1)
+
+    for i in features:
+        native_pre[i] = native_pre[i].map(value_map)
+        nearby_pre[i] = nearby_pre[i].map(value_map)

+    print("native")
+    print(native_pre.shape)
+    native_pre.to_csv(path + "native.csv", index=False)

+    print("nearby")
+    print(nearby_pre.shape)
+    nearby_pre.to_csv(path + "nearby.csv",index=False)


 if __name__ == '__main__':
-    get_data()
+    path = "/home/gmuser/esmm_data/"
+    date,value = get_data()
+    get_predict(date, value)