multi hot insert database

2ff68205 · 张彦钊 · f45e1013 · 2ff68205 · 2ff68205
Commit 2ff68205 authored Jan 17, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 28 additions and 14 deletions

feature_engineering.py tensnsorflow/feature_engineering.py +27 -13

multi_hot.py tensnsorflow/multi_hot.py +1 -1

No files found.
--- a/tensnsorflow/feature_engineering.py
+++ b/tensnsorflow/feature_engineering.py
@@ -2,6 +2,7 @@ import pandas as pd
 import pymysql
 import datetime
 def con_sql(db,sql):
    cursor = db.cursor()
    try:
@@ -26,7 +27,7 @@ def get_data():
    print(start)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
-          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,cl.l3,e.device_id,cut.time " \
+          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
          "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
@@ -36,24 +37,32 @@ def get_data():
    # print(df.shape)
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11: "l2",
-                            12: "l3",13: "device_id", 14: "time"})
+                            12: "device_id", 13: "time"})
    print("esmm data ok")
    # print(df.head(2)
    print("before")
    print(df.shape)
    print("after")
    df = df.drop_duplicates()
-    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
+    df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                             "channel", "top", "level2_ids", "time", "stat_date"]
+                             "channel", "top", "l1","l2", "time", "stat_date"])
-    df = df.drop_duplicates(features)
    print(df.shape)
    unique_values = []
+    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
+                "channel", "top", "l1", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))
+    df["l2"] = df["l2"].astype("str")
+    df["l2"] = df["l2"].fillna("lost")
+    df["l2"] = df["l2"]+"l1"
+    unique_values.extend(list(df["l2"].unique()))
+    print("features:")
+    print(len(unique_values))
    print(df.head(2))
    temp = list(range(1,len(unique_values)+1))
@@ -62,7 +71,8 @@ def get_data():
    df = df.drop("device_id", axis=1)
    train = df[df["stat_date"] != validate_date+"stat_date"]
    test = df[df["stat_date"] == validate_date+"stat_date"]
-    for i in features:
+    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
+                "channel", "top", "l1", "time", "stat_date","l2"]:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)
@@ -85,22 +95,22 @@ def write_csv(df,name,n):
        elif i + n > df.shape[0]:
            temp = df.iloc[i:]
        else:
-            temp = df.loc[i:i + n]
+            temp = df.iloc[i:i + n]
        temp.to_csv(path + name+ "/{}_{}.csv".format(name,i), index=False)
 def get_predict(date,value_map):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
-          "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \
+          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,e.cid_id,cut.time " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
-          "left join diary_feat df on e.cid_id = df.diary_id " \
+          "left join cid_level2 cl on e.cid_id = cl.cid " \
          "left join cid_time_cut cut on e.cid_id = cut.cid"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
-                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
+                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",
-                            11: "device_id", 12: "cid_id", 13: "time"})
+                            12: "device_id", 13: "cid_id", 14: "time"})
    df["stat_date"] = date
@@ -108,18 +118,22 @@ def get_predict(date,value_map):
    print(df.shape)
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "level2_ids", "time", "stat_date"]
+                "channel", "top", "l1", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i
+    df["l2"] = df["l2"].astype("str")
+    df["l2"] = df["l2"].fillna("lost")
+    df["l2"] = df["l2"] + "l1"
    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)
-    for i in features:
+    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
+                "channel", "top", "l1", "time", "stat_date","l2"]:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO 没有覆盖到的类别会处理成na，暂时用0填充，后续完善一下
        native_pre[i] = native_pre[i].fillna(0)

--- a/tensnsorflow/multi_hot.py
+++ b/tensnsorflow/multi_hot.py
@@ -29,7 +29,7 @@ def multi():
    for i in list(df["level"].unique()):
        l = i.split(";")
        l = sorted(l)
-        if len(l) == 3:
+        if len(l) >= 3:
            df.loc[df["level"] == i, ["l1"]] = l[0]
            df.loc[df["level"] == i, ["l2"]] = l[1]
            df.loc[df["level"] == i, ["l3"]] = l[2]