Add date: keep stat_date as a feature column and pass validate_date to get_predict_set

1ea6364d · 张彦钊 · 88db6d8d · 1ea6364d
Commit 1ea6364d authored Jan 15, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 7 deletions

data2ffm.py eda/esmm/Feature_pipline/data2ffm.py +6 -7

No files found.
--- a/eda/esmm/Feature_pipline/data2ffm.py
+++ b/eda/esmm/Feature_pipline/data2ffm.py
@@ -166,8 +166,6 @@ def get_data():
    print(df.shape)
    print("after")
    df = df.drop_duplicates()
-    df = df.drop_duplicates(["ucity_id","clevel1_id", "ccity_name","device_type","manufacturer",
-                             "channel","top","level2_ids","time"])
    print(df.shape)
    df["clevel1_id"] = df["clevel1_id"].astype("str")
@@ -177,12 +175,12 @@ def get_data():
    df["time"] = df["time"].astype("str")
    df["y"] = df["stat_date"].str.cat([df["device_id"].values.tolist(),df["y"].values.tolist(),df["z"].values.tolist()], sep=",")
-    df = df.drop(["z","stat_date","device_id"], axis=1)
+    df = df.drop(["z","device_id"], axis=1)
    df = df.fillna("na")
    print(df.head(2))
-    print(df.count())
    features = 0
-    l = ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel","level2_ids","top","time"]
+    l = ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel","level2_ids","top","time","stat_date"]
    for i in l:
        features = features + len(df[i].unique())
    print("fields:{}".format(df.shape[1]-1))
@@ -224,7 +222,7 @@ def transform(a,validate_date):
    return model
-def get_predict_set(model,columns):
+def get_predict_set(model,columns,date):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \
@@ -237,6 +235,7 @@ def get_predict_set(model,columns):
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
                            11:"device_id",12:"cid_id",13:"time"})
+    df["stat_date"] = date
    print(df.shape)
    for i in columns.keys():
        df.loc[~df[i].isin(columns[i]), [i]] = "na"
@@ -291,7 +290,7 @@ if __name__ == "__main__":
    a = time.time()
    temp, validate_date, column_map = get_data()
    model = transform(temp, validate_date)
-    get_predict_set(model,column_map)
+    get_predict_set(model,column_map,validate_date)
    b = time.time()
    print("cost(分钟)")
    print((b-a)/60)