change transform

3b881969 · 张彦钊 · 8f18a377 · 3b881969
Commit 3b881969 authored Dec 12, 2018 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 45 additions and 12 deletions

ffm.py tensnsorflow/ffm.py +45 -12

No files found.
--- a/tensnsorflow/ffm.py
+++ b/tensnsorflow/ffm.py
@@ -45,7 +45,8 @@ def get_data():
    df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id",
                                6:"clevel1_id",7:"ccity_name"})
    print("esmm data ok")
+    ucity_id = list(set(df["ucity_id"].values.tolist()))
+    cid = list(set(df["cid_id"].values.tolist()))
    df["clevel1_id"] = df["clevel1_id"].astype("str")
    df["cid_id"] = df["cid_id"].astype("str")
    df["y"] = df["y"].astype("str")
@@ -63,16 +64,15 @@ def get_data():
    print(df.shape)
    df = df.drop("device_id", axis=1)
    print(df.head())
    transform(df, validate_date)
+    return ucity_id,cid
 def transform(df,validate_date):
    model = multiFFMFormatPandas()
-    for i in [200000,160000,130000]:
+    temp = model.fit_transform(df, y="y", n=160000, processes=18)
-        a = time.time()
-        temp = model.fit_transform(df, y="y", n=i, processes=18)
-        b = time.time()
-        print("{}cost{}".format(i,b - a))
    # df = pd.DataFrame(df)
    # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
    # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
@@ -124,11 +124,43 @@ def get_statistics():
 def get_predict_set():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_pre_data where label = 0"
+    sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name,label from esmm_pre_data"
-    native_pre = con_sql(db, sql)
+    df = con_sql(db, sql)
-    native_pre = native_pre.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id",
+    df = df.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id",
-                            6: "clevel1_id", 7: "ccity_name"})
+                            6: "clevel1_id", 7: "ccity_name",8:"label"})
-    print("native_pre ok")
+    print("native_pre ok"+df.shape)
+    # df["clevel1_id"] = df["clevel1_id"].astype("str")
+    # df["cid_id"] = df["cid_id"].astype("str")
+    # df["y"] = df["y"].astype("str")
+    # df["z"] = df["z"].astype("str")
+    # df["y"] = df["label"].str.cat(
+    #     [df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
+    #      df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
+    # df = df.drop(["z","label"], axis=1)
+    device = tuple(set(df["device_id"].values.tolist()))
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
+    sql = "select device_id,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
+          "吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click where device_id in {}".format(device)
+    statics = con_sql(db, sql)
+    native_pre = pd.merge(df, statics, how='left').fillna(0)
+    print("native_pre ok" + native_pre.shape)
+    # df = pd.DataFrame(df)
+    # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
+    # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
+    # df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
+    # df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
+    # df["seq"] = list(range(df.shape[0]))
+    # df["seq"] = df["seq"].astype("str")
+    # df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
+    # df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
+    # df["random"] = np.random.randint(1, 2147483647, df.shape[0])
+    # df = df.drop([0,"seq"], axis=1)
+    # print(df.head())
@@ -239,6 +271,7 @@ class multiFFMFormatPandas:
 if __name__ == "__main__":
    path = "/home/gmuser/ffm/"
-    get_data()
+    # get_data()
+    get_predict_set()