Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

change data dir to /home/gaoyazhe/

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
change data dir to /home/gaoyazhe/
5d5ac927 · 高雅喆 · 7622f47e · eb4e1fe6 · 5d5ac927
Commit 5d5ac927 authored Dec 12, 2018 by 高雅喆
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 51 deletions

ffm.py tensnsorflow/ffm.py +18 -51

No files found.
--- a/tensnsorflow/ffm.py
+++ b/tensnsorflow/ffm.py
@@ -161,54 +161,37 @@ def get_data():
    df = df.drop("z", axis=1)
    df = pd.merge(df,get_statistics(),how='left',on = "device_id").fillna(0)
    df = df.drop("device_id", axis=1)
-    print(df.head())
+    print(df.head(2))
    return df,validate_date,ucity_id,cid


 def transform(a,validate_date):
    model = multiFFMFormatPandas()
-    df = model.fit_transform(a, y="y", n=160000, processes=18)
+    df = model.fit_transform(a, y="y", n=160000, processes=25)
    df = pd.DataFrame(df)
    df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
    df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
    df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
    df["cid"] = df[0].apply(lambda x: x.split(",")[3])
+    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
    df["seq"] = list(range(df.shape[0]))
    df["seq"] = df["seq"].astype("str")
    df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
    df["data"] = df["seq"].str.cat(df["data"], sep=",")
-    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
    df = df.drop([0,"seq"], axis=1)
-    print(df.head())
+    print(df.head(2))

    train = df[df["stat_date"] != validate_date]
    train = train.drop("stat_date",axis=1)
    test = df[df["stat_date"] == validate_date]
    test = test.drop("stat_date",axis=1)
-    print("train shape")
-    print(train.shape)
-
-    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
-    pd.io.sql.to_sql(train, "train_zhao", yconnect, schema='jerry_test', if_exists='replace', index=False)
-    print("train insert done")
-    pd.io.sql.to_sql(test, "test_zhao", yconnect, schema='jerry_test', if_exists='replace', index=False)
-    print("test insert done")
+    # print("train shape")
+    # print(train.shape)
+    train.to_csv(path + "train.csv", sep="\t", index=False)
+    test.to_csv(path + "test.csv", sep="\t", index=False)

    return model

-
-    # n = 100000
-    # for i in range(0,df.shape[0],n):
-    #     print(i)
-    #     if i == 0:
-    #         temp = df.loc[0:n]
-    #     elif i+n > df.shape[0]:
-    #         temp = df.loc[i+1:]
-    #     else:
-    #         temp = df.loc[i+1:i+n]
-    #     pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
-    #     print("insert done")
-
 def get_statistics():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
    sql = "select device_id,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
@@ -222,6 +205,7 @@ def get_statistics():
    df = df.drop("total", axis=1)
    return df

+
 def get_predict_set(ucity_id, cid,model):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name,label from esmm_pre_data"
@@ -245,49 +229,32 @@ def get_predict_set(ucity_id, cid,model):
    print("df ok")
    print(df.shape)
    print(df.head(2))
-    df = model.transform(df,n=160000, processes=18)
+    df = model.transform(df,n=160000, processes=25)
    df = pd.DataFrame(df)
    df["label"] = df[0].apply(lambda x: x.split(",")[0])
    df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
    df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
    df["cid"] = df[0].apply(lambda x: x.split(",")[3])
+    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
    df["seq"] = list(range(df.shape[0]))
    df["seq"] = df["seq"].astype("str")
    df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
    df["data"] = df["seq"].str.cat(df["data"], sep=",")
-    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
    df = df.drop([0, "seq"], axis=1)
    print(df.head())

    native_pre = df[df["label"] == "0"]
    native_pre = native_pre.drop("label", axis=1)
-    print("native_pre shape")
-    print(native_pre.shape)
+    native_pre.to_csv(path+"native_pre.csv",sep="\t",index=False)
+    # print("native_pre shape")
+    # print(native_pre.shape)

    nearby_pre = df[df["label"] == "1"]
    nearby_pre = nearby_pre.drop("label", axis=1)
-    print("nearby_pre shape")
-    print(nearby_pre.shape)
-
-    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
-    pd.io.sql.to_sql(native_pre, "native_zhao", yconnect, schema='jerry_test', if_exists='replace', index=False)
-    print("train insert done")
-    pd.io.sql.to_sql(nearby_pre, "nearby_zhao", yconnect, schema='jerry_test', if_exists='replace', index=False)
-    print("test insert done")
-
-
-    # df = pd.DataFrame(df)
-    # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
-    # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
-    # df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
-    # df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
-    # df["seq"] = list(range(df.shape[0]))
-    # df["seq"] = df["seq"].astype("str")
-    # df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
-    # df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
-    # df["random"] = np.random.randint(1, 2147483647, df.shape[0])
-    # df = df.drop([0,"seq"], axis=1)
-    # print(df.head())
+    nearby_pre.to_csv(path + "nearby_pre.csv", sep="\t", index=False)
+    # print("nearby_pre shape")
+    # print(nearby_pre.shape)
+


 if __name__ == "__main__":