change transform: time fit_transform for n = 80000..190000 (step 10000) and widen the data window from 3 to 15 days

9ba791f3 · 张彦钊 · 9d70d94d · 9ba791f3
Commit 9ba791f3 authored Dec 12, 2018 by 张彦钊
Show whitespace changes
Inline Side-by-side

Showing with 30 additions and 25 deletions

ffm.py tensnsorflow/ffm.py +30 -25

No files found.
--- a/tensnsorflow/ffm.py
+++ b/tensnsorflow/ffm.py
@@ -5,6 +5,7 @@ import pandas as pd
 from multiprocessing import Pool
 import numpy as np
 import datetime
+import time
 from sqlalchemy import create_engine


@@ -36,7 +37,7 @@ def get_data():
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:"+validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
-    start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
+    start = (temp - datetime.timedelta(days=15)).strftime("%Y-%m-%d")
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \
          "where stat_date >= '{}'".format(start)
@@ -67,30 +68,34 @@ def get_data():

 def transform(df,validate_date):
    model = multiFFMFormatPandas()
-    df = model.fit_transform(df, y="y", n=100000, processes=18)
-    df = pd.DataFrame(df)
-    df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
-    df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
-    df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
-    df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
-    df["seq"] = list(range(df.shape[0]))
-    df["seq"] = df["seq"].astype("str")
-    df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
-    df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
-    df["random"] = np.random.randint(1, 2147483647, df.shape[0])
-    df = df.drop([0,"seq"], axis=1)
-    print(df.head())
-
-    train = df[df["stat_date"] != validate_date]
-    train = train.drop("stat_date",axis=1)
-    print("train shape")
-    print(train.shape)
-    test = df[df["stat_date"] == validate_date]
-    test = test.drop("stat_date",axis=1)
-    print("test shape")
-    print(test.shape)
-    train.to_csv(path+"train.csv",index=None)
-    test.to_csv(path + "test.csv", index=None)
+    for i in range(80000,200000,10000):
+        a = time.time()
+        df = model.fit_transform(df, y="y", n=i, processes=18)
+        b = time.time()
+        print("{}cost{}".format(i,b - a))
+    # df = pd.DataFrame(df)
+    # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
+    # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
+    # df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
+    # df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
+    # df["seq"] = list(range(df.shape[0]))
+    # df["seq"] = df["seq"].astype("str")
+    # df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
+    # df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
+    # df["random"] = np.random.randint(1, 2147483647, df.shape[0])
+    # df = df.drop([0,"seq"], axis=1)
+    # print(df.head())
+    #
+    # train = df[df["stat_date"] != validate_date]
+    # train = train.drop("stat_date",axis=1)
+    # print("train shape")
+    # print(train.shape)
+    # test = df[df["stat_date"] == validate_date]
+    # test = test.drop("stat_date",axis=1)
+    # print("test shape")
+    # print(test.shape)
+    # train.to_csv(path+"train.csv",index=None)
+    # test.to_csv(path + "test.csv", index=None)
    # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    # n = 100000
    # for i in range(0,df.shape[0],n):