将 FFM 转换结果保存到 CSV 文件（替代原先写入 MySQL 表）

8eca148c · 张彦钊 · 280a03ce · 8eca148c
Commit 8eca148c authored Nov 30, 2018 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 42 deletions

ffm.py tensnsorflow/ffm.py +34 -42

No files found.
--- a/tensnsorflow/ffm.py
+++ b/tensnsorflow/ffm.py
@@ -24,16 +24,16 @@ def get_data():
    esmm = esmm.rename(columns={0:"stat_date",1: "device_id",2:"ucity_id",3:"cid_id",4:"diary_service_id",5:"y",
                                6:"z",7:"clevel1_id",8:"slevel1_id"})
    print("esmm data ok")
-    print(esmm.head())
+    print(esmm.shape)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
    sql = "select * from home_tab_click"
    temp = con_sql(db,sql)
    temp = temp.rename(columns={0: "device_id"})
    print("click data ok")
-    print(temp.head())
-    df = pd.merge(esmm,temp,on = "device_id").dropna()
+    # print(temp.head())
+    df = pd.merge(esmm,temp,on = "device_id",how='left').fillna(0)

-    print(df.head())
+    print(df.shape)

    df["diary_service_id"] = df["diary_service_id"].astype("str")
    df["clevel1_id"] = df["clevel1_id"].astype("str")
@@ -46,9 +46,9 @@ def get_data():
    df = df.drop("z", axis=1)
    print(df.head())
    train = df[df["stat_date"] != "2018-11-25"]
-    transform(train,"train")
+    transform(train,"crvtrain.csv")
    test = df[df["stat_date"] == "2018-11-25"]
-    transform(test, "test")
+    transform(test, "crvtest.csv")


 def transform(df,table):
@@ -56,28 +56,30 @@ def transform(df,table):
    df = model.fit_transform(df, y="y", n=50000, processes=20)
    df = pd.DataFrame(df)
    df["device_id"] = df[0].apply(lambda x: x.split(",")[0])
-    df["ucity_id"] = df[0].apply(lambda x: x.split(",")[1])
-    df["cid_id"] = df[0].apply(lambda x: x.split(",")[2])
-    df["y"] = df[0].apply(lambda x: x.split(",")[3])
-    df["ffm"] = df[0].apply(lambda x: x.split(",")[4])
+    df["city_id"] = df[0].apply(lambda x: x.split(",")[1])
+    df["diary_id"] = df[0].apply(lambda x: x.split(",")[2])
+    # df["y"] = df[0].apply(lambda x: x.split(",")[3])
    df["seq"] = list(range(df.shape[0]))
    df["seq"] = df["seq"].astype("str")
-    df["ffm"] = df["seq"].str.cat([df["y"].values.tolist(), df["ffm"].values.tolist()], sep=",")
-    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
-    df = df.drop(0, axis=1)
+    df["ffm"] = df[0].apply(lambda x: x.split(",")[3:])
+    df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
+    df["random"] = np.random.randint(1, 2147483647, df.shape[0])
+    df = df.drop(0, axis=1).drop("seq",axis=1)
    print("size")
    print(df.shape)
-    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
-    for i in range(0,df.shape[0],10000):
-        print(i)
-        if i == 0:
-            temp = df.loc[0:10000]
-        elif i+10000 > df.shape[0]:
-            temp = df.loc[i:]
-        else:
-            temp = df.loc[i+1:i+10000]
-        pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
-        print("insert done")
+    df.to_csv(path+table,index=None)
+    # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
+    # n = 100000
+    # for i in range(0,df.shape[0],n):
+    #     print(i)
+    #     if i == 0:
+    #         temp = df.loc[0:n]
+    #     elif i+n > df.shape[0]:
+    #         temp = df.loc[i+1:]
+    #     else:
+    #         temp = df.loc[i+1:i+n]
+    #     pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
+    #     print("insert done")



@@ -123,11 +125,6 @@ class multiFFMFormatPandas:

    def transform_row_(self, row, t):
        ffm = []
-        if self.y is not None:
-            ffm.append(str(row.loc[row.index == self.y][0]))
-        if self.y is None:
-            ffm.append(str(0))
-
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
@@ -135,7 +132,12 @@ class multiFFMFormatPandas:
                ffm.append('{}:{}:1'.format(self.field_index_[col]+1, self.feature_index_[name]+1))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col]+1, self.feature_index_[col]+1, val))
-        return ' '.join(ffm)
+        result = ' '.join(ffm)
+        if self.y is not None:
+            result = str(row.loc[row.index == self.y][0]) + "," + result
+        if self.y is None:
+            result = str(0) + "," + result
+        return result

    def transform(self, df,n=1500,processes=2):
        # n是每个线程运行最大的数据条数,processes是线程数
@@ -172,18 +174,6 @@ class multiFFMFormatPandas:
                data_list.append(data.iloc[x:data.__len__()])
                break

-        '''
-        # 返回生成器方法，但是本地测试效率不高
-        x = 0
-        while True:
-            if x + step < data.__len__():
-                yield data.iloc[x:x + step]
-                x = x + step + 1
-            else:
-                yield data.iloc[x:data.__len__()]
-                break
-        '''
-
        return data_list

    # 原生转化方法，不需要多进程
@@ -201,4 +191,5 @@ class multiFFMFormatPandas:


 if __name__ == "__main__":
+    path = "/data/ffm/"
    get_data()
\ No newline at end of file