From ef710cfa406176461eb0c58849a631c530a97a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=BD=A6=E9=92=8A?= <zhangyanzhao@igengmei.com> Date: Tue, 11 Dec 2018 17:04:46 +0800 Subject: [PATCH] add data --- tensnsorflow/ffm.py | 61 ++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/tensnsorflow/ffm.py b/tensnsorflow/ffm.py index 0c45e4bf..a7f0316a 100644 --- a/tensnsorflow/ffm.py +++ b/tensnsorflow/ffm.py @@ -20,29 +20,24 @@ def con_sql(db,sql): db.close() return df +def test(): + sql = "select max(update_time) from ffm_diary_queue" + db = pymysql.connect(host='192.168.15.12', port=4000, user='root', db='eagle') + cursor = db.cursor() + cursor.execute(sql) + result = cursor.fetchone()[0] + db.close() + print(result) def get_data(): - db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod') - sql = "select * from esmm_data where stat_date >= '2018-11-20' limit 6" - esmm = con_sql(db,sql) - esmm = esmm.rename(columns={0:"stat_date",1: "device_id",2:"ucity_id",3:"cid_id",4:"diary_service_id",5:"y", - 6:"z",7:"clevel1_id",8:"slevel1_id"}) + db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') + sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data" + df = con_sql(db,sql) + df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id", + 6:"clevel1_id",7:"ccity_name"}) print("esmm data ok") - print(esmm.head()) - print(esmm.shape) - db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle') - sql = "select * from home_tab_click limit 6" - temp = con_sql(db,sql) - temp = temp.rename(columns={0: "device_id"}) - print("click data ok") - # print(temp.head()) - df = pd.merge(esmm,temp,on = "device_id",how='left').fillna(0) - # print("åˆå¹¶åŽï¼š") - print(df.shape) - - df["diary_service_id"] = df["diary_service_id"].astype("str") + print(df.head()) df["clevel1_id"] = df["clevel1_id"].astype("str") - df["slevel1_id"] = df["slevel1_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str") df["y"] = df["y"].astype("str") df["z"] = df["z"].astype("str") @@ -67,9 +62,11 @@ def transform(df): df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",") df["random"] = np.random.randint(1, 2147483647, df.shape[0]) df = df.drop(0, axis=1).drop("seq",axis=1) - print("size") - print(df.shape) print(df.head()) + db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') + sql = "select max(stat_date) from esmm_train_data" + df = con_sql(db, sql)[0].values.tolist()[0] + train = df[df["stat_date"] != "2018-11-25"] train = train.drop("stat_date",axis=1) test = df[df["stat_date"] == "2018-11-25"] @@ -89,6 +86,19 @@ def transform(df): # pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False) # print("insert done") +def get_statistics(): + db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle') + sql = "select device_id,device_type,channel,total,精选,ç›´æ’,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \ + "å¸è„‚,脂肪填充,隆胸,ç§å¯†,毛å‘管ç†,公立,韩国" + df = con_sql(db, sql) + + df = df.rename(columns={0:"device_id",1:"device_type",2:"channel",3:"total"}) + print(df.head()) + for i in df.columns.difference(["device_id", "device_type","channel","total"]): + df[i] = df[i]/df["total"] + print(df.head()) + + class multiFFMFormatPandas: @@ -108,9 +118,10 @@ class multiFFMFormatPandas: if self.feature_index_ is None: self.feature_index_ = dict() - last_idx = 0 for col in df.columns: + self.feature_index_[col] = 1 + last_idx = 1 vals = df[col].unique() for val in vals: if pd.isnull(val): @@ -119,9 +130,6 @@ class multiFFMFormatPandas: if name not in self.feature_index_: self.feature_index_[name] = last_idx last_idx += 1 - self.feature_index_[col] = last_idx - last_idx += 1 - return self def fit_transform(self, df, y=None,n=50000,processes=4): @@ -200,4 +208,5 @@ class multiFFMFormatPandas: if __name__ == "__main__": path = "/home/gmuser/ffm/" - get_data() \ No newline at end of file + # get_data() + get_statistics() -- 2.18.0