Commit be9387e7 authored by 张彦钊

change transform

parent e45c3070
...@@ -143,7 +143,7 @@ def get_data(): ...@@ -143,7 +143,7 @@ def get_data():
validate_date = con_sql(db, sql)[0].values.tolist()[0] validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:"+validate_date) print("validate_date:"+validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d") temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=15)).strftime("%Y-%m-%d") start = (temp - datetime.timedelta(days=5)).strftime("%Y-%m-%d")
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \ sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \
"where stat_date >= '{}'".format(start) "where stat_date >= '{}'".format(start)
...@@ -163,45 +163,44 @@ def get_data(): ...@@ -163,45 +163,44 @@ def get_data():
print(df.head(2)) print(df.head(2))
print("shape") print("shape")
print(df.shape) print(df.shape)
df = pd.merge(df,get_statistics(),how='left').fillna(0) df = pd.merge(df,get_statistics(),how='left',on = "device_id").fillna(0)
print("merge") print("merge")
print(df.head()) # print(df.head())
print("shape") print("shape")
print(df.shape) print(df.shape)
df = df.drop("device_id", axis=1) df = df.drop("device_id", axis=1)
print(df.head()) print(df.head())
return df,validate_date,ucity_id,cid
transform(df, validate_date)
return ucity_id,cid
def transform(a,validate_date):
def transform(df,validate_date):
model = multiFFMFormatPandas() model = multiFFMFormatPandas()
temp = model.fit_transform(df, y="y", n=160000, processes=18) df = model.fit_transform(a, y="y", n=160000, processes=18)
df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str")
df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop([0,"seq"], axis=1)
print(df.head())
# df = pd.DataFrame(df) train = df[df["stat_date"] != validate_date]
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) train = train.drop("stat_date",axis=1)
# df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
# df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
# df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
# df["seq"] = list(range(df.shape[0]))
# df["seq"] = df["seq"].astype("str")
# df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
# df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
# df["random"] = np.random.randint(1, 2147483647, df.shape[0])
# df = df.drop([0,"seq"], axis=1)
# print(df.head())
#
# train = df[df["stat_date"] != validate_date]
# train = train.drop("stat_date",axis=1)
# print("train shape") # print("train shape")
# print(train.shape) # print(train.shape)
# test = df[df["stat_date"] == validate_date] test = df[df["stat_date"] == validate_date]
# test = test.drop("stat_date",axis=1) test = test.drop("stat_date",axis=1)
# print("test shape") # print("test shape")
# print(test.shape) # print(test.shape)
# train.to_csv(path+"train.csv",index=None) # train.to_csv(path+"train.csv",index=None)
# test.to_csv(path + "test.csv", index=None) # test.to_csv(path + "test.csv", index=None)
return model
# yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
# n = 100000 # n = 100000
# for i in range(0,df.shape[0],n): # for i in range(0,df.shape[0],n):
...@@ -217,41 +216,66 @@ def transform(df,validate_date): ...@@ -217,41 +216,66 @@ def transform(df,validate_date):
def get_statistics(): def get_statistics():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
sql = "select device_id,device_type,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \ sql = "select device_id,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
"吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click" "吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click"
df = con_sql(db, sql) df = con_sql(db, sql)
df = df.rename(columns={0:"device_id",1:"os",2:"total"}) df = df.rename(columns={0:"device_id",1:"total"})
for i in df.columns.difference(["device_id", "os","total"]): for i in df.columns.difference(["device_id","total"]):
df[i] = df[i]/df["total"] df[i] = df[i]/df["total"]
df[i] = df[i].apply(lambda x: format(x,".4f")) df[i] = df[i].apply(lambda x: format(x,".4f"))
df[i] = df[i].astype("float") df[i] = df[i].astype("float")
df = df.drop("total", axis=1) df = df.drop("total", axis=1)
return df return df
def get_predict_set(): def get_predict_set(ucity_id, cid,model):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name,label from esmm_pre_data" sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name,label from esmm_pre_data"
df = con_sql(db, sql) df = con_sql(db, sql)
df = df.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id", df = df.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id",
6: "clevel1_id", 7: "ccity_name",8:"label"}) 6: "clevel1_id", 7: "ccity_name",8:"label"})
print("native_pre ok") print("df ok")
df = df[df["cid_id"].isin(cid)]
df = df[df["ucity_id"].isin(ucity_id)]
print(df.shape) print(df.shape)
# df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
# df["cid_id"] = df["cid_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str")
# df["y"] = df["y"].astype("str") df["y"] = df["y"].astype("str")
# df["z"] = df["z"].astype("str") df["z"] = df["z"].astype("str")
# df["y"] = df["label"].str.cat( df["y"] = df["label"].str.cat(
# [df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(), [df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
# df["y"].values.tolist(), df["z"].values.tolist()], sep=",") df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
# df = df.drop(["z","label"], axis=1) df = df.drop(["z","label"], axis=1)
device = tuple(set(df["device_id"].values.tolist())) df = pd.merge(df, get_statistics(), how='left',on = "device_id").fillna(0)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle') df = df.drop("device_id", axis=1)
sql = "select device_id,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \ print("df ok")
"吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click where device_id in {}".format(device) print(df.shape)
statics = con_sql(db, sql) print(df.head(2))
native_pre = pd.merge(df, statics, how='left').fillna(0) df = model.transform(df,n=160000, processes=18)
print("native_pre ok") df = pd.DataFrame(df)
df["label"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str")
df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop([0, "seq"], axis=1)
print(df.head())
native_pre = df[df["label"] == 0]
native_pre = native_pre.drop("label", axis=1)
print("native_pre shape")
print(native_pre.shape) print(native_pre.shape)
native_pre.to_csv(path + "native_pre.csv", index=None)
nearby_pre = df[df["label"] == 1]
nearby_pre = nearby_pre.drop("label", axis=1)
print("nearby_pre shape")
print(nearby_pre.shape)
nearby_pre.to_csv(path + "nearby_pre.csv", index=None)
# df = pd.DataFrame(df) # df = pd.DataFrame(df)
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
...@@ -269,7 +293,8 @@ def get_predict_set(): ...@@ -269,7 +293,8 @@ def get_predict_set():
if __name__ == "__main__": if __name__ == "__main__":
path = "/home/gmuser/ffm/" path = "/home/gmuser/ffm/"
# get_data() df, validate_date, ucity_id, cid = get_data()
get_predict_set() model = transform(df, validate_date)
get_predict_set(ucity_id, cid,model)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment