Commit 3b881969 authored by 张彦钊

change transform

parent 8f18a377
...@@ -45,7 +45,8 @@ def get_data(): ...@@ -45,7 +45,8 @@ def get_data():
df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id", df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id",
6:"clevel1_id",7:"ccity_name"}) 6:"clevel1_id",7:"ccity_name"})
print("esmm data ok") print("esmm data ok")
ucity_id = list(set(df["ucity_id"].values.tolist()))
cid = list(set(df["cid_id"].values.tolist()))
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
df["cid_id"] = df["cid_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str")
df["y"] = df["y"].astype("str") df["y"] = df["y"].astype("str")
...@@ -63,16 +64,15 @@ def get_data(): ...@@ -63,16 +64,15 @@ def get_data():
print(df.shape) print(df.shape)
df = df.drop("device_id", axis=1) df = df.drop("device_id", axis=1)
print(df.head()) print(df.head())
transform(df, validate_date) transform(df, validate_date)
return ucity_id,cid
def transform(df,validate_date): def transform(df,validate_date):
model = multiFFMFormatPandas() model = multiFFMFormatPandas()
for i in [200000,160000,130000]: temp = model.fit_transform(df, y="y", n=160000, processes=18)
a = time.time()
temp = model.fit_transform(df, y="y", n=i, processes=18)
b = time.time()
print("{}cost{}".format(i,b - a))
# df = pd.DataFrame(df) # df = pd.DataFrame(df)
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
# df["device_id"] = df[0].apply(lambda x: x.split(",")[1]) # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
...@@ -124,11 +124,43 @@ def get_statistics(): ...@@ -124,11 +124,43 @@ def get_statistics():
def get_predict_set(): def get_predict_set():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_pre_data where label = 0" sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name,label from esmm_pre_data"
native_pre = con_sql(db, sql) df = con_sql(db, sql)
native_pre = native_pre.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id", df = df.rename(columns={0: "device_id", 1: "y", 2: "z", 3: "stat_date", 4: "ucity_id", 5: "cid_id",
6: "clevel1_id", 7: "ccity_name"}) 6: "clevel1_id", 7: "ccity_name",8:"label"})
print("native_pre ok") print("native_pre ok"+df.shape)
# df["clevel1_id"] = df["clevel1_id"].astype("str")
# df["cid_id"] = df["cid_id"].astype("str")
# df["y"] = df["y"].astype("str")
# df["z"] = df["z"].astype("str")
# df["y"] = df["label"].str.cat(
# [df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
# df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
# df = df.drop(["z","label"], axis=1)
device = tuple(set(df["device_id"].values.tolist()))
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
sql = "select device_id,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
"吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click where device_id in {}".format(device)
statics = con_sql(db, sql)
native_pre = pd.merge(df, statics, how='left').fillna(0)
print("native_pre ok" + native_pre.shape)
# df = pd.DataFrame(df)
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
# df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
# df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
# df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
# df["seq"] = list(range(df.shape[0]))
# df["seq"] = df["seq"].astype("str")
# df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
# df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
# df["random"] = np.random.randint(1, 2147483647, df.shape[0])
# df = df.drop([0,"seq"], axis=1)
# print(df.head())
...@@ -239,6 +271,7 @@ class multiFFMFormatPandas: ...@@ -239,6 +271,7 @@ class multiFFMFormatPandas:
if __name__ == "__main__": if __name__ == "__main__":
path = "/home/gmuser/ffm/" path = "/home/gmuser/ffm/"
get_data() # get_data()
get_predict_set()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment