Commit 77f1ce57 authored by 张彦钊

add shape

parent e3f29305
...@@ -4,6 +4,7 @@ import pymysql ...@@ -4,6 +4,7 @@ import pymysql
import pandas as pd import pandas as pd
from multiprocessing import Pool from multiprocessing import Pool
import numpy as np import numpy as np
import datetime
from sqlalchemy import create_engine from sqlalchemy import create_engine
...@@ -20,31 +21,37 @@ def con_sql(db,sql): ...@@ -20,31 +21,37 @@ def con_sql(db,sql):
db.close() db.close()
return df return df
def test(): # def test():
sql = "select max(update_time) from ffm_diary_queue" # sql = "select max(update_time) from ffm_diary_queue"
db = pymysql.connect(host='192.168.15.12', port=4000, user='root', db='eagle') # db = pymysql.connect(host='192.168.15.12', port=4000, user='root', db='eagle')
cursor = db.cursor() # cursor = db.cursor()
cursor.execute(sql) # cursor.execute(sql)
result = cursor.fetchone()[0] # result = cursor.fetchone()[0]
db.close() # db.close()
print(result) # print(result)
def get_data(): def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data" sql = "select max(stat_date) from esmm_train_data"
validate_date = con_sql(db, sql)[0].values.tolist()[0]
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \
"where stat_date >= '{}'".format(start)
df = con_sql(db,sql) df = con_sql(db,sql)
df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id", df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id",
6:"clevel1_id",7:"ccity_name"}) 6:"clevel1_id",7:"ccity_name"})
print("esmm data ok") print("esmm data ok")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["cid_id"] = df["cid_id"].astype("str")
df["y"] = df["y"].astype("str")
df["z"] = df["z"].astype("str")
df["y"] = df["stat_date"].str.cat([df["device_id"].values.tolist(),df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
df["y"].values.tolist(),df["z"].values.tolist()], sep=",")
df = df.drop("z", axis=1)
print(df.head()) print(df.head())
# df["clevel1_id"] = df["clevel1_id"].astype("str")
# df["cid_id"] = df["cid_id"].astype("str")
# df["y"] = df["y"].astype("str")
# df["z"] = df["z"].astype("str")
# df["y"] = df["stat_date"].str.cat([df["device_id"].values.tolist(),df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
# df["y"].values.tolist(),df["z"].values.tolist()], sep=",")
# df = df.drop("z", axis=1)
# print(df.head())
print("shape") print("shape")
print(df.shape) print(df.shape)
df = pd.merge(df,get_statistics(),how='left').fillna(0) df = pd.merge(df,get_statistics(),how='left').fillna(0)
...@@ -52,13 +59,14 @@ def get_data(): ...@@ -52,13 +59,14 @@ def get_data():
print(df.head()) print(df.head())
print("shape") print("shape")
print(df.shape) print(df.shape)
# df = df.drop("device_id", axis=1) df = df.drop("device_id", axis=1)
# transform(df) print(df.head())
transform(df,)
def transform(df): def transform(df,validate_date):
model = multiFFMFormatPandas() model = multiFFMFormatPandas()
df = model.fit_transform(df, y="y", n=80000, processes=10) df = model.fit_transform(df, y="y", n=80000, processes=18)
df = pd.DataFrame(df) df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1]) df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
...@@ -69,16 +77,17 @@ def transform(df): ...@@ -69,16 +77,17 @@ def transform(df):
df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:])) df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",") df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0]) df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop(0, axis=1).drop("seq",axis=1) df = df.drop([0,"seq"], axis=1)
print(df.head()) print(df.head())
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data"
df = con_sql(db, sql)[0].values.tolist()[0]
train = df[df["stat_date"] != "2018-11-25"] train = df[df["stat_date"] != validate_date]
train = train.drop("stat_date",axis=1) train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == "2018-11-25"] print("train shape")
print(train.shape)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1) test = test.drop("stat_date",axis=1)
print("test shape")
print(test.shape)
train.to_csv(path+"train.csv",index=None) train.to_csv(path+"train.csv",index=None)
test.to_csv(path + "test.csv", index=None) test.to_csv(path + "test.csv", index=None)
# yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
...@@ -99,13 +108,12 @@ def get_statistics(): ...@@ -99,13 +108,12 @@ def get_statistics():
sql = "select device_id,device_type,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \ sql = "select device_id,device_type,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
"吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click" "吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click"
df = con_sql(db, sql).drop_duplicates() df = con_sql(db, sql).drop_duplicates()
df = df.rename(columns={0:"device_id",1:"device_type",2:"total"}) df = df.rename(columns={0:"device_id",1:"os",2:"total"})
# for i in df.columns.difference(["device_id", "device_type","channel","total"]): for i in df.columns.difference(["device_id", "os","total"]):
# df[i] = df[i]/df["total"] df[i] = df[i]/df["total"]
# df = df.drop("total", axis=1) df = df.drop("total", axis=1)
return df return df
class multiFFMFormatPandas: class multiFFMFormatPandas:
def __init__(self): def __init__(self):
self.field_index_ = None self.field_index_ = None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment