Commit 9ba791f3 authored by 张彦钊

change transform

parent 9d70d94d
......@@ -5,6 +5,7 @@ import pandas as pd
from multiprocessing import Pool
import numpy as np
import datetime
import time
from sqlalchemy import create_engine
......@@ -36,7 +37,7 @@ def get_data():
validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:"+validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
start = (temp - datetime.timedelta(days=15)).strftime("%Y-%m-%d")
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \
"where stat_date >= '{}'".format(start)
......@@ -67,30 +68,34 @@ def get_data():
def transform(df,validate_date):
model = multiFFMFormatPandas()
df = model.fit_transform(df, y="y", n=100000, processes=18)
df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str")
df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop([0,"seq"], axis=1)
print(df.head())
train = df[df["stat_date"] != validate_date]
train = train.drop("stat_date",axis=1)
print("train shape")
print(train.shape)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1)
print("test shape")
print(test.shape)
train.to_csv(path+"train.csv",index=None)
test.to_csv(path + "test.csv", index=None)
for i in range(80000,200000,10000):
a = time.time()
df = model.fit_transform(df, y="y", n=i, processes=18)
b = time.time()
print("{}cost{}".format(i,b - a))
# df = pd.DataFrame(df)
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
# df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
# df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
# df["diary_id"] = df[0].apply(lambda x: x.split(",")[3])
# df["seq"] = list(range(df.shape[0]))
# df["seq"] = df["seq"].astype("str")
# df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
# df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
# df["random"] = np.random.randint(1, 2147483647, df.shape[0])
# df = df.drop([0,"seq"], axis=1)
# print(df.head())
#
# train = df[df["stat_date"] != validate_date]
# train = train.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
# test = df[df["stat_date"] == validate_date]
# test = test.drop("stat_date",axis=1)
# print("test shape")
# print(test.shape)
# train.to_csv(path+"train.csv",index=None)
# test.to_csv(path + "test.csv", index=None)
# yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
# n = 100000
# for i in range(0,df.shape[0],n):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment