Commit 12be6962 authored by 张彦钊

add print

parent f6badd3c
......@@ -98,12 +98,7 @@ class multiFFMFormatPandas:
result_map = {}
for i in data_list:
print("before:total")
print(len(result_map))
print(len(i.get()))
result_map.update(i.get())
print("result_map")
print(len(result_map))
pool.close()
pool.join()
......@@ -120,10 +115,10 @@ class multiFFMFormatPandas:
x = 0
while True:
if x + step < data.__len__():
data_list.append(data.loc[x:x + step])
x = x + step + 1
data_list.append(data.iloc[x:x + step])
x = x + step
else:
data_list.append(data.loc[x:data.__len__()])
data_list.append(data.iloc[x:data.__len__()])
break
return data_list
......@@ -179,6 +174,8 @@ def get_data():
ucity_id = list(set(df["ucity_id"].values.tolist()))
manufacturer = list(set(df["manufacturer"].values.tolist()))
channel = list(set(df["channel"].values.tolist()))
print("before transform")
print(df.shape)
return df,validate_date,ucity_id,ccity_name,manufacturer,channel
......@@ -187,22 +184,24 @@ def transform(a,validate_date):
model = multiFFMFormatPandas()
df = model.fit_transform(a, y="y", n=160000, processes=22)
df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["y"] = df[0].apply(lambda x: x.split(",")[2])
df["z"] = df[0].apply(lambda x: x.split(",")[3])
df["number"] = np.random.randint(1, 2147483647, df.shape[0])
df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str")
df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[2:]))
df["data"] = df["seq"].str.cat(df["data"], sep=",")
df = df.drop([0,"seq"], axis=1)
print(df.head(2))
train = df[df["stat_date"] != validate_date]
train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1)
print("after transform")
print(df.shape)
# df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
# df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
# df["y"] = df[0].apply(lambda x: x.split(",")[2])
# df["z"] = df[0].apply(lambda x: x.split(",")[3])
# df["number"] = np.random.randint(1, 2147483647, df.shape[0])
# df["seq"] = list(range(df.shape[0]))
# df["seq"] = df["seq"].astype("str")
# df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[2:]))
# df["data"] = df["seq"].str.cat(df["data"], sep=",")
# df = df.drop([0,"seq"], axis=1)
# print(df.head(2))
# train = df[df["stat_date"] != validate_date]
# train = train.drop("stat_date",axis=1)
# test = df[df["stat_date"] == validate_date]
# test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
# train.to_csv(path + "tr.csv", sep="\t", index=False)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment