Commit 12be6962 authored by 张彦钊

add print

parent f6badd3c
...@@ -98,12 +98,7 @@ class multiFFMFormatPandas: ...@@ -98,12 +98,7 @@ class multiFFMFormatPandas:
result_map = {} result_map = {}
for i in data_list: for i in data_list:
print("before:total")
print(len(result_map))
print(len(i.get()))
result_map.update(i.get()) result_map.update(i.get())
print("result_map")
print(len(result_map))
pool.close() pool.close()
pool.join() pool.join()
...@@ -120,10 +115,10 @@ class multiFFMFormatPandas: ...@@ -120,10 +115,10 @@ class multiFFMFormatPandas:
x = 0 x = 0
while True: while True:
if x + step < data.__len__(): if x + step < data.__len__():
data_list.append(data.loc[x:x + step]) data_list.append(data.iloc[x:x + step])
x = x + step + 1 x = x + step
else: else:
data_list.append(data.loc[x:data.__len__()]) data_list.append(data.iloc[x:data.__len__()])
break break
return data_list return data_list
...@@ -179,6 +174,8 @@ def get_data(): ...@@ -179,6 +174,8 @@ def get_data():
ucity_id = list(set(df["ucity_id"].values.tolist())) ucity_id = list(set(df["ucity_id"].values.tolist()))
manufacturer = list(set(df["manufacturer"].values.tolist())) manufacturer = list(set(df["manufacturer"].values.tolist()))
channel = list(set(df["channel"].values.tolist())) channel = list(set(df["channel"].values.tolist()))
print("before transform")
print(df.shape)
return df,validate_date,ucity_id,ccity_name,manufacturer,channel return df,validate_date,ucity_id,ccity_name,manufacturer,channel
...@@ -187,22 +184,24 @@ def transform(a,validate_date): ...@@ -187,22 +184,24 @@ def transform(a,validate_date):
model = multiFFMFormatPandas() model = multiFFMFormatPandas()
df = model.fit_transform(a, y="y", n=160000, processes=22) df = model.fit_transform(a, y="y", n=160000, processes=22)
df = pd.DataFrame(df) df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) print("after transform")
df["device_id"] = df[0].apply(lambda x: x.split(",")[1]) print(df.shape)
df["y"] = df[0].apply(lambda x: x.split(",")[2]) # df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["z"] = df[0].apply(lambda x: x.split(",")[3]) # df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["number"] = np.random.randint(1, 2147483647, df.shape[0]) # df["y"] = df[0].apply(lambda x: x.split(",")[2])
df["seq"] = list(range(df.shape[0])) # df["z"] = df[0].apply(lambda x: x.split(",")[3])
df["seq"] = df["seq"].astype("str") # df["number"] = np.random.randint(1, 2147483647, df.shape[0])
df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[2:])) # df["seq"] = list(range(df.shape[0]))
df["data"] = df["seq"].str.cat(df["data"], sep=",") # df["seq"] = df["seq"].astype("str")
df = df.drop([0,"seq"], axis=1) # df["data"] = df[0].apply(lambda x: ",".join(x.split(",")[2:]))
print(df.head(2)) # df["data"] = df["seq"].str.cat(df["data"], sep=",")
# df = df.drop([0,"seq"], axis=1)
train = df[df["stat_date"] != validate_date] # print(df.head(2))
train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == validate_date] # train = df[df["stat_date"] != validate_date]
test = test.drop("stat_date",axis=1) # train = train.drop("stat_date",axis=1)
# test = df[df["stat_date"] == validate_date]
# test = test.drop("stat_date",axis=1)
# print("train shape") # print("train shape")
# print(train.shape) # print(train.shape)
# train.to_csv(path + "tr.csv", sep="\t", index=False) # train.to_csv(path + "tr.csv", sep="\t", index=False)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment