Commit 1ea6364d authored by 张彦钊

add date

parent 88db6d8d
...@@ -166,8 +166,6 @@ def get_data(): ...@@ -166,8 +166,6 @@ def get_data():
print(df.shape) print(df.shape)
print("after") print("after")
df = df.drop_duplicates() df = df.drop_duplicates()
df = df.drop_duplicates(["ucity_id","clevel1_id", "ccity_name","device_type","manufacturer",
"channel","top","level2_ids","time"])
print(df.shape) print(df.shape)
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
...@@ -177,12 +175,12 @@ def get_data(): ...@@ -177,12 +175,12 @@ def get_data():
df["time"] = df["time"].astype("str") df["time"] = df["time"].astype("str")
df["y"] = df["stat_date"].str.cat([df["device_id"].values.tolist(),df["y"].values.tolist(),df["z"].values.tolist()], sep=",") df["y"] = df["stat_date"].str.cat([df["device_id"].values.tolist(),df["y"].values.tolist(),df["z"].values.tolist()], sep=",")
df = df.drop(["z","stat_date","device_id"], axis=1) df = df.drop(["z","device_id"], axis=1)
df = df.fillna("na") df = df.fillna("na")
print(df.head(2)) print(df.head(2))
print(df.count())
features = 0 features = 0
l = ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel","level2_ids","top","time"] l = ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel","level2_ids","top","time","stat_date"]
for i in l: for i in l:
features = features + len(df[i].unique()) features = features + len(df[i].unique())
print("fields:{}".format(df.shape[1]-1)) print("fields:{}".format(df.shape[1]-1))
...@@ -224,7 +222,7 @@ def transform(a,validate_date): ...@@ -224,7 +222,7 @@ def transform(a,validate_date):
return model return model
def get_predict_set(model,columns): def get_predict_set(model,columns,date):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \ "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \
...@@ -237,6 +235,7 @@ def get_predict_set(model,columns): ...@@ -237,6 +235,7 @@ def get_predict_set(model,columns):
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids", 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
11:"device_id",12:"cid_id",13:"time"}) 11:"device_id",12:"cid_id",13:"time"})
df["stat_date"] = date
print(df.shape) print(df.shape)
for i in columns.keys(): for i in columns.keys():
df.loc[~df[i].isin(columns[i]), [i]] = "na" df.loc[~df[i].isin(columns[i]), [i]] = "na"
...@@ -291,7 +290,7 @@ if __name__ == "__main__": ...@@ -291,7 +290,7 @@ if __name__ == "__main__":
a = time.time() a = time.time()
temp, validate_date, column_map = get_data() temp, validate_date, column_map = get_data()
model = transform(temp, validate_date) model = transform(temp, validate_date)
get_predict_set(model,column_map) get_predict_set(model,column_map,validate_date)
b = time.time() b = time.time()
print("cost(分钟)") print("cost(分钟)")
print((b-a)/60) print((b-a)/60)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment