Commit b0fa9e51 authored by 张彦钊

pandas 映射

parent 6908cb9b
......@@ -57,18 +57,64 @@ def get_data():
print(df.head(2))
value_map = {v: k for k, v in enumerate(unique_values)}
df = df.drop("device_id", axis=1)
train = df[df["stat_date"] != validate_date]
test = df[df["stat_date"] == validate_date]
for i in features:
df[i] = df[i].map(value_map)
print("类别总数")
print(len(value_map))
print(df.head(2))
train[i] = train[i].map(value_map)
test[i] = test[i].map(value_map)
train.to_csv(path + "tr.csv", index=False)
test.to_csv(path + "va.csv", index=False)
return validate_date,value_map
def get_predict(date,value_map):
    """Write the id-encoded "native" and "nearby" prediction sets to CSV.

    Queries ``esmm_pre_data`` left-joined with the user, category, diary and
    time-cut tables, sets the ``stat_date`` column of every row to ``date``,
    encodes the categorical columns through ``value_map`` and writes the rows
    whose label equals "0" to ``native.csv`` and the rows whose label equals
    "1" to ``nearby.csv`` under the module-level ``path``.

    Args:
        date: value assigned to the ``stat_date`` column of every row.
        value_map: lookup from "<value><column name>" strings to ids; entries
            it does not contain come out of ``Series.map`` as NaN.

    Returns:
        None. The results are the two CSV files and the printed shapes.
    """
    # NOTE(review): connection credentials are hard-coded; consider moving
    # them into configuration.
    conn = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    query = (
        "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name,"
        "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time "
        "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id "
        "left join cid_type_top c on e.device_id = c.device_id "
        "left join diary_feat df on e.cid_id = df.diary_id "
        "left join cid_time_cut cut on e.cid_id = cut.cid"
    )
    # presumably con_sql returns a DataFrame with positional (0..13) column
    # labels in select order -- verify against its definition.
    frame = con_sql(conn, query)

    selected_columns = ["y", "z", "label", "ucity_id", "clevel1_id", "ccity_name",
                        "device_type", "manufacturer", "channel", "top", "level2_ids",
                        "device_id", "cid_id", "time"]
    frame = frame.rename(columns=dict(enumerate(selected_columns)))
    frame["stat_date"] = date

    categorical = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                   "channel", "top", "level2_ids", "time", "stat_date"]
    for col in categorical:
        # Suffix each value with its column name so equal raw values from
        # different columns become distinct lookup keys.
        # NOTE(review): fillna runs after the string cast, so missing values
        # have probably already become strings and "lost" is never applied --
        # confirm, and keep in step with how value_map was built.
        frame[col] = frame[col].astype("str").fillna("lost") + col

    # Split on the label column, which is no longer needed afterwards.
    native_pre = frame[frame["label"] == "0"].drop("label", axis=1)
    nearby_pre = frame[frame["label"] == "1"].drop("label", axis=1)
    for col in categorical:
        native_pre[col] = native_pre[col].map(value_map)
        nearby_pre[col] = nearby_pre[col].map(value_map)

    print("native")
    print(native_pre.shape)
    native_pre.to_csv(path + "native.csv", index=False)

    print("nearby")
    print(nearby_pre.shape)
    nearby_pre.to_csv(path + "nearby.csv",index=False)
if __name__ == '__main__':
    # Output directory. get_data() and get_predict() read this module-level
    # name when writing their CSV files, so it has to be assigned before
    # either of them runs.
    path = "/home/gmuser/esmm_data/"
    # get_data() is called exactly once: it returns the validation date and
    # the value -> id mapping that get_predict() needs. The earlier bare
    # get_data() call ran before `path` was assigned and threw its result away.
    date, value = get_data()
    get_predict(date, value)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment