Commit 2ff68205 authored by 张彦钊

multi hot insert database

parent f45e1013
...@@ -2,6 +2,7 @@ import pandas as pd ...@@ -2,6 +2,7 @@ import pandas as pd
import pymysql import pymysql
import datetime import datetime
def con_sql(db,sql): def con_sql(db,sql):
cursor = db.cursor() cursor = db.cursor()
try: try:
...@@ -26,7 +27,7 @@ def get_data(): ...@@ -26,7 +27,7 @@ def get_data():
print(start) print(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,cl.l3,e.device_id,cut.time " \ "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \ "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \ "left join cid_type_top c on e.device_id = c.device_id " \
"left join cid_level2 cl on e.cid_id = cl.cid " \ "left join cid_level2 cl on e.cid_id = cl.cid " \
...@@ -36,24 +37,32 @@ def get_data(): ...@@ -36,24 +37,32 @@ def get_data():
# print(df.shape) # print(df.shape)
df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name", df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11: "l2", 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11: "l2",
12: "l3",13: "device_id", 14: "time"}) 12: "device_id", 13: "time"})
print("esmm data ok") print("esmm data ok")
# print(df.head(2) # print(df.head(2)
print("before") print("before")
print(df.shape) print(df.shape)
print("after") print("after")
df = df.drop_duplicates() df = df.drop_duplicates()
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "level2_ids", "time", "stat_date"] "channel", "top", "l1","l2", "time", "stat_date"])
df = df.drop_duplicates(features)
print(df.shape) print(df.shape)
unique_values = [] unique_values = []
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
df[i] = df[i] + i df[i] = df[i] + i
unique_values.extend(list(df[i].unique())) unique_values.extend(list(df[i].unique()))
df["l2"] = df["l2"].astype("str")
df["l2"] = df["l2"].fillna("lost")
df["l2"] = df["l2"]+"l1"
unique_values.extend(list(df["l2"].unique()))
print("features:")
print(len(unique_values))
print(df.head(2)) print(df.head(2))
temp = list(range(1,len(unique_values)+1)) temp = list(range(1,len(unique_values)+1))
...@@ -62,7 +71,8 @@ def get_data(): ...@@ -62,7 +71,8 @@ def get_data():
df = df.drop("device_id", axis=1) df = df.drop("device_id", axis=1)
train = df[df["stat_date"] != validate_date+"stat_date"] train = df[df["stat_date"] != validate_date+"stat_date"]
test = df[df["stat_date"] == validate_date+"stat_date"] test = df[df["stat_date"] == validate_date+"stat_date"]
for i in features: for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"]:
train[i] = train[i].map(value_map) train[i] = train[i].map(value_map)
test[i] = test[i].map(value_map) test[i] = test[i].map(value_map)
...@@ -85,22 +95,22 @@ def write_csv(df,name,n): ...@@ -85,22 +95,22 @@ def write_csv(df,name,n):
elif i + n > df.shape[0]: elif i + n > df.shape[0]:
temp = df.iloc[i:] temp = df.iloc[i:]
else: else:
temp = df.loc[i:i + n] temp = df.iloc[i:i + n]
temp.to_csv(path + name+ "/{}_{}.csv".format(name,i), index=False) temp.to_csv(path + name+ "/{}_{}.csv".format(name,i), index=False)
def get_predict(date,value_map): def get_predict(date,value_map):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \ "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,e.cid_id,cut.time " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \ "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \ "left join cid_type_top c on e.device_id = c.device_id " \
"left join diary_feat df on e.cid_id = df.diary_id " \ "left join cid_level2 cl on e.cid_id = cl.cid " \
"left join cid_time_cut cut on e.cid_id = cut.cid" "left join cid_time_cut cut on e.cid_id = cut.cid"
df = con_sql(db, sql) df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name", df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids", 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",
11: "device_id", 12: "cid_id", 13: "time"}) 12: "device_id", 13: "cid_id", 14: "time"})
df["stat_date"] = date df["stat_date"] = date
...@@ -108,18 +118,22 @@ def get_predict(date,value_map): ...@@ -108,18 +118,22 @@ def get_predict(date,value_map):
print(df.shape) print(df.shape)
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "level2_ids", "time", "stat_date"] "channel", "top", "l1", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
df[i] = df[i] + i df[i] = df[i] + i
df["l2"] = df["l2"].astype("str")
df["l2"] = df["l2"].fillna("lost")
df["l2"] = df["l2"] + "l1"
native_pre = df[df["label"] == 0] native_pre = df[df["label"] == 0]
native_pre = native_pre.drop("label", axis=1) native_pre = native_pre.drop("label", axis=1)
nearby_pre = df[df["label"] == 1] nearby_pre = df[df["label"] == 1]
nearby_pre = nearby_pre.drop("label", axis=1) nearby_pre = nearby_pre.drop("label", axis=1)
for i in features: for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"]:
native_pre[i] = native_pre[i].map(value_map) native_pre[i] = native_pre[i].map(value_map)
# TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下 # TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下
native_pre[i] = native_pre[i].fillna(0) native_pre[i] = native_pre[i].fillna(0)
......
...@@ -29,7 +29,7 @@ def multi(): ...@@ -29,7 +29,7 @@ def multi():
for i in list(df["level"].unique()): for i in list(df["level"].unique()):
l = i.split(";") l = i.split(";")
l = sorted(l) l = sorted(l)
if len(l) == 3: if len(l) >= 3:
df.loc[df["level"] == i, ["l1"]] = l[0] df.loc[df["level"] == i, ["l1"]] = l[0]
df.loc[df["level"] == i, ["l2"]] = l[1] df.loc[df["level"] == i, ["l2"]] = l[1]
df.loc[df["level"] == i, ["l3"]] = l[2] df.loc[df["level"] == i, ["l3"]] = l[2]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment