Commit 2ff68205 authored by 张彦钊's avatar 张彦钊

multi hot insert database

parent f45e1013
......@@ -2,6 +2,7 @@ import pandas as pd
import pymysql
import datetime
def con_sql(db,sql):
cursor = db.cursor()
try:
......@@ -26,7 +27,7 @@ def get_data():
print(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,cl.l3,e.device_id,cut.time " \
"u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \
"left join cid_level2 cl on e.cid_id = cl.cid " \
......@@ -36,24 +37,32 @@ def get_data():
# print(df.shape)
df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11: "l2",
12: "l3",13: "device_id", 14: "time"})
12: "device_id", 13: "time"})
print("esmm data ok")
# print(df.head(2)
print("before")
print(df.shape)
print("after")
df = df.drop_duplicates()
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "level2_ids", "time", "stat_date"]
df = df.drop_duplicates(features)
df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1","l2", "time", "stat_date"])
print(df.shape)
unique_values = []
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date"]
for i in features:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
df[i] = df[i] + i
unique_values.extend(list(df[i].unique()))
df["l2"] = df["l2"].astype("str")
df["l2"] = df["l2"].fillna("lost")
df["l2"] = df["l2"]+"l1"
unique_values.extend(list(df["l2"].unique()))
print("features:")
print(len(unique_values))
print(df.head(2))
temp = list(range(1,len(unique_values)+1))
......@@ -62,7 +71,8 @@ def get_data():
df = df.drop("device_id", axis=1)
train = df[df["stat_date"] != validate_date+"stat_date"]
test = df[df["stat_date"] == validate_date+"stat_date"]
for i in features:
for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"]:
train[i] = train[i].map(value_map)
test[i] = test[i].map(value_map)
......@@ -85,22 +95,22 @@ def write_csv(df,name,n):
elif i + n > df.shape[0]:
temp = df.iloc[i:]
else:
temp = df.loc[i:i + n]
temp = df.iloc[i:i + n]
temp.to_csv(path + name+ "/{}_{}.csv".format(name,i), index=False)
def get_predict(date,value_map):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id,cut.time " \
"u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,e.cid_id,cut.time " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \
"left join diary_feat df on e.cid_id = df.diary_id " \
"left join cid_level2 cl on e.cid_id = cl.cid " \
"left join cid_time_cut cut on e.cid_id = cut.cid"
df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
11: "device_id", 12: "cid_id", 13: "time"})
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",
12: "device_id", 13: "cid_id", 14: "time"})
df["stat_date"] = date
......@@ -108,18 +118,22 @@ def get_predict(date,value_map):
print(df.shape)
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "level2_ids", "time", "stat_date"]
"channel", "top", "l1", "time", "stat_date"]
for i in features:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
df[i] = df[i] + i
df["l2"] = df["l2"].astype("str")
df["l2"] = df["l2"].fillna("lost")
df["l2"] = df["l2"] + "l1"
native_pre = df[df["label"] == 0]
native_pre = native_pre.drop("label", axis=1)
nearby_pre = df[df["label"] == 1]
nearby_pre = nearby_pre.drop("label", axis=1)
for i in features:
for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"]:
native_pre[i] = native_pre[i].map(value_map)
# TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下
native_pre[i] = native_pre[i].fillna(0)
......
......@@ -29,7 +29,7 @@ def multi():
for i in list(df["level"].unique()):
l = i.split(";")
l = sorted(l)
if len(l) == 3:
if len(l) >= 3:
df.loc[df["level"] == i, ["l1"]] = l[0]
df.loc[df["level"] == i, ["l2"]] = l[1]
df.loc[df["level"] == i, ["l3"]] = l[2]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment