import datetime

import pandas as pd
import pymysql


def con_sql(db, sql):
    """Execute *sql* on connection *db* and return the rows as a DataFrame.

    The connection is always closed afterwards. On any error an empty
    DataFrame is returned instead of raising.
    """
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        # Bug fix: the original printed the Exception *class* object, not
        # the caught instance, so the real error message was lost.
        print("发生异常", e)
        df = pd.DataFrame()
    finally:
        cursor.close()
        db.close()
    return df


def get_data():
    """Build the ESMM train/validation CSVs from the last 30 days of data.

    Reads esmm_train_data joined with user/content features, encodes every
    categorical column to an integer id, splits the newest stat_date off as
    the validation set, and writes chunked CSVs via write_csv().

    Returns:
        (validate_date, value_map): the newest stat_date string (the
        validation split key) and the category-string -> integer-id map,
        both needed later by get_predict().
    """
    # NOTE(review): credentials are hard-coded here and below; they should
    # be moved to configuration / environment variables.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from esmm_train_data"
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=30)).strftime("%Y-%m-%d")
    print(start)

    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
          "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "where e.stat_date >= '{}'".format(start)
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id",
                            4: "clevel1_id", 5: "ccity_name", 6: "device_type",
                            7: "manufacturer", 8: "channel", 9: "top",
                            10: "l1", 11: "l2", 12: "device_id", 13: "time"})
    print("esmm data ok")
    print("before")
    print(df.shape)
    print("after")
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type",
                             "manufacturer", "channel", "top", "l1", "l2",
                             "time", "stat_date"])
    print(df.shape)

    unique_values = []
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type",
                "manufacturer", "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        # Suffix each value with its column name so identical raw values
        # appearing in different columns map to distinct category ids.
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))
    for i in ["l1", "l2"]:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        # l1/l2 values share the same category space as "top".
        df[i] = df[i] + "top"
        unique_values.extend(list(df[i].unique()))
    print("features:")
    print(len(unique_values))
    print(df.head(2))
    temp = list(range(1, len(unique_values) + 1))
    value_map = dict(zip(unique_values, temp))

    df = df.drop("device_id", axis=1)
    # Bug fix: .copy() ensures the .map() assignments below modify real
    # frames; assigning into a boolean-mask slice of df raises pandas'
    # SettingWithCopyWarning and may silently not stick.
    train = df[df["stat_date"] != validate_date + "stat_date"].copy()
    test = df[df["stat_date"] == validate_date + "stat_date"].copy()
    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type",
              "manufacturer", "channel", "top", "l1", "time", "stat_date", "l2"]:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)
    print("train shape")
    print(train.shape)
    print("test shape")
    print(test.shape)
    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)
    return validate_date, value_map


def write_csv(df, name, n):
    """Write *df* to `<path>/<name>/<name>_<offset>.csv` in chunks of *n* rows.

    Relies on the module-level `path` set in the __main__ block.
    """
    for i in range(0, df.shape[0], n):
        # iloc slicing clamps past the end of the frame, so the trailing
        # partial chunk needs no special-casing (the original
        # if/elif/else produced identical slices).
        temp = df.iloc[i:i + n]
        temp.to_csv(path + name + "/{}_{}.csv".format(name, i), index=False)


def get_predict(date, value_map):
    """Build prediction CSVs (native / nearby) from esmm_pre_data.

    Applies the same categorical encoding as get_data(), using the shared
    *value_map*; categories unseen at training time map to NaN and are
    filled with 0. *date* stamps every prediction row's stat_date.
    """
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,e.cid_id,cut.time " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
          "left join cid_time_cut cut on e.cid_id = cut.cid"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id",
                            4: "clevel1_id", 5: "ccity_name", 6: "device_type",
                            7: "manufacturer", 8: "channel", 9: "top",
                            10: "l1", 11: "l2", 12: "device_id",
                            13: "cid_id", 14: "time"})
    df["stat_date"] = date
    print("predict shape")
    print(df.shape)
    # Preserve raw ids for the output join files before encoding.
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]

    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type",
                "manufacturer", "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i
    for i in ["l1", "l2"]:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        # l1/l2 values share the same category space as "top".
        df[i] = df[i] + "top"

    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)
    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type",
              "manufacturer", "channel", "top", "l1", "time", "stat_date", "l2"]:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO: categories not seen in training become NaN; 0 is a
        # placeholder fill — improve handling later.
        native_pre[i] = native_pre[i].fillna(0)
        nearby_pre[i] = nearby_pre[i].map(value_map)
        # TODO: same placeholder fill as above.
        nearby_pre[i] = nearby_pre[i].fillna(0)

    print("native")
    print(native_pre.shape)
    print(native_pre.head())
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)
    print("nearby")
    print(nearby_pre.shape)
    print(nearby_pre.head())
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    path = "/home/gmuser/esmm_data/"
    date, value = get_data()
    get_predict(date, value)