import datetime

import pandas as pd
import pymysql


def con_sql(db, sql):
    """Execute *sql* on connection *db* and return the rows as a DataFrame.

    The connection is always closed, success or failure.  On any error an
    empty DataFrame is returned so callers can keep going.
    """
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        # BUG FIX: the original printed the Exception *class*, not the
        # caught error instance, which hid every failure's actual cause.
        print("发生异常", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


def multi_hot(df, column, n):
    """Integer-encode the comma-separated values of df[column] in place.

    Every distinct token across the column gets an id starting at *n*
    (offsets keep separate vocabularies from colliding).  NaN cells are
    first replaced by the sentinel "lost_na".

    Returns:
        (number, mapping): the vocabulary size and the token -> id dict.
    """
    df[column] = df[column].fillna("lost_na")
    app_list_value = [i.split(",") for i in df[column].unique()]
    app_list_unique = []
    for i in app_list_value:
        app_list_unique.extend(i)
    app_list_unique = list(set(app_list_unique))
    number = len(app_list_unique)
    app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
    df[column] = df[column].apply(app_list_func, args=(app_list_map,))
    return number, app_list_map


def get_data():
    """Build the ESMM training set from MySQL and write train/validation CSVs.

    Uses the latest stat_date in the training table as the validation day
    and the preceding 300 days as training.  Relies on the module-level
    globals ``train_data_set`` and ``path`` set in the __main__ guard.

    Returns:
        (validate_date, value_map, app_list_map, level2_map) so that
        get_predict can encode prediction data with identical mappings.
    """
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from {}".format(train_data_set)
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
    print(start)
    # con_sql closes the connection, so a fresh one is needed per query.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,cut.time,dl.app_list " \
          "from {} e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id " \
          "where e.stat_date >= '{}'".format(train_data_set, start)
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id",
                            4: "clevel2_id", 5: "ccity_name", 6: "device_type",
                            7: "manufacturer", 8: "channel", 9: "top",
                            10: "device_id", 11: "time", 12: "app_list"})
    print("esmm data ok")
    # BUG FIX: the original printed "before"/"after" around nothing; bracket
    # the deduplication so the shape change is actually visible.
    print("before")
    print(df.shape)
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name",
                             "device_type", "manufacturer", "channel",
                             "top", "time", "stat_date", "app_list"])
    print("after")
    print(df.shape)

    # Encode the two multi-valued columns; id ranges are offset so the
    # app_list vocabulary, the level2 vocabulary and value_map never overlap.
    app_list_number, app_list_map = multi_hot(df, "app_list", 1)
    level2_number, level2_map = multi_hot(df, "clevel2_id", 1 + app_list_number)

    unique_values = []
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        # NOTE(review): astype("str") already turns NaN into the literal
        # string "nan", so this fillna is likely a no-op; kept for exact
        # parity with get_predict — confirm before changing either side.
        df[i] = df[i].fillna("lost")
        # Suffix each value with its column name so identical raw values in
        # different columns map to distinct ids.
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))
    temp = list(range(1 + app_list_number + level2_number,
                      1 + app_list_number + level2_number + len(unique_values)))
    value_map = dict(zip(unique_values, temp))

    df = df.drop("device_id", axis=1)
    # stat_date values carry the "stat_date" suffix added in the loop above.
    # .copy() avoids SettingWithCopy issues when mapping the splits below.
    train = df[df["stat_date"] != validate_date + "stat_date"].copy()
    test = df[df["stat_date"] == validate_date + "stat_date"].copy()
    for i in features:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)

    print("train shape")
    print(train.shape)
    print("test shape")
    print(test.shape)
    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)
    return validate_date, value_map, app_list_map, level2_map


def app_list_func(x, l):
    """Map each comma-separated token of *x* through dict *l*.

    Unknown tokens become 0.  Returns the ids re-joined with commas.
    """
    return ",".join(str(l.get(i, 0)) for i in x.split(","))


def write_csv(df, name, n):
    """Write *df* under path + name/ in CSV chunks of at most *n* rows.

    iloc slicing clamps past the end, so no special last-chunk branch is
    needed (the original's three-way if/elif/else was redundant).
    """
    for i in range(0, df.shape[0], n):
        temp = df.iloc[i:i + n]
        temp.to_csv(path + name + "/{}_{}.csv".format(name, i), index=False)


def get_predict(date, value_map, app_list_map, level2_map):
    """Encode prediction data with the training mappings and write CSVs.

    Rows with label == 0 form the "native" set, label == 1 the "nearby"
    set.  *date* (the validation date) is stamped on as stat_date so the
    value_map lookups line up with training-time encoding.
    """
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time,dl.app_list " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id",
                            4: "clevel2_id", 5: "ccity_name", 6: "device_type",
                            7: "manufacturer", 8: "channel", 9: "top",
                            10: "device_id", 11: "cid_id", 12: "time",
                            13: "app_list"})
    df["stat_date"] = date
    print(df.head(6))

    # Same multi-value encoding as training, reusing the training maps.
    df["app_list"] = df["app_list"].fillna("lost_na")
    df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    df["clevel2_id"] = df["clevel2_id"].fillna("lost_na")
    df["clevel2_id"] = df["clevel2_id"].apply(app_list_func, args=(level2_map,))

    # Preserve raw identifiers for the output join files.
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]

    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i

    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)
    for i in features:
        # TODO: categories unseen during training map to NaN; filled with 0
        # for now — handle unknown categories properly later.
        native_pre[i] = native_pre[i].map(value_map)
        native_pre[i] = native_pre[i].fillna(0)
        nearby_pre[i] = nearby_pre[i].map(value_map)
        nearby_pre[i] = nearby_pre[i].fillna(0)

    print("native")
    print(native_pre.shape)
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)

    print("nearby")
    print(nearby_pre.shape)
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    train_data_set = "esmm_train_data"
    path = "/data/esmm/"
    date, value, app_list, level2 = get_data()
    get_predict(date, value, app_list, level2)