import time
import pickle

import pandas as pd

from prepareData import fetch_data
# DIRECTORY_PATH comes from config, multiFFMFormatPandas from utils (star imports).
from utils import *
from config import *


def feature_en(data_start_date, data_end_date, validation_date, test_date):
    exposure, click, click_device_id = fetch_data(data_start_date, data_end_date)

    # Build the set difference "exposure minus click": append the click table
    # twice so that every row that also appears in click becomes a duplicate
    # and is dropped by keep=False, leaving only the non-clicked exposures.
    print("Exposure rows before processing")
    print(exposure.shape)
    exposure = pd.concat([exposure, click, click])
    subset = click.columns.tolist()
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("Exposure rows after removing clicked impressions")
    print(exposure.shape)

    # Keep only exposures from devices that have clicked at least once.
    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
    print("Exposure rows after dropping never-click users")
    print(exposure.shape)

    # Label the samples: clicks are positive, remaining exposures are negative.
    click["y"] = 1
    exposure["y"] = 0
    print("Number of positive samples")
    print(click.shape[0])
    print("Number of negative samples")
    print(exposure.shape[0])

    # Merge the click and exposure tables into one sample set.
    data = pd.concat([click, exposure], ignore_index=True)
    print("Click and exposure tables merged")

    # Sort by date descending so that the test-date rows come first,
    # followed by the validation-date rows, then the training rows.
    data = data.sort_values(by="stat_date", ascending=False)
    test_number = data[data["stat_date"] == test_date].shape[0]
    validation_number = data[data["stat_date"] == validation_date].shape[0]
    data = data.drop("stat_date", axis=1)

    # The FFM format drops features whose value is 0; after the mapping below
    # no feature takes the value 0.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")

    # Persist the candidate cids; used later to filter the prediction candidate set.
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df["cid"] = data_set_cid
    cid_df.to_csv(DIRECTORY_PATH + "train/data_set_cid.csv", index=False)
    print("data_set_cid saved")

    # Persist the device_ids so that at prediction time we can check whether a
    # device_id belongs to this set; if it does not, it needs no prediction.
    data_set_device_id = data["device_id"].unique()
    device_id_df = pd.DataFrame()
    device_id_df["device_id"] = data_set_device_id
    device_id_df.to_csv(DIRECTORY_PATH + "train/data_set_device_id.csv", index=False)
    print("data_set_device_id saved")

    return data, test_number, validation_number


def ffm_transform(data, test_number, validation_number):
    print("Start ffm transform")
    start = time.time()
    ffm_train = multiFFMFormatPandas()
    data = ffm_train.fit_transform(data, y="y", n=50000, processes=20)
    with open(DIRECTORY_PATH + "train/ffm.pkl", "wb") as f:
        pickle.dump(ffm_train, f)
    print("done transform ffm")
    end = time.time()
    print("FFM transform time (minutes):")
    print((end - start) / 60)

    # Round-trip through CSV without a header row so that row positions line up
    # with test_number / validation_number in the slicing below.
    data.to_csv(DIRECTORY_PATH + "total_ffm_data.csv", index=False, header=False)
    data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
    print("Full data set size")
    print(data.shape)

    # The data is sorted by date descending, so the first test_number rows are
    # the test set and the next validation_number rows the validation set.
    test = data.iloc[:test_number]
    print("Test set size")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test_ffm_data.csv", index=False, header=False)

    # Note: the test date must be later than the validation date,
    # otherwise the split below may be wrong.
    validation = data.iloc[test_number:(test_number + validation_number)]
    print("Validation set size")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation_ffm_data.csv", index=False, header=False)

    train = data.iloc[(test_number + validation_number):]
    print("Training set size")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train_ffm_data.csv", index=False, header=False)
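

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): feature_en()
# builds the labelled sample set, and ffm_transform() materialises the FFM
# train/validation/test files from it. The date values below are hypothetical
# placeholders assuming stat_date is encoded as "YYYYMMDD" strings; adjust
# them to match how fetch_data() and the upstream tables actually store dates.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    data_start_date = "20181201"   # hypothetical start of the training window
    data_end_date = "20181231"     # hypothetical end of the training window
    validation_date = "20181230"   # hypothetical: second-to-last day as validation
    test_date = "20181231"         # hypothetical: last (latest) day as test

    data, test_number, validation_number = feature_en(
        data_start_date, data_end_date, validation_date, test_date
    )
    ffm_transform(data, test_number, validation_number)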