1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import time
from prepareData import fetch_data
from utils import *
import pandas as pd
from config import *
import pickle
def feature_en(data_start_date, data_end_date, validation_date, test_date):
    """Build the labeled data set from the exposure and click logs.

    Clicked rows are labeled y=1 and exposure-only rows y=0; exposures are
    restricted to devices that have at least one click. Also persists the
    candidate ``cid`` and ``device_id`` sets used later to filter prediction
    candidates.

    :param data_start_date: first stat_date (inclusive) of the raw data pull
    :param data_end_date: last stat_date (inclusive) of the raw data pull
    :param validation_date: stat_date whose rows form the validation set
    :param test_date: stat_date whose rows form the test set
    :return: tuple ``(data, test_number, validation_number)`` where ``data``
        is sorted by stat_date descending (test rows first, then validation)
        and the two counts size the head-of-frame test/validation splits.
    """
    exposure, click, click_device_id = fetch_data(data_start_date, data_end_date)
    # Anti-join (exposure minus click): append the click table TWICE so every
    # clicked row occurs at least twice, then drop_duplicates(keep=False)
    # removes all of them, leaving only exposures that were never clicked.
    # NOTE: DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent.
    print("曝光表处理前的样本个数")
    print(exposure.shape)
    exposure = pd.concat([exposure, click, click])
    subset = click.columns.tolist()
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("差集后曝光表个数")
    print(exposure.shape)
    # Keep only exposures from devices that clicked at least once.
    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
    print("去除未点击用户后曝光表个数")
    print(exposure.shape)
    # Label positives (clicks) and negatives (unclicked exposures).
    click["y"] = 1
    exposure["y"] = 0
    print("正样本个数")
    print(click.shape[0])
    print("负样本个数")
    print(exposure.shape[0])
    # Merge the click and exposure tables into one data set.
    data = pd.concat([click, exposure])
    print("点击表和曝光表合并成功")
    data = data.sort_values(by="stat_date", ascending=False)
    test_number = data[data["stat_date"] == test_date].shape[0]
    validation_number = data[data["stat_date"] == validation_date].shape[0]
    data = data.drop("stat_date", axis=1)
    # The FFM format drops zero-valued features, so shift hour 0 -> 24 and
    # minute 0 -> 60 so that no feature value is ever 0.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    # Persist the candidate cid set; used to filter prediction candidates.
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df['cid'] = data_set_cid
    cid_df.to_csv(DIRECTORY_PATH + "train/data_set_cid.csv", index=False)
    print("成功保存data_set_cid")
    # Persist the device_id set; at prediction time a device_id outside this
    # set does not need to be scored.
    data_set_device_id = data["device_id"].unique()
    device_id_df = pd.DataFrame()
    device_id_df['device_id'] = data_set_device_id
    device_id_df.to_csv(DIRECTORY_PATH + "train/data_set_device_id.csv", index=False)
    print("成功保存data_set_device_id")
    return data, test_number, validation_number
def ffm_transform(data, test_number, validation_number):
    """Convert the labeled data set to FFM format and write the splits.

    Writes the full transformed data set plus test/validation/train CSV
    splits under ``DIRECTORY_PATH`` and pickles the fitted FFM encoder so
    prediction reuses the same field/feature index mapping.

    :param data: labeled DataFrame from ``feature_en`` (sorted by stat_date
        descending, so test rows come first, then validation rows).
    :param test_number: number of leading rows that form the test split
    :param validation_number: number of rows after the test split that form
        the validation split
    """
    print("Start ffm transform")
    start = time.time()
    ffm_train = multiFFMFormatPandas()
    data = ffm_train.fit_transform(data, y='y', n=50000, processes=20)
    # Persist the fitted encoder for the prediction pipeline.
    with open(DIRECTORY_PATH + "train/ffm.pkl", "wb") as f:
        pickle.dump(ffm_train, f)
    print("done transform ffm")
    end = time.time()
    print("ffm转化数据耗时(分):")
    print((end - start) / 60)
    # BUG FIX: the original wrote this file WITH a header row and read it back
    # with header=None, so the header line re-entered the frame as a data row
    # and (together with the label-inclusive .loc[:test_number] slice) a junk
    # row was prepended to the test split. Write without a header and slice
    # positionally with iloc (half-open), which yields exactly test_number /
    # validation_number rows per split.
    data.to_csv(DIRECTORY_PATH + "total_ffm_data.csv", index=False, header=False)
    data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
    print("数据集大小")
    print(data.shape)
    # data is sorted by stat_date descending: test rows first, then validation.
    test = data.iloc[:test_number]
    print("测试集大小")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test_ffm_data.csv", index=False, header=None)
    # NOTE: the test date must be later than the validation date, otherwise
    # this positional split is wrong.
    validation = data.iloc[test_number:test_number + validation_number]
    print("验证集大小")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation_ffm_data.csv", index=False, header=None)
    train = data.iloc[test_number + validation_number:]
    print("训练集大小")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train_ffm_data.csv", index=False, header=None)