1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import datetime
import pymysql
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import xlearn as xl
# 从数据库的表里获取数据,并转化成df格式
def con_sql(sql):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result)).dropna()
db.close()
return df
# 获取点击表里的device_id
sql = "select distinct device_id from data_feed_click where cid_type = 'diary'"
click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id")
# 获取点击表里的数据
sql = "select cid,device_id,time from data_feed_click where cid_type = 'diary'"
click = con_sql(sql)
click = click.rename(columns={0:"cid",1:"device_id",2:"time"})
print("成功获取点击表里的数据")
# 获取曝光表里的数据
sql = "select cid,device_id,time from data_feed_exposure where cid_type = 'diary'"
exposure = con_sql(sql)
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time"})
print("成功获取曝光表里的数据")
# 求曝光表和点击表的差集合
exposure.append(click)
exposure.append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset,keep=False)
print("成功完成曝光表和点击表的差集合")
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
# 打标签
click["y"] = 1
exposure["y"] = 0
print("成功获取正负样本")
# 合并点击表和曝光表
data = click.append(exposure)
print("done 合并点击表和曝光表")
print(data.head(2))
# 从time特征中抽取hour、weekday
data["hour"] = data["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
data["weekday"] = data["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).weekday())
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data.loc[data["hour"]==0] = 24
data.loc[data["weekday"]==0] = 7
data["hour"] = data["hour"].astype("category")
data["weekday"] = data["weekday"].astype("category")
data = data.drop("time",axis=1)
print("成功从time特征中抽取hour、weekday")
print(data.head(2))
data = shuffle(data)
print("start ffm transform")
# ffm 格式转换函数、类
class FFMFormatPandas:
def __init__(self):
self.field_index_ = None
self.feature_index_ = None
self.y = None
def fit(self, df, y=None):
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
if self.feature_index_ is not None:
last_idx = max(list(self.feature_index_.values()))
if self.feature_index_ is None:
self.feature_index_ = dict()
last_idx = 0
for col in df.columns:
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
self.feature_index_[col] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None):
self.fit(df, y)
return self.transform(df)
def transform_row_(self, row, t):
ffm = []
if self.y != None:
ffm.append(str(row.loc[row.index == self.y][0]))
if self.y is None:
ffm.append(str(0))
for col, val in row.loc[row.index != self.y].to_dict().items():
col_type = t[col]
name = '{}_{}'.format(col, val)
if col_type.kind == 'O':
ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
elif col_type.kind == 'i':
ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
return ' '.join(ffm)
def transform(self, df):
t = df.dtypes.to_dict()
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
ffm_train = FFMFormatPandas()
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
n = np.rint(data.shape[0]/8)
m = np.rint(data.shape[0]*(3/8))
# 1/8的数据集用来做测试集
data.loc[:n].to_csv("/home/zhangyanzhao/test.csv",index = False,header = None)
# 1/4的数据集用来做验证集
data.loc[n+1:m].to_csv("/home/zhangyanzhao/validation.csv",index = False,header = None)
# 剩余的数据集用来做验证集
data.loc[m+1:].to_csv("/home/zhangyanzhao/train.csv",index = False,header = None)
# 销毁data,目的是为了节省内存
data = data.drop(data.index.tolist())
print("start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
param = {'task':'binary', 'lr':0.2,
'lambda':0.002, 'metric':'auc'}
ffm_model.fit(param, '/home/zhangyanzhao/model.out')
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setSigmoid()
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
print("end")