import datetime

import numpy as np
import pandas as pd
import pymysql
import xlearn as xl
from sklearn.utils import shuffle


# Fetch data from a database table and convert it to a DataFrame
def con_sql(sql):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result)).dropna()
    db.close()
    return df


# Get the distinct device_id values from the click table
sql = "select distinct device_id from data_feed_click where cid_type = 'diary'"
click_device_id = con_sql(sql)[0].values.tolist()
print("Fetched device_id list from the click table")

# Get the raw rows from the click table
sql = "select cid,device_id,time from data_feed_click where cid_type = 'diary'"
click = con_sql(sql)
click = click.rename(columns={0: "cid", 1: "device_id", 2: "time"})
print("Fetched data from the click table")

# Get the raw rows from the exposure table
sql = "select cid,device_id,time from data_feed_exposure where cid_type = 'diary'"
exposure = con_sql(sql)
exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time"})
print("Fetched data from the exposure table")

# Compute the set difference between the exposure and click tables:
# append the click rows twice, then drop every duplicated row, so only
# rows that appear solely in the exposure table remain
exposure = exposure.append(click).append(click)
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("Computed the difference between the exposure and click tables")
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]

# Label the samples: clicks are positives, exposures without clicks are negatives
click["y"] = 1
exposure["y"] = 0
print("Built positive and negative samples")

# Merge the click and exposure tables
data = click.append(exposure)
print("done merging click and exposure tables")
print(data.head(2))

# Extract hour and weekday from the time feature
data["hour"] = data["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
data["weekday"] = data["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).weekday())
# Features whose value is 0 are dropped by the ffm format, so remap 0 so that
# neither hour nor weekday ever takes the value 0
data.loc[data["hour"] == 0, "hour"] = 24
data.loc[data["weekday"] == 0, "weekday"] = 7
data["hour"] = data["hour"].astype("category")
data["weekday"] = data["weekday"].astype("category")
data = data.drop("time", axis=1)
print("Extracted hour and weekday from the time feature")
print(data.head(2))

data = shuffle(data)
print("start ffm transform")


# ffm format conversion class: field_index_ numbers each field (column),
# feature_index_ numbers each "column_value" feature
class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))
        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})


ffm_train = FFMFormatPandas()
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")

n = int(data.shape[0] / 8)
m = int(data.shape[0] * (3 / 8))
# 1/8 of the data is used as the test set
data.iloc[:n].to_csv("/home/zhangyanzhao/test.csv", index=False, header=None)
# 1/4 of the data is used as the validation set
data.iloc[n:m].to_csv("/home/zhangyanzhao/validation.csv", index=False, header=None)
# The remaining data is used as the training set
data.iloc[m:].to_csv("/home/zhangyanzhao/train.csv", index=False, header=None)
# Empty data to free memory
data = data.drop(data.index.tolist())
print("start training")

# Train the FFM model
ffm_model = xl.create_ffm()
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")
param = {'task': 'binary', 'lr': 0.2, 'lambda': 0.002, 'metric': 'auc'}
ffm_model.fit(param, '/home/zhangyanzhao/model.out')

# Predict on the test set, outputting sigmoid-transformed click probabilities
ffm_model.setTest("/home/zhangyanzhao/test.csv")
ffm_model.setSigmoid()
ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
print("end")
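
# Optional sanity check -- a minimal sketch, not part of the original pipeline.
# It assumes output.txt holds one sigmoid probability per line, aligned row-for-row
# with test.csv, whose first space-separated token is the label written by
# FFMFormatPandas above.
from sklearn.metrics import roc_auc_score

with open("/home/zhangyanzhao/test.csv") as f:
    test_labels = [int(line.split(' ', 1)[0]) for line in f if line.strip()]
with open("/home/zhangyanzhao/output.txt") as f:
    test_preds = [float(line) for line in f if line.strip()]
print("test AUC:", roc_auc_score(test_labels, test_preds))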