diary-training.py 5.22 KB
Newer Older
张彦钊's avatar
张彦钊 committed
1 2 3 4 5 6
import datetime
import pymysql
import pandas as pd
from sklearn.utils import shuffle
import numpy as np
import xlearn as xl
张彦钊's avatar
张彦钊 committed
7 8


张彦钊's avatar
张彦钊 committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
# Fetch rows from a database table and convert them to a DataFrame.
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return the rows.

    Args:
        sql: The SQL statement to execute.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows. Columns are
        labelled by position (0, 1, 2, ...) and every row that contains a
        missing value is dropped.

    Raises:
        Whatever ``pymysql`` raises on connection or query failure; the
        connection is closed before the exception propagates.
    """
    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to configuration / environment and the password rotated.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()
    return pd.DataFrame(list(result)).dropna()

# Device ids that have at least one diary click.
sql = "select distinct device_id from data_feed_click where cid_type = 'diary'"
click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id")

# Diary click rows (these become the positive samples).
sql = "select cid,device_id,time from data_feed_click where cid_type = 'diary'"
click = con_sql(sql)
click = click.rename(columns={0:"cid",1:"device_id",2:"time"})
print("成功获取点击表里的数据")

# Diary exposure rows (candidates for the negative samples).
sql = "select cid,device_id,time from data_feed_exposure where cid_type = 'diary'"
exposure = con_sql(sql)
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time"})
print("成功获取曝光表里的数据")

# Set difference "exposure minus click": stack the click rows onto the
# exposure rows twice, then drop every row that occurs more than once.
# Each click row is guaranteed to appear at least twice and is therefore
# removed, together with any exposure row it matches. The concatenation result
# must be assigned back, otherwise the click rows are never taken into account.
# Note that keep=False also removes rows duplicated within exposure itself.
subset = click.columns.tolist()
exposure = pd.concat([exposure, click, click])
exposure = exposure.drop_duplicates(subset=subset,keep=False)
print("成功完成曝光表和点击表的差集合")
# Keep only exposures of devices that clicked at least once; copy so the label
# assignment below works on an independent frame.
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)].copy()
# Label the samples: clicks are positive, remaining exposures are negative.
click["y"] = 1
exposure["y"] = 0
print("成功获取正负样本")

# Merge clicks and exposures. ignore_index gives every row its own label;
# otherwise both frames contribute overlapping labels (0, 1, 2, ...).
data = pd.concat([click, exposure], ignore_index=True)
print("done 合并点击表和曝光表")
print(data.head(2))
# 从time特征中抽取hour、weekday
张彦钊's avatar
张彦钊 committed
53 54
# Derive hour-of-day and day-of-week (Monday == 0) in local time from "time".
# NOTE(review): assumes "time" holds POSIX timestamps in seconds - confirm.
data["hour"] = data["time"].map(lambda ts: datetime.datetime.fromtimestamp(ts).hour)
data["weekday"] = data["time"].map(lambda ts: datetime.datetime.fromtimestamp(ts).weekday())
张彦钊's avatar
张彦钊 committed
55
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
张彦钊's avatar
张彦钊 committed
56 57 58 59
# Remap the zero values so that no hour/weekday feature is 0: midnight becomes
# 24 and Monday becomes 7. The column must be named in the .loc indexer;
# without it the assignment overwrites EVERY column of the matching rows
# (cid, device_id and the label y included).
data.loc[data["hour"] == 0, "hour"] = 24
data.loc[data["weekday"] == 0, "weekday"] = 7
# Categorical dtype makes the FFM encoder treat both as one-hot fields.
data["hour"] = data["hour"].astype("category")
data["weekday"] = data["weekday"].astype("category")
张彦钊's avatar
张彦钊 committed
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
# hour and weekday have been derived, so the raw timestamp column can go.
data = data.drop(columns="time")
print("成功从time特征中抽取hour、weekday")
print(data.head(2))

# Randomise the row order before the encoded data is split into files.
data = shuffle(data)

print("start ffm transform")
# FFM format conversion class.
class FFMFormatPandas:
    """Encode a pandas DataFrame as libffm-style text lines.

    Every row becomes ``"<label> <field>:<feature>:<value> ..."``.

    * Columns whose dtype kind is ``'O'`` (object, category) are emitted as
      ``field:feature:1`` with one feature index per distinct value.
    * Columns whose dtype kind is ``'i'`` (signed integers) are emitted as
      ``field:feature:value`` with one feature index per column.
    * Columns of any other dtype kind are not emitted.

    Attributes are populated by :meth:`fit`:
    ``field_index_`` maps column name -> field number, ``feature_index_`` maps
    ``"<column>_<value>"`` and ``"<column>"`` -> feature number, ``y`` is the
    name of the label column (or ``None``).
    """

    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature indices from ``df``.

        Args:
            df: Frame to learn the indices from.
            y: Name of the label column, or ``None`` when there is no label.

        Returns:
            ``self``, so calls can be chained.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = {}
            next_idx = 0
        else:
            # Continue AFTER the largest index handed out so far; starting at
            # the maximum itself would give a new feature an index already in use.
            next_idx = max(self.feature_index_.values(), default=-1) + 1

        # The label column is indexed as well; this keeps the numbering of the
        # remaining features identical to what earlier runs produced.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = next_idx
                    next_idx += 1
            # Keep an existing per-column index stable across repeated fits.
            if col not in self.feature_index_:
                self.feature_index_[col] = next_idx
                next_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its encoded lines (see :meth:`transform`)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row.

        Args:
            row: One row of the frame as a Series indexed by column name.
            t: Mapping of column name -> dtype of that column in the frame.

        Returns:
            The libffm text line for the row. The label is ``"0"`` when no
            label column was configured.
        """
        # Look the label up by name; positional ``[0]`` on a label-indexed
        # Series depends on a deprecated pandas fallback.
        label = str(row[self.y]) if self.y is not None else str(0)
        ffm = [label]

        for col, val in row.items():
            if col == self.y:
                continue
            kind = t[col].kind
            if kind == 'O':
                # fit() registers no feature for missing values, so skip them
                # here instead of failing on the lookup.
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of ``df``.

        Returns:
            A Series of text lines carrying the same index as ``df``. Rows that
            share an index label are all kept.
        """
        t = df.dtypes.to_dict()
        # Build from a list, not a dict keyed by index label: a dict would
        # silently keep only the last row for each duplicated label.
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index)

ffm_train = FFMFormatPandas()
# Give the shuffled frame a fresh 0..N-1 index before encoding, so every row
# has a unique label and the encoded Series is in the shuffled order.
data = ffm_train.fit_transform(data.reset_index(drop=True), y='y')
print("done transform ffm")

# Split boundaries as integer row positions. Positional slicing (iloc) is
# required: label-based .loc slicing on shuffled labels does not select a
# fixed fraction of the rows.
n = int(np.rint(data.shape[0]/8))
m = int(np.rint(data.shape[0]*(3/8)))
# First 1/8 of the rows: test set.
data.iloc[:n].to_csv("/home/zhangyanzhao/test.csv",index = False,header = False)
# Next 1/4 of the rows: validation set.
data.iloc[n:m].to_csv("/home/zhangyanzhao/validation.csv",index = False,header = False)
# Remaining rows: training set.
data.iloc[m:].to_csv("/home/zhangyanzhao/train.csv",index = False,header = False)
# Release the encoded data to save memory before training starts.
del data

print("start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")

# Binary classification evaluated with AUC; lr is the learning rate and
# lambda the regularisation strength passed to xlearn.
param = {'task':'binary', 'lr':0.2,
         'lambda':0.002, 'metric':'auc'}

ffm_model.fit(param, '/home/zhangyanzhao/model.out')

ffm_model.setTest("/home/zhangyanzhao/test.csv")
# Convert raw scores to probabilities in the prediction output.
ffm_model.setSigmoid()

ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
print("end")