first time commit

55f77c05 · 张彦钊 · 879adfa0 · 55f77c05
Commit 55f77c05 authored Aug 06, 2018 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 163 additions and 0 deletions

diary-training.py diary-training.py +163 -0

No files found.
--- a/diary-training.py
+++ b/diary-training.py
+import datetime
+import pymysql
+import pandas as pd
+from sklearn.utils import shuffle
+import numpy as np
+import xlearn as xl
+# 从数据库的表里获取数据，并转化成df格式
def con_sql(sql):
    """Run *sql* against the ``jerry_test`` database and return the rows.

    Args:
        sql: A complete SQL statement to execute.

    Returns:
        A ``pandas.DataFrame`` with one row per result row, integer column
        labels (0, 1, 2, ...) and every row containing a NULL removed.

    Raises:
        pymysql.MySQLError: If connecting or executing the query fails. The
            connection is closed before the error propagates.
    """
    # NOTE(review): credentials are hard-coded in source control; move them
    # to configuration / environment and rotate this password.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         password='3SYz54LS9#^9sBvC', database='jerry_test')
    try:
        # The cursor context manager closes the cursor even on failure.
        with db.cursor() as cursor:
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        # Always release the connection, including when the query raises.
        db.close()
    return pd.DataFrame(list(result)).dropna()
+
# Fetch the distinct device_ids that appear in the click table.
sql = "select distinct device_id from data_feed_click where cid_type = 'diary'"
click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id")

# Fetch the click rows (positive samples).
sql = "select cid,device_id,time from data_feed_click where cid_type = 'diary'"
click = con_sql(sql)
click = click.rename(columns={0: "cid", 1: "device_id", 2: "time"})
print("成功获取点击表里的数据")

# Fetch the exposure rows (candidates for negative samples).
sql = "select cid,device_id,time from data_feed_exposure where cid_type = 'diary'"
exposure = con_sql(sql)
exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time"})
print("成功获取曝光表里的数据")

# Set difference "exposure minus click": stack the click rows onto the
# exposure rows twice, then drop every row that occurs more than once.
# Click rows always occur at least twice, so only exposure-only rows survive.
# The concatenated frame must be assigned back, otherwise nothing is removed.
# NOTE(review): exposure rows duplicated among themselves are dropped too.
exposure = pd.concat([exposure, click, click])
subset = click.columns.tolist()
exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("成功完成曝光表和点击表的差集合")
# Keep only exposures from devices that clicked at least once; copy so the
# label assignment below writes to an independent frame, not a view.
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)].copy()
# Label the samples: clicked = 1, exposed but not clicked = 0.
click["y"] = 1
exposure["y"] = 0
print("成功获取正负样本")

# Merge positives and negatives; rebuild the index so row labels are unique.
data = pd.concat([click, exposure], ignore_index=True)
print("done 合并点击表和曝光表")
print(data.head(2))
# Derive hour and weekday (local time) from the unix timestamp.
local_time = data["time"].apply(datetime.datetime.fromtimestamp)
data["hour"] = local_time.apply(lambda t: t.hour)
data["weekday"] = local_time.apply(lambda t: t.weekday())
# Zero-valued features are dropped by the ffm format, so remap 0 to an
# unused value. Only the affected column is written; assigning to whole
# rows would also overwrite cid, device_id and the label.
data.loc[data["hour"] == 0, "hour"] = 24
data.loc[data["weekday"] == 0, "weekday"] = 7
data["hour"] = data["hour"].astype("category")
data["weekday"] = data["weekday"].astype("category")
data = data.drop("time", axis=1)
print("成功从time特征中抽取hour、weekday")
print(data.head(2))

# Shuffle, then renumber rows so later splitting sees an ordered index.
data = shuffle(data).reset_index(drop=True)

print("start ffm transform")
+# ffm 格式转换函数、类
class FFMFormatPandas:
    """Convert a pandas DataFrame into libffm-style text rows.

    Every output row has the form ``label field:feature:value ...``.
    Object/categorical columns are one-hot encoded (value ``1``); integer
    columns keep their numeric value under a single per-column feature id.
    Columns of any other dtype are omitted from the output.
    """

    def __init__(self):
        # Mapping column name -> field id, built on the first fit().
        self.field_index_ = None
        # Mapping feature name -> feature id; grows across fit() calls.
        self.feature_index_ = None
        # Name of the label column, or None when the data is unlabeled.
        self.y = None

    def fit(self, df, y=None):
        """Learn field and feature ids from *df*.

        Args:
            df: Training frame, optionally containing the label column.
            y: Name of the label column, or None if there is none.

        Returns:
            self, so calls can be chained.
        """
        self.y = y
        if self.field_index_ is None:
            label_cols = [] if y is None else [y]
            feature_cols = df.columns.difference(label_cols)
            self.field_index_ = {col: i for i, col in enumerate(feature_cols)}

        if self.feature_index_ is None:
            self.feature_index_ = {}
        # Continue after the highest id already assigned so a refit never
        # hands out an id that is in use.
        next_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            if col == y:
                # The label is not a feature; keep it out of the id space.
                continue
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = next_idx
                    next_idx += 1
            # Per-column id used by numeric features; keep it stable on refit.
            if col not in self.feature_index_:
                self.feature_index_[col] = next_idx
                next_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on *df* and return its ffm-encoded rows."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row.

        Args:
            row: One DataFrame row as a Series indexed by column name.
            t: Mapping column name -> dtype of the originating frame.

        Returns:
            The ffm text line for the row. The label is ``0`` when no label
            column is configured or the row does not carry it.

        Raises:
            KeyError: If a column or categorical value was not seen by fit().
        """
        has_label = self.y is not None and self.y in row.index
        ffm = [str(row[self.y]) if has_label else str(0)]

        for col, val in row.items():
            if col == self.y:
                continue
            if pd.isnull(val):
                # Missing values get no entry, mirroring fit().
                continue
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind in 'iu':
                # Signed and unsigned integers carry their value directly.
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of *df*.

        Returns:
            A Series of ffm text lines aligned with ``df.index``. Rows that
            share an index label are all kept.
        """
        t = df.dtypes.to_dict()
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        # Build from a list, not a dict keyed by label, so duplicate index
        # labels cannot overwrite one another.
        return pd.Series(lines, index=df.index, dtype=object)
+
ffm_train = FFMFormatPandas()
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")

# Split boundaries as integer row positions: first 1/8, then up to 3/8.
n = int(np.rint(data.shape[0] / 8))
m = int(np.rint(data.shape[0] * (3 / 8)))
# Slice by position (iloc); label-based slicing on shuffled data would not
# produce the intended proportions.
# The first 1/8 of the rows is the test set.
data.iloc[:n].to_csv("/home/zhangyanzhao/test.csv", index=False, header=False)
# The next 1/4 of the rows is the validation set.
data.iloc[n:m].to_csv("/home/zhangyanzhao/validation.csv", index=False, header=False)
# The remaining rows are the training set.
data.iloc[m:].to_csv("/home/zhangyanzhao/train.csv", index=False, header=False)
# Release the encoded rows to save memory before training.
del data

print("start training")
ffm_model = xl.create_ffm()
ffm_model.setTrain("/home/zhangyanzhao/train.csv")
ffm_model.setValidate("/home/zhangyanzhao/validation.csv")

param = {'task': 'binary', 'lr': 0.2,
         'lambda': 0.002, 'metric': 'auc'}

ffm_model.fit(param, '/home/zhangyanzhao/model.out')

ffm_model.setTest("/home/zhangyanzhao/test.csv")
# Emit predictions as probabilities in (0, 1) instead of raw scores.
ffm_model.setSigmoid()

ffm_model.predict("/home/zhangyanzhao/model.out", "/home/zhangyanzhao/output.txt")
print("end")
+
+
+
+
+
+
+
+
+
+
+
+