change directories and modules

d61a97b5 · 张彦钊 · 4f096b7d · d61a97b5 · d61a97b5 · d61a97b5
Commit d61a97b5 authored Aug 07, 2018 by 张彦钊
7 changed files
--- a/__pycache__/utils.cpython-36.pyc
+++ b/__pycache__/utils.cpython-36.pyc
--- a/diary-training.py
+++ b/diary-training.py
-import datetime
-import pymysql
 import pandas as pd
-import numpy as np
+from utils import FFMFormatPandas
 import xlearn as xl
+import time
+from prepareData import fetch_data

-
-# 从数据库的表里获取数据，并转化成df格式
-def con_sql(sql):
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    cursor = db.cursor()
-    cursor.execute(sql)
-    result = cursor.fetchall()
-    df = pd.DataFrame(list(result)).dropna()
-    db.close()
-    return df
-
-
-# 获取点击表里的device_id
-sql = "select distinct device_id from data_feed_click"
-click_device_id = con_sql(sql)[0].values.tolist()
-print("成功获取点击表里的device_id")
-
-# 获取点击表里的数据
-sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= '2018-08-03'"
-click = con_sql(sql)
-click = click.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
-print("成功获取点击表里的数据")
-# 从time特征中抽取hour
-click["hour"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
-click["minute"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
-click = click.drop("time",axis=1)
-print("点击表数据预览")
-print(click.head(2))
-
-# 获取曝光表里的数据
-sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= '2018-08-03'"
-exposure = con_sql(sql)
-exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
-print("成功获取曝光表里的数据")
-# 从time特征中抽取hour
-exposure["hour"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
-exposure["minute"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
-exposure = exposure.drop("time",axis=1)
-print("曝光表数据预览")
-print(exposure.head(2))
+exposure, click, click_device_id = fetch_data()

 # 求曝光表和点击表的差集合
 print("曝光表处理前的样本个数")
@@ -86,69 +47,13 @@ print(data.head(2))


 print("start ffm transform")
-
-
-# ffm 格式转换函数、类
-class FFMFormatPandas:
-    def __init__(self):
-        self.field_index_ = None
-        self.feature_index_ = None
-        self.y = None
-
-    def fit(self, df, y=None):
-        self.y = y
-        df_ffm = df[df.columns.difference([self.y])]
-        if self.field_index_ is None:
-            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
-
-        if self.feature_index_ is not None:
-            last_idx = max(list(self.feature_index_.values()))
-
-        if self.feature_index_ is None:
-            self.feature_index_ = dict()
-            last_idx = 0
-
-        for col in df.columns:
-            vals = df[col].unique()
-            for val in vals:
-                if pd.isnull(val):
-                    continue
-                name = '{}_{}'.format(col, val)
-                if name not in self.feature_index_:
-                    self.feature_index_[name] = last_idx
-                    last_idx += 1
-            self.feature_index_[col] = last_idx
-            last_idx += 1
-        return self
-
-    def fit_transform(self, df, y=None):
-        self.fit(df, y)
-        return self.transform(df)
-
-    def transform_row_(self, row, t):
-        ffm = []
-        if self.y != None:
-            ffm.append(str(row.loc[row.index == self.y][0]))
-        if self.y is None:
-            ffm.append(str(0))
-
-        for col, val in row.loc[row.index != self.y].to_dict().items():
-            col_type = t[col]
-            name = '{}_{}'.format(col, val)
-            if col_type.kind ==  'O':
-                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
-            elif col_type.kind == 'i':
-                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
-        return ' '.join(ffm)
-
-    def transform(self, df):
-        t = df.dtypes.to_dict()
-        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
-
-
+start = time.time()
 ffm_train = FFMFormatPandas()
 data = ffm_train.fit_transform(data, y='y')
 print("done transform ffm")
+end = time.time()
+print("ffm转化数据耗时：")
+print(end-start)
 data.to_csv("/home/zhangyanzhao/data.csv",index=False)
 data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
 print("数据集大小")

--- a/prepareData.py
+++ b/prepareData.py
+from utils import con_sql
+import datetime
+
+
+def _fetch_feed_table(table, label, start_date):
+    """Load one feed table and derive hour/minute columns from ``time``.
+
+    ``table`` is the table name to query, ``label`` is the Chinese table
+    label used in the progress messages, and ``start_date`` is an already
+    validated ``YYYY-MM-DD`` string.
+    """
+    # The date must be quoted: unquoted, 2018-08-03 is parsed by SQL as the
+    # integer expression 2018 - 8 - 3 instead of a date literal.
+    sql = ("select cid,device_id,time,stat_date from {0} "
+           "where stat_date >= '{1}'").format(table, start_date)
+    df = con_sql(sql)
+    # con_sql returns positional column labels; give them their real names.
+    df = df.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
+    print("成功获取{0}表里的数据".format(label))
+    # Extract hour and minute from the time feature, then drop the raw value.
+    df["hour"] = df["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
+    df["minute"] = df["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
+    df = df.drop("time", axis=1)
+    print("{0}表数据预览".format(label))
+    print(df.head(2))
+    return df
+
+
+def fetch_data(start_date='2018-08-03'):
+    """Fetch exposure and click data with ``stat_date >= start_date``.
+
+    Args:
+        start_date: Lower bound for ``stat_date`` in ``YYYY-MM-DD`` form.
+
+    Returns:
+        A tuple ``(exposure, click, click_device_id)``: the two DataFrames
+        have columns cid, device_id, stat_date, hour and minute;
+        ``click_device_id`` is the list of distinct device ids found in the
+        click table (not restricted by date).
+
+    Raises:
+        ValueError: If ``start_date`` is not a ``YYYY-MM-DD`` date.
+    """
+    # con_sql only accepts a finished SQL string, so the date is interpolated
+    # as text; validating its format keeps arbitrary SQL out of the query.
+    start_date = str(start_date)
+    datetime.datetime.strptime(start_date, '%Y-%m-%d')
+
+    # Distinct device ids present in the click table.
+    sql = "select distinct device_id from data_feed_click"
+    click_device_id = con_sql(sql)[0].values.tolist()
+    print("成功获取点击表里的device_id")
+
+    click = _fetch_feed_table("data_feed_click", "点击", start_date)
+    exposure = _fetch_feed_table("data_feed_exposure", "曝光", start_date)
+
+    return exposure, click, click_device_id
--- a/diaryTestSet.py
+++ b/diaryTestSet.py
--- a/test/prepareTestData.py
+++ b/test/prepareTestData.py
+from prepareData import fetch_data
+
+
+def fetch_test_data():
+    """Fetch the data used for the test set (stat_date from 2018-08-06 on).
+
+    Returns:
+        The ``(exposure, click, click_device_ids)`` tuple produced by
+        ``fetch_data``. Previously the fetched data was discarded and the
+        function returned ``None``.
+    """
+    exposure, click, click_device_ids = fetch_data(start_date='2018-08-06')
+    return exposure, click, click_device_ids
+
+
+
--- a/test/testCases.py
+++ b/test/testCases.py
+from utils import *
+import datetime
+
+
+if __name__ == '__main__':
+    # Manual check of the hour extraction on a locally saved exposure dump.
+    data = pd.read_csv("data/raw-exposure.csv")[["cid","device_id","time"]]
+
+    # A single lambda is required here: the previous nested
+    # ``lambda x: lambda x: ...`` stored a function object in every row
+    # instead of the hour extracted from the timestamp.
+    data["hour"] = data["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
+
+    #data.to_csv("data/data.csv")
+    print(data.head())
+
+
+
+
--- a/utils.py
+++ b/utils.py
+# -*- coding: utf-8 -*-
+import pymysql
+import pandas as pd
+import numpy as np
+
+
+# Fetch rows for a query from the database and convert them to a DataFrame.
+def con_sql(sql):
+    """Run ``sql`` against the ``jerry_test`` database and return the rows.
+
+    Args:
+        sql: A complete SQL statement; it is executed as given.
+
+    Returns:
+        A DataFrame built from the fetched rows with positional integer
+        column labels (0, 1, ...). Rows containing any null are dropped.
+    """
+    # NOTE(review): host and root credentials are hard-coded in source and
+    # therefore in version control; they should be moved to configuration or
+    # environment variables and the password rotated.
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    try:
+        cursor = db.cursor()
+        cursor.execute(sql)
+        result = cursor.fetchall()
+    finally:
+        # Close the connection even when the query fails, so a bad statement
+        # does not leak a server connection.
+        db.close()
+    return pd.DataFrame(list(result)).dropna()
+
+
+# FFM format conversion helper class.
+class FFMFormatPandas:
+    """Convert a pandas DataFrame into FFM-style text rows.
+
+    Every row becomes ``label field:feature:value ...``. Object-dtype columns
+    are emitted with value 1 and one feature index per distinct value;
+    signed-integer columns are emitted with their own value under a single
+    per-column feature index. Columns of any other dtype are not emitted.
+
+    Attributes are ``field_index_`` (column name -> field id),
+    ``feature_index_`` (feature name -> feature id) and ``y`` (name of the
+    label column, or None).
+    """
+
+    def __init__(self):
+        self.field_index_ = None
+        self.feature_index_ = None
+        self.y = None
+
+    def fit(self, df, y=None):
+        """Build (or extend) the field and feature indexes from ``df``.
+
+        Args:
+            df: DataFrame to index.
+            y: Name of the label column, or None when there is no label.
+
+        Returns:
+            ``self``, so calls can be chained.
+        """
+        self.y = y
+        df_ffm = df[df.columns.difference([self.y])]
+        if self.field_index_ is None:
+            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
+
+        if self.feature_index_ is None:
+            self.feature_index_ = dict()
+            last_idx = 0
+        else:
+            # Continue numbering after the highest index already handed out;
+            # starting at the maximum itself would give the first new feature
+            # the same id as an existing one.
+            last_idx = max(self.feature_index_.values(), default=-1) + 1
+
+        for col in df.columns:
+            for val in df[col].unique():
+                if pd.isnull(val):
+                    continue
+                name = '{}_{}'.format(col, val)
+                if name not in self.feature_index_:
+                    self.feature_index_[name] = last_idx
+                    last_idx += 1
+            # Keep a column's own index stable across repeated fits.
+            if col not in self.feature_index_:
+                self.feature_index_[col] = last_idx
+                last_idx += 1
+        return self
+
+    def fit_transform(self, df, y=None):
+        """Fit the indexes on ``df`` and return its transformed rows."""
+        self.fit(df, y)
+        return self.transform(df)
+
+    def transform_row_(self, row, t):
+        """Format one row as an FFM text line.
+
+        Args:
+            row: A row Series as yielded by ``DataFrame.iterrows``.
+            t: Mapping of column name to dtype for the frame being
+                transformed.
+
+        Returns:
+            The space-joined line: the label (or ``0`` when no label is
+            configured or the row has no label column) followed by the
+            ``field:feature:value`` entries.
+        """
+        ffm = []
+        if self.y is not None and self.y in row.index:
+            # Label-based lookup; positional ``[0]`` on a label-indexed
+            # Series is deprecated in pandas.
+            ffm.append(str(row[self.y]))
+        else:
+            ffm.append(str(0))
+
+        for col, val in row.loc[row.index != self.y].to_dict().items():
+            col_type = t[col]
+            if col_type.kind == 'O':
+                # fit() never indexes null values, so skip them here as well
+                # instead of failing on a missing feature name.
+                if pd.isnull(val):
+                    continue
+                name = '{}_{}'.format(col, val)
+                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
+            elif col_type.kind == 'i':
+                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
+        return ' '.join(ffm)
+
+    def transform(self, df):
+        """Return a Series of FFM text lines, keyed by the index of ``df``."""
+        t = df.dtypes.to_dict()
+        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
\ No newline at end of file