add print in ctr

fbb3fd1c · 张彦钊 · f0b5b9e9 · fbb3fd1c · fbb3fd1c · fbb3fd1c
Commit fbb3fd1c authored Sep 12, 2018 by 张彦钊
Show whitespace changes
Inline Side-by-side

Showing with 69 additions and 66 deletions

ctr.py ctr.py +1 -0

utils.py local/utils.py +64 -62

prepareData.py prepareData.py +3 -3

utils.py utils.py +1 -1

No files found.
--- a/ctr.py
+++ b/ctr.py
@@ -64,6 +64,7 @@ def get_tail6():
    print("成功获取")
    result = cursor.fetchall()
    db.close()
+    print(pd.DataFrame(list(result)).empty)
    user = pd.DataFrame(list(result))[0].values.tolist()
    user = tuple(user)
    print("用户个数")

--- a/local/utils.py
+++ b/local/utils.py
@@ -165,65 +165,67 @@ class multiFFMFormatPandas:
            return False
 # ffm 格式转换函数、类
-# class FFMFormatPandas:
+class FFMFormatPandas:
-#     def __init__(self):
+    def __init__(self):
-#         self.field_index_ = None
+        self.field_index_ = None
-#         self.feature_index_ = None
+        self.feature_index_ = None
-#         self.y = None
+        self.y = None
-#
-#     def fit(self, df, y=None):
+    def fit(self, df, y=None):
-#         self.y = y
+        self.y = y
-#         df_ffm = df[df.columns.difference([self.y])]
+        df_ffm = df[df.columns.difference([self.y])]
-#         if self.field_index_ is None:
+        if self.field_index_ is None:
-#             self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
+            # 除了y，每列列名加索引对应的字典，例如 field_index = {name: 0, age: 1}
-#
+            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
-#         if self.feature_index_ is not None:
-#             last_idx = max(list(self.feature_index_.values()))
+        if self.feature_index_ is not None:
-#
+            last_idx = max(list(self.feature_index_.values()))
-#         if self.feature_index_ is None:
-#             self.feature_index_ = dict()
+        if self.feature_index_ is None:
-#             last_idx = 0
+            self.feature_index_ = dict()
-#
+            last_idx = 0
-#         for col in df.columns:
-#             vals = df[col].unique()
+        for col in df.columns:
-#             for val in vals:
+            vals = df[col].unique()
-#                 if pd.isnull(val):
+            for val in vals:
-#                     continue
+                if pd.isnull(val):
-#                 name = '{}_{}'.format(col, val)
+                    continue
-#                 if name not in self.feature_index_:
+                name = '{}_{}'.format(col, val)
-#                     self.feature_index_[name] = last_idx
+                if name not in self.feature_index_:
-#                     last_idx += 1
+                    # feature_index = {name_tom: 0, name_lily: 1, name: 2, age_18: 3, age_19: 4, age: 5}
-#             self.feature_index_[col] = last_idx
+                    self.feature_index_[name] = last_idx
-#             last_idx += 1
+                    last_idx += 1
-#         return self
+            self.feature_index_[col] = last_idx
-#
+            last_idx += 1
-#     def fit_transform(self, df, y=None):
+        return self
-#         self.fit(df, y)
-#         return self.transform(df)
+    def fit_transform(self, df, y=None):
-#
+        self.fit(df, y)
-#     def transform_row_(self, row, t):
+        return self.transform(df)
-#         ffm = []
-#         if self.y is not None:
+    def transform_row_(self, row, t):
-#             ffm.append(str(row.loc[row.index == self.y][0]))
+        ffm = []
-#         if self.y is None:
+        if self.y is not None:
-#             ffm.append(str(0))
+            ffm.append(str(row.loc[row.index == self.y][0]))
-#
+        if self.y is None:
-#         for col, val in row.loc[row.index != self.y].to_dict().items():
+            ffm.append(str(0))
-#             col_type = t[col]
-#             name = '{}_{}'.format(col, val)
+        for col, val in row.loc[row.index != self.y].to_dict().items():
-#             if col_type.kind == 'O':
+            col_type = t[col]
-#                 ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
+            name = '{}_{}'.format(col, val)
-#             elif col_type.kind == 'i':
+            if col_type.kind == 'O':
-#                 ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
+                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
-#         return ' '.join(ffm)
+            elif col_type.kind == 'i':
-#
+                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
-#     def transform(self, df):
+        return ' '.join(ffm)
-#         t = df.dtypes.to_dict()
-#         return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
+    def transform(self, df):
-#
+        t = df.dtypes.to_dict()
-#     # 下面这个方法不是这个类原有的方法，是新增的。目的是用来判断这个用户是不是在训练数据集中存在
+        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
-#     def is_feature_index_exist(self, name):
-#         if name in self.feature_index_:
+    # 下面这个方法不是这个类原有的方法，是新增的。目的是用来判断这个用户是不是在训练数据集中存在
-#             return True
+    def is_feature_index_exist(self, name):
-#         else:
+        if name in self.feature_index_:
-#             return False
+            return True
+        else:
+            return False
--- a/prepareData.py
+++ b/prepareData.py
@@ -6,7 +6,7 @@ import pymysql
 def fetch_data(start_date, end_date):
    # 获取点击表里的device_id
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select distinct device_id from data_feed_click"
    click_device_id = con_sql(db,sql)[0].values.tolist()
    print("成功获取点击表里的device_id")
@@ -14,7 +14,7 @@ def fetch_data(start_date, end_date):
    sql = "select cid,device_id,time,stat_date from data_feed_click " \
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    # 因为上面的db已经关了，需要再写一遍
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    click = con_sql(db,sql)
    click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
    print("成功获取点击表里的数据")
@@ -28,7 +28,7 @@ def fetch_data(start_date, end_date):
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    start = time.time()
    # 因为上面的db已经关了，需要再写一遍
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    exposure = con_sql(db,sql)
    end = time.time()
    print("获取曝光表耗时{}分".format((end-start)/60))

--- a/utils.py
+++ b/utils.py
@@ -283,7 +283,7 @@ class multiFFMFormatPandas:
 #         t = df.dtypes.to_dict()
 #         return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
 #
-#     # 下面这个方法不是这个类原有的方法，是新增的。目的是用来判断这个用户是不是在训练数据集中存在
+#     下面这个方法不是这个类原有的方法，是新增的。目的是用来判断这个用户是不是在训练数据集中存在
 #     def is_feature_index_exist(self, name):
 #         if name in self.feature_index_:
 #             return True