copy utils.py to local

b35c02f4 · 张彦钊 · 4784895c · b35c02f4
Commit b35c02f4 authored Aug 14, 2018 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 0 deletions

utils.py local/utils.py +90 -0

No files found.
--- a/local/utils.py
+++ b/local/utils.py
+# -*- coding: utf-8 -*-
+import pymysql
+import pandas as pd
+import numpy as np
+import redis
+# Fetch data from a table in the TiDB database and convert it to a DataFrame.
+def con_sql(sql, args=None):
+    """Run a query against the configured TiDB database and return a DataFrame.
+
+    Args:
+        sql: SQL statement handed to ``cursor.execute``.
+        args: Optional parameters bound to ``sql`` by the driver. Prefer this
+            over building the statement with string formatting. Defaults to
+            ``None``, which executes ``sql`` exactly as given.
+
+    Returns:
+        A ``pandas.DataFrame`` built from every fetched row, with rows that
+        contain a missing value dropped. No column names are supplied, so
+        columns are labelled 0, 1, 2, ...
+    """
+    # NOTE(review): host and credentials are hard-coded in source; they should
+    # be loaded from configuration or the environment instead.
+    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    try:
+        cursor = db.cursor()
+        try:
+            cursor.execute(sql, args)
+            result = cursor.fetchall()
+        finally:
+            cursor.close()
+    finally:
+        # Release the connection even when the query raises.
+        db.close()
+    return pd.DataFrame(list(result)).dropna()
+# Write data to Redis.
+# TODO: the production Redis address has not been provided; the address below
+# is the test environment and must be replaced with the production address.
+def add_data_to_redis(key, val):
+    """Store ``val`` under ``key`` in Redis with a 36-hour expiry.
+
+    Args:
+        key: Redis key to write.
+        val: Value to store under ``key``.
+    """
+    r = redis.StrictRedis(host='10.30.50.58', port=6379, db=12)
+    # Write the value and its TTL in a single SET command so that a failure
+    # between two separate calls can never leave a key without an expiry.
+    r.set(key, val, ex=36 * 60 * 60)
+# FFM format conversion helper class.
+class FFMFormatPandas:
+    """Encode DataFrame rows as FFM-style ``label field:feature:value`` strings.
+
+    ``fit`` builds two lookup tables: ``field_index_`` maps each non-label
+    column to a field number, and ``feature_index_`` maps both
+    ``'<column>_<value>'`` (used for object columns) and ``'<column>'``
+    (used for signed-integer columns) to a feature number.
+    """
+
+    def __init__(self):
+        # column name -> field index; created by the first call to fit()
+        self.field_index_ = None
+        # '<column>_<value>' / '<column>' -> feature index
+        self.feature_index_ = None
+        # name of the label column, or None when there is no label
+        self.y = None
+
+    def fit(self, df, y=None):
+        """Learn (or extend) the field and feature indices from ``df``.
+
+        Args:
+            df: DataFrame to index.
+            y: Name of the label column, or ``None``.
+
+        Returns:
+            ``self``, so calls can be chained.
+        """
+        self.y = y
+        # Index.difference returns the remaining columns in sorted order, so
+        # field numbers follow sorted column names, not DataFrame order.
+        df_ffm = df[df.columns.difference([self.y])]
+        if self.field_index_ is None:
+            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
+        if self.feature_index_ is None:
+            self.feature_index_ = dict()
+            last_idx = 0
+        else:
+            # Continue after the largest index already handed out; starting
+            # at the maximum itself would give a new feature a used index.
+            last_idx = max(self.feature_index_.values(), default=-1) + 1
+        for col in df.columns:
+            for val in df[col].unique():
+                if pd.isnull(val):
+                    continue
+                name = '{}_{}'.format(col, val)
+                if name not in self.feature_index_:
+                    self.feature_index_[name] = last_idx
+                    last_idx += 1
+            # Keep the index a column already has so that a later fit() does
+            # not renumber features used by earlier encodings.
+            if col not in self.feature_index_:
+                self.feature_index_[col] = last_idx
+                last_idx += 1
+        return self
+
+    def fit_transform(self, df, y=None):
+        """Call ``fit(df, y)`` and return ``transform(df)``."""
+        self.fit(df, y)
+        return self.transform(df)
+
+    def transform_row_(self, row, t):
+        """Encode a single row as one FFM-style line.
+
+        Args:
+            row: Series holding one DataFrame row, indexed by column name.
+            t: Mapping of column name to that column's dtype.
+
+        Returns:
+            A space-separated string: the label (``'0'`` when no label column
+            is set) followed by one ``field:feature:value`` token per encoded
+            column. Columns whose dtype kind is neither ``'O'`` nor ``'i'``
+            (for example float columns) produce no token.
+
+        Raises:
+            KeyError: If a column or an object-column value was not seen by
+                ``fit``.
+        """
+        ffm = []
+        if self.y is not None:
+            # Look the label up by name; positional ``[0]`` on a
+            # label-indexed Series is deprecated in pandas.
+            ffm.append(str(row[self.y]))
+        else:
+            ffm.append(str(0))
+        for col, val in row.loc[row.index != self.y].to_dict().items():
+            col_type = t[col]
+            name = '{}_{}'.format(col, val)
+            if col_type.kind == 'O':
+                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
+            elif col_type.kind == 'i':
+                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
+        return ' '.join(ffm)
+
+    def transform(self, df):
+        """Encode every row of ``df``.
+
+        Returns:
+            A ``pandas.Series`` keyed by the index labels of ``df`` whose
+            values are the strings produced by ``transform_row_``.
+        """
+        t = df.dtypes.to_dict()
+        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
+
+    # The method below is not part of the original class; it was added to
+    # check whether this user exists in the training data set.
+    def is_feature_index_exist(self, name):
+        """Return True if ``name`` is a key of the fitted feature index.
+
+        Returns False when ``fit`` has not been called yet.
+        """
+        return self.feature_index_ is not None and name in self.feature_index_