Commit a7d512a3 authored by 张彦钊's avatar 张彦钊

ffm转化累计相加

parent 7a5e54d0
......@@ -38,6 +38,10 @@ class multiFFMFormatPandas:
self.y = None
def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
......@@ -49,19 +53,24 @@ class multiFFMFormatPandas:
if self.feature_index_ is None:
self.feature_index_ = dict()
for col in df.columns:
self.feature_index_[col] = 1
last_idx = 1
vals = list(df[col].unique())
vals.append("na")
vals = set(vals)
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
last_idx = 1
l = list(df.columns)
l.remove(y)
for col in l:
if f[col]=="O":
vals = list(df[col].unique())
vals.append("na")
vals = set(vals)
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None,n=50000,processes=4):
......@@ -131,14 +140,6 @@ class multiFFMFormatPandas:
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
# 下面这个方法不是这个类原有的方法,是新增的。目的是用来判断这个用户是不是在训练数据集中存在
def is_feature_index_exist(self, name):
if name in self.feature_index_:
return True
else:
return False
def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment