Commit a7d512a3 authored by 张彦钊

ffm转化累计相加

parent 7a5e54d0
...@@ -38,6 +38,10 @@ class multiFFMFormatPandas: ...@@ -38,6 +38,10 @@ class multiFFMFormatPandas:
self.y = None self.y = None
def fit(self, df, y=None): def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y self.y = y
df_ffm = df[df.columns.difference([self.y])] df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None: if self.field_index_ is None:
...@@ -49,9 +53,11 @@ class multiFFMFormatPandas: ...@@ -49,9 +53,11 @@ class multiFFMFormatPandas:
if self.feature_index_ is None: if self.feature_index_ is None:
self.feature_index_ = dict() self.feature_index_ = dict()
for col in df.columns:
self.feature_index_[col] = 1
last_idx = 1 last_idx = 1
l = list(df.columns)
l.remove(y)
for col in l:
if f[col]=="O":
vals = list(df[col].unique()) vals = list(df[col].unique())
vals.append("na") vals.append("na")
vals = set(vals) vals = set(vals)
...@@ -62,6 +68,9 @@ class multiFFMFormatPandas: ...@@ -62,6 +68,9 @@ class multiFFMFormatPandas:
if name not in self.feature_index_: if name not in self.feature_index_:
self.feature_index_[name] = last_idx self.feature_index_[name] = last_idx
last_idx += 1 last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
return self return self
def fit_transform(self, df, y=None,n=50000,processes=4): def fit_transform(self, df, y=None,n=50000,processes=4):
...@@ -131,14 +140,6 @@ class multiFFMFormatPandas: ...@@ -131,14 +140,6 @@ class multiFFMFormatPandas:
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()}) return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
# The method below is not one of this class's original methods; it was added to check whether this user exists in the training data set.
def is_feature_index_exist(self, name):
    """Report whether ``name`` is a key of ``self.feature_index_``.

    Args:
        name: Key to look up in the ``feature_index_`` mapping.

    Returns:
        bool: ``True`` if ``name`` is present in ``self.feature_index_``,
        ``False`` otherwise.
    """
    # The membership test already yields a bool, so no if/else is required.
    return name in self.feature_index_
def get_data(): def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data" sql = "select max(stat_date) from esmm_train_data"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment