Commit 7a48b8e5 authored by 张彦钊

修改ffm转化函数,改成features累计相加

parent 1d7d0e0c
......@@ -70,24 +70,24 @@ def gen_tfrecords(in_file):
#"feat_vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals))})
#3 特殊字段单独处理
for f, (fname, def_id) in UMH_Fileds.items():
if f in ffv[:,0]:
mask = np.array(f == ffv[:,0])
feat_ids = ffv[mask,1]
feat_vals= ffv[mask,2]
else:
feat_ids = np.array([def_id])
feat_vals = np.array([1.0])
feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int))),
fname+"vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals.astype(np.float)))})
for f, (fname, def_id) in Ad_Fileds.items():
if f in ffv[:,0]:
mask = np.array(f == ffv[:,0])
feat_ids = ffv[mask,1]
else:
feat_ids = np.array([def_id])
feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int)))})
# for f, (fname, def_id) in UMH_Fileds.items():
# if f in ffv[:,0]:
# mask = np.array(f == ffv[:,0])
# feat_ids = ffv[mask,1]
# feat_vals= ffv[mask,2]
# else:
# feat_ids = np.array([def_id])
# feat_vals = np.array([1.0])
# feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int))),
# fname+"vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals.astype(np.float)))})
# for f, (fname, def_id) in Ad_Fileds.items():
# if f in ffv[:,0]:
# mask = np.array(f == ffv[:,0])
# feat_ids = ffv[mask,1]
# else:
# feat_ids = np.array([def_id])
# feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int)))})
# serialized to Example
example = tf.train.Example(features = tf.train.Features(feature = feature))
......
......@@ -38,6 +38,10 @@ class multiFFMFormatPandas:
self.y = None
def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
......@@ -49,17 +53,24 @@ class multiFFMFormatPandas:
if self.feature_index_ is None:
self.feature_index_ = dict()
for col in df.columns:
self.feature_index_[col] = 1
last_idx = 1
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
last_idx = 1
l = list(df.columns)
l.remove(y)
for col in l:
if f[col]=="O":
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
print("last_idx")
print(last_idx)
return self
def fit_transform(self, df, y=None,n=50000,processes=4):
......
......@@ -75,6 +75,19 @@ def get_cid():
if __name__ == "__main__":
get_cid()
writer = tf.python_io.TFRecordWriter('csv_train.tfrecords')
for i in xrange(train_values.shape[0]):
image_raw = train_values[i].tostring()
# build example protobuf
example = tf.train.Example(
features=tf.train.Features(feature={
'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[train_labels[i]]))
}))
writer.write(record=example.SerializeToString())
writer.close()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment