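修改ffm转化函数，改成features累计相加 (Modify the FFM conversion function so that feature indices accumulate across all columns instead of restarting for each column)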

7a48b8e5 · 张彦钊 · 1d7d0e0c
Commit 7a48b8e5 authored Jan 07, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 54 additions and 30 deletions

get_tfrecord.py eda/esmm/Feature_pipline/get_tfrecord.py +18 -18

ffm.py tensnsorflow/ffm.py +22 -11

test.py tensnsorflow/test.py +14 -1

No files found.
--- a/eda/esmm/Feature_pipline/get_tfrecord.py
+++ b/eda/esmm/Feature_pipline/get_tfrecord.py
@@ -70,24 +70,24 @@ def gen_tfrecords(in_file):
                            #"feat_vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals))})

            #3 特殊字段单独处理
-            for f, (fname, def_id) in UMH_Fileds.items():
-                if f in ffv[:,0]:
-                    mask = np.array(f == ffv[:,0])
-                    feat_ids = ffv[mask,1]
-                    feat_vals= ffv[mask,2]
-                else:
-                    feat_ids = np.array([def_id])
-                    feat_vals = np.array([1.0])
-                feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int))),
-                                fname+"vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals.astype(np.float)))})
-
-            for f, (fname, def_id) in Ad_Fileds.items():
-                if f in ffv[:,0]:
-                    mask = np.array(f == ffv[:,0])
-                    feat_ids = ffv[mask,1]
-                else:
-                    feat_ids = np.array([def_id])
-                feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int)))})
+            # for f, (fname, def_id) in UMH_Fileds.items():
+            #     if f in ffv[:,0]:
+            #         mask = np.array(f == ffv[:,0])
+            #         feat_ids = ffv[mask,1]
+            #         feat_vals= ffv[mask,2]
+            #     else:
+            #         feat_ids = np.array([def_id])
+            #         feat_vals = np.array([1.0])
+            #     feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int))),
+            #                     fname+"vals": tf.train.Feature(float_list=tf.train.FloatList(value=feat_vals.astype(np.float)))})
+
+            # for f, (fname, def_id) in Ad_Fileds.items():
+            #     if f in ffv[:,0]:
+            #         mask = np.array(f == ffv[:,0])
+            #         feat_ids = ffv[mask,1]
+            #     else:
+            #         feat_ids = np.array([def_id])
+            #     feature.update({fname+"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=feat_ids.astype(np.int)))})

            # serialized to Example
            example = tf.train.Example(features = tf.train.Features(feature = feature))

--- a/tensnsorflow/ffm.py
+++ b/tensnsorflow/ffm.py
@@ -38,6 +38,10 @@ class multiFFMFormatPandas:
        self.y = None

    def fit(self, df, y=None):
+        b = df.dtypes
+        c = list(b.values)
+        d =tuple(df.dtypes.to_dict())
+        f = dict(zip(d,c))
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
@@ -49,17 +53,24 @@ class multiFFMFormatPandas:
        if self.feature_index_ is None:
            self.feature_index_ = dict()

-        for col in df.columns:
-            self.feature_index_[col] = 1
-            last_idx = 1
-            vals = df[col].unique()
-            for val in vals:
-                if pd.isnull(val):
-                    continue
-                name = '{}_{}'.format(col, val)
-                if name not in self.feature_index_:
-                    self.feature_index_[name] = last_idx
-                    last_idx += 1
+        last_idx = 1
+        l = list(df.columns)
+        l.remove(y)
+        for col in l:
+            if f[col]=="O":
+                vals = df[col].unique()
+                for val in vals:
+                    if pd.isnull(val):
+                        continue
+                    name = '{}_{}'.format(col, val)
+                    if name not in self.feature_index_:
+                        self.feature_index_[name] = last_idx
+                        last_idx += 1
+            else:
+                self.feature_index_[col] = last_idx
+                last_idx += 1
+        print("last_idx")
+        print(last_idx)
        return self

    def fit_transform(self, df, y=None,n=50000,processes=4):

--- a/tensnsorflow/test.py
+++ b/tensnsorflow/test.py
@@ -75,6 +75,19 @@ def get_cid():


 if __name__ == "__main__":
-   get_cid()
+    writer = tf.python_io.TFRecordWriter('csv_train.tfrecords')
+
+    for i in xrange(train_values.shape[0]):
+        image_raw = train_values[i].tostring()
+
+        # build example protobuf
+        example = tf.train.Example(
+            features=tf.train.Features(feature={
+                'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
+                'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[train_labels[i]]))
+            }))
+        writer.write(record=example.SerializeToString())
+
+    writer.close()