change test file

3f37c5ce · 张彦钊 · ead306a3 · 3f37c5ce
Commit 3f37c5ce authored Apr 30, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 23 additions and 18 deletions

record.py tensnsorflow/record.py +23 -18

No files found.
--- a/tensnsorflow/record.py
+++ b/tensnsorflow/record.py
@@ -20,29 +20,32 @@ tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
 tf.app.flags.DEFINE_integer("threads", 16, "threads num")

 def gen_tfrecords(in_file):
-    basename = os.path.basename(in_file) + ".tfrecord"
-    out_file = os.path.join(FLAGS.output_dir, basename)
+    # basename = os.path.basename("/home/gmuser/") + ".tfrecord"
+    # out_file = os.path.join(FLAGS.output_dir, basename)
+    out_file = "/home/gmuser/hello.tfrecord"
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
-    df = pd.read_csv(in_file)
+    from hdfs import InsecureClient
+    from hdfs.ext.dataframe import read_dataframe
+    client = InsecureClient('http://nvwa01:50070')
+    df = read_dataframe(client,"/recommend/tr/part-00000-2f0d632b-0c61-4a0b-97d4-54bd5e579c5e-c000.avro")

    for i in range(df.shape[0]):
-        feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "time", "stat_date","hospital_id",
-                 "method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
+        feats = ["cid_id"]
        id = np.array([])
        for j in feats:
            id = np.append(id,df[j][i])
-        app_list = np.array(str(df["app_list"][i]).split(","))
-        level2_list = np.array(str(df["clevel2_id"][i]).split(","))
-        level3_list = np.array(str(df["level3_ids"][i]).split(","))
+        # app_list = np.array(str(df["app_list"][i]).split(","))
+        # level2_list = np.array(str(df["clevel2_id"][i]).split(","))
+        # level3_list = np.array(str(df["level3_ids"][i]).split(","))
        features = tf.train.Features(feature={
            "y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
            "z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
-            "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
-            "app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
-            "level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
-            "level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
-            })
+            "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int)))
+        })
+            # "app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
+            # "level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
+            # "level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
+

        example = tf.train.Example(features = features)
        serialized = example.SerializeToString()
@@ -52,7 +55,7 @@ def gen_tfrecords(in_file):
 def main(_):
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
-    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
+    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.avro"))
    print("total files: %d" % len(file_list))

    pool = ThreadPool(FLAGS.threads) # Sets the pool size
@@ -62,5 +65,7 @@ def main(_):


 if __name__ == "__main__":
-    tf.logging.set_verbosity(tf.logging.INFO)
-    tf.app.run()
\ No newline at end of file
+    # tf.logging.set_verbosity(tf.logging.INFO)
+    # tf.app.run()
+
+    gen_tfrecords("a")
\ No newline at end of file