Commit 3f37c5ce authored by 张彦钊

change test file

parent ead306a3
......@@ -20,29 +20,32 @@ tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
tf.app.flags.DEFINE_integer("threads", 16, "threads num")
def gen_tfrecords(in_file):
basename = os.path.basename(in_file) + ".tfrecord"
out_file = os.path.join(FLAGS.output_dir, basename)
# basename = os.path.basename("/home/gmuser/") + ".tfrecord"
# out_file = os.path.join(FLAGS.output_dir, basename)
out_file = "/home/gmuser/hello.tfrecord"
tfrecord_out = tf.python_io.TFRecordWriter(out_file)
df = pd.read_csv(in_file)
from hdfs import InsecureClient
from hdfs.ext.dataframe import read_dataframe
client = InsecureClient('http://nvwa01:50070')
df = read_dataframe(client,"/recommend/tr/part-00000-2f0d632b-0c61-4a0b-97d4-54bd5e579c5e-c000.avro")
for i in range(df.shape[0]):
feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date","hospital_id",
"method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
feats = ["cid_id"]
id = np.array([])
for j in feats:
id = np.append(id,df[j][i])
app_list = np.array(str(df["app_list"][i]).split(","))
level2_list = np.array(str(df["clevel2_id"][i]).split(","))
level3_list = np.array(str(df["level3_ids"][i]).split(","))
# app_list = np.array(str(df["app_list"][i]).split(","))
# level2_list = np.array(str(df["clevel2_id"][i]).split(","))
# level3_list = np.array(str(df["level3_ids"][i]).split(","))
features = tf.train.Features(feature={
"y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
"z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
"app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
"level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
"level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int)))
})
# "app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
# "level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
# "level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
example = tf.train.Example(features = features)
serialized = example.SerializeToString()
......@@ -52,7 +55,7 @@ def gen_tfrecords(in_file):
def main(_):
if not os.path.exists(FLAGS.output_dir):
os.mkdir(FLAGS.output_dir)
file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.avro"))
print("total files: %d" % len(file_list))
pool = ThreadPool(FLAGS.threads) # Sets the pool size
......@@ -62,5 +65,7 @@ def main(_):
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
\ No newline at end of file
# tf.logging.set_verbosity(tf.logging.INFO)
# tf.app.run()
gen_tfrecords("a")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment