# down_samples_and_vocab.py (1.42 KB)
import json
import os
import subprocess
import sys

import redis

def getRedisConn(host="172.16.50.145", password="XfkMCCdWDIU%ls$h", port=6379, db=0):
    """Return a Redis client backed by a newly created connection pool.

    Every argument defaults to the server this script was written against,
    so ``getRedisConn()`` with no arguments behaves exactly as before. Pass
    other values to point the script at a different instance (for example a
    test server) without editing this function.

    Args:
        host: Address of the Redis server.
        password: Password used to authenticate with the server.
        port: TCP port of the Redis server.
        db: Index of the Redis database to select.

    Returns:
        A ``redis.Redis`` client bound to its own ``redis.ConnectionPool``.
    """
    # NOTE(review): the default password is committed in source control;
    # consider supplying it from configuration/environment and rotating it.
    pool = redis.ConnectionPool(host=host, password=password, port=port, db=db)
    return redis.Redis(connection_pool=pool)

# Output directory: the single command-line argument when exactly one is
# given, otherwise the default training-data location.
save_dir = sys.argv[1] if len(sys.argv) == 2 else '/data/files/wideAndDeep/trainData/'

print('save_dir: ', save_dir)
if not os.path.exists(save_dir):
    print('mkdir save_dir: ', save_dir)
# exist_ok=True keeps this safe if the directory is created by someone else
# between the check above and this call (e.g. two runs started together).
os.makedirs(save_dir, exist_ok=True)

conn = getRedisConn()

# The first element of the "strategy:all:vocab" list holds a Python
# expression that evaluates to the collection of per-field vocab key names.
vocab_key_entries = conn.lrange("strategy:all:vocab", 0, -1)
if not vocab_key_entries:
    # Fail with an explicit message instead of a bare IndexError below.
    raise RuntimeError('Redis list "strategy:all:vocab" is missing or empty')
print("vocab_keys: ", vocab_key_entries[0])

# SECURITY(review): eval() executes whatever expression is stored in Redis.
# If the stored value is always a plain literal, switch to
# ast.literal_eval (or store JSON and use json.loads) once confirmed.
vocab_keys = eval(vocab_key_entries[0])



# Export each vocab list to "<field>_vocab.csv" in save_dir, one entry per line.
for vocab_key in vocab_keys:
    print('vocab_key: ', vocab_key)
    # The second ":"-separated component of the key names the output file.
    field = vocab_key.split(":")[1]
    filename = field + "_vocab.csv"
    print('filename: ', filename)

    # Fetch and parse BEFORE opening the file: opening with 'w' truncates it,
    # so a Redis or parsing failure would otherwise leave an empty vocab file.
    stored = conn.lrange(vocab_key, 0, -1)
    if not stored:
        raise RuntimeError('Redis list {!r} is missing or empty'.format(vocab_key))

    # SECURITY(review): eval() executes whatever expression is stored in Redis.
    # If the stored value is always a plain literal, switch to
    # ast.literal_eval (or store JSON and use json.loads) once confirmed.
    texts = [text for text in eval(stored[0]) if text != '']
    print('texts: ', len(texts))

    # Explicit UTF-8 so the output does not depend on the process locale
    # (a non-UTF-8 locale would fail or mis-encode non-ASCII vocab entries).
    with open(os.path.join(save_dir, filename), 'w', encoding='utf-8') as f:
        f.write('\n'.join(texts))

# Merge the train/eval sample sets stored under /strategy on HDFS into
# single local CSV files inside save_dir.
for sample_set in ("train_samples", "eval_samples"):
    # os.path.join so that a save_dir given without a trailing slash still
    # yields a file inside the directory rather than a sibling path.
    local_csv = os.path.join(save_dir, sample_set + ".csv")
    # Argument list (no shell): spaces or shell metacharacters in save_dir
    # cannot break or alter the command.
    getmerge_cmd = ["hdfs", "dfs", "-getmerge", "/strategy/" + sample_set, local_csv]
    try:
        exit_code = subprocess.call(getmerge_cmd)
    except OSError as err:
        # Raised when the hdfs client cannot be launched at all.
        print('getmerge could not start: ', sample_set, err)
        continue
    if exit_code != 0:
        # Report the failure instead of silently ignoring the exit status;
        # the remaining download is still attempted.
        print('getmerge failed: ', sample_set, exit_code)