import time

import tensorflow.compat.v1 as tf

tf.logging.set_verbosity(tf.logging.INFO)

DATA_DIR = '/data/files/wideAndDeep/trainData/'


def input_fn(csv_path, epoch, shuffle, batch_size):
    dataset = tf.data.TextLineDataset(csv_path)

    def parse_line(line_tensor):
        # Each line is '|'-separated; multi-category fields are further
        # ','-separated. splits[1] is unused and splits[5] is the label.
        splits = tf.compat.v1.string_split([line_tensor], delimiter='|', skip_empty=False).values
        features = {
            'ITEM_CATEGORY_card_id': splits[0],
            'USER_CATEGORY_device_id': splits[2],
            'USER_CATEGORY_os': splits[3],
            'USER_CATEGORY_user_city_id': splits[4],
            'USER_MULTI_CATEGORY_second_solutions': tf.compat.v1.string_split([splits[6]], delimiter=',').values,
            'USER_MULTI_CATEGORY_second_demands': tf.compat.v1.string_split([splits[7]], delimiter=',').values,
            'USER_MULTI_CATEGORY_second_positions': tf.compat.v1.string_split([splits[8]], delimiter=',').values,
            'USER_MULTI_CATEGORY_projects': tf.compat.v1.string_split([splits[9]], delimiter=',').values,
            'ITEM_NUMERIC_click_count_sum': tf.compat.v1.string_to_number(splits[10]),
            'ITEM_NUMERIC_click_count_avg': tf.compat.v1.string_to_number(splits[11]),
            'ITEM_NUMERIC_click_count_stddev': tf.compat.v1.string_to_number(splits[12]),
            'ITEM_NUMERIC_exp_count_sum': tf.compat.v1.string_to_number(splits[13]),
            'ITEM_NUMERIC_exp_count_avg': tf.compat.v1.string_to_number(splits[14]),
            'ITEM_NUMERIC_exp_count_stddev': tf.compat.v1.string_to_number(splits[15]),
            'ITEM_NUMERIC_discount': tf.compat.v1.string_to_number(splits[16]),
            'ITEM_NUMERIC_case_count': tf.compat.v1.string_to_number(splits[17]),
            'ITEM_NUMERIC_sales_count': tf.compat.v1.string_to_number(splits[18]),
            'ITEM_CATEGORY_service_type': splits[19],
            'ITEM_CATEGORY_merchant_id': splits[20],
            'ITEM_CATEGORY_doctor_type': splits[21],
            'ITEM_CATEGORY_doctor_id': splits[22],
            'ITEM_CATEGORY_doctor_famous': splits[23],
            'ITEM_CATEGORY_hospital_id': splits[24],
            'ITEM_CATEGORY_hospital_city_tag_id': splits[25],
            'ITEM_CATEGORY_hospital_type': splits[26],
            'ITEM_CATEGORY_hospital_is_high_quality': splits[27],
            'ITEM_MULTI_CATEGORY_second_demands': tf.compat.v1.string_split([splits[28]], delimiter=',').values,
            'ITEM_MULTI_CATEGORY_second_solutions': tf.compat.v1.string_split([splits[29]], delimiter=',').values,
            'ITEM_MULTI_CATEGORY_second_positions': tf.compat.v1.string_split([splits[30]], delimiter=',').values,
            'ITEM_MULTI_CATEGORY_projects': tf.compat.v1.string_split([splits[31]], delimiter=',').values,
            'ITEM_NUMERIC_sku_price': tf.compat.v1.string_to_number(splits[32]),
        }
        return features, tf.compat.v1.string_to_number(splits[5])

    # Scalar fields pad to shape (); variable-length multi-category fields pad
    # to the longest sequence in each batch ([-1]).
    padded_shapes = ({
        'ITEM_CATEGORY_card_id': (),
        'USER_CATEGORY_device_id': (),
        'USER_CATEGORY_os': (),
        'USER_CATEGORY_user_city_id': (),
        'USER_MULTI_CATEGORY_second_solutions': [-1],
        'USER_MULTI_CATEGORY_second_demands': [-1],
        'USER_MULTI_CATEGORY_second_positions': [-1],
        'USER_MULTI_CATEGORY_projects': [-1],
        'ITEM_NUMERIC_click_count_sum': (),
        'ITEM_NUMERIC_click_count_avg': (),
        'ITEM_NUMERIC_click_count_stddev': (),
        'ITEM_NUMERIC_exp_count_sum': (),
        'ITEM_NUMERIC_exp_count_avg': (),
        'ITEM_NUMERIC_exp_count_stddev': (),
        'ITEM_NUMERIC_discount': (),
        'ITEM_NUMERIC_case_count': (),
        'ITEM_NUMERIC_sales_count': (),
        'ITEM_CATEGORY_service_type': (),
        'ITEM_CATEGORY_merchant_id': (),
        'ITEM_CATEGORY_doctor_type': (),
        'ITEM_CATEGORY_doctor_id': (),
        'ITEM_CATEGORY_doctor_famous': (),
        'ITEM_CATEGORY_hospital_id': (),
        'ITEM_CATEGORY_hospital_city_tag_id': (),
        'ITEM_CATEGORY_hospital_type': (),
        'ITEM_CATEGORY_hospital_is_high_quality': (),
        'ITEM_MULTI_CATEGORY_second_demands': [-1],
        'ITEM_MULTI_CATEGORY_second_solutions': [-1],
        'ITEM_MULTI_CATEGORY_second_positions': [-1],
        'ITEM_MULTI_CATEGORY_projects': [-1],
        'ITEM_NUMERIC_sku_price': (),
    }, ())

    # String fields pad with the sentinel '-1'; numeric fields pad with 0.0.
    padding_values = ({
        'ITEM_CATEGORY_card_id': '-1',
        'USER_CATEGORY_device_id': '-1',
        'USER_CATEGORY_os': '-1',
        'USER_CATEGORY_user_city_id': '-1',
        'USER_MULTI_CATEGORY_second_solutions': '-1',
        'USER_MULTI_CATEGORY_second_demands': '-1',
        'USER_MULTI_CATEGORY_second_positions': '-1',
        'USER_MULTI_CATEGORY_projects': '-1',
        'ITEM_NUMERIC_click_count_sum': 0.0,
        'ITEM_NUMERIC_click_count_avg': 0.0,
        'ITEM_NUMERIC_click_count_stddev': 0.0,
        'ITEM_NUMERIC_exp_count_sum': 0.0,
        'ITEM_NUMERIC_exp_count_avg': 0.0,
        'ITEM_NUMERIC_exp_count_stddev': 0.0,
        'ITEM_NUMERIC_discount': 0.0,
        'ITEM_NUMERIC_case_count': 0.0,
        'ITEM_NUMERIC_sales_count': 0.0,
        'ITEM_CATEGORY_service_type': '-1',
        'ITEM_CATEGORY_merchant_id': '-1',
        'ITEM_CATEGORY_doctor_type': '-1',
        'ITEM_CATEGORY_doctor_id': '-1',
        'ITEM_CATEGORY_doctor_famous': '-1',
        'ITEM_CATEGORY_hospital_id': '-1',
        'ITEM_CATEGORY_hospital_city_tag_id': '-1',
        'ITEM_CATEGORY_hospital_type': '-1',
        'ITEM_CATEGORY_hospital_is_high_quality': '-1',
        'ITEM_MULTI_CATEGORY_second_demands': '-1',
        'ITEM_MULTI_CATEGORY_second_solutions': '-1',
        'ITEM_MULTI_CATEGORY_second_positions': '-1',
        'ITEM_MULTI_CATEGORY_projects': '-1',
        'ITEM_NUMERIC_sku_price': 0.0,
    }, 0.0)

    # Parse in parallel and cache the parsed (pre-batch) examples in memory.
    dataset = dataset.map(parse_line, num_parallel_calls=8).cache()
    dataset = dataset.padded_batch(batch_size, padded_shapes, padding_values=padding_values)
    if shuffle:
        # Note: shuffle() follows padded_batch(), so it shuffles whole batches,
        # not individual examples.
        dataset = dataset.shuffle(2048)
    # prefetch() also operates post-batching, so this buffers up to
    # 512 * 100 batches.
    return dataset.prefetch(512 * 100).repeat(epoch)


# Benchmark (assumes eager execution; under graph mode, get_next() would only
# keep adding ops to the graph and these loops would never terminate). Drain
# the dataset twice: the first pass parses the CSV and fills the in-memory
# cache, the second pass should be served from the cache.
dataset = input_fn(DATA_DIR + 'eval_samples.csv', 1, False, 2 ** 15)

iter1 = dataset.make_one_shot_iterator()
try:
    start = time.time()
    while True:
        iter1.get_next()
except tf.errors.OutOfRangeError:
    print(time.time() - start)

iter2 = dataset.make_one_shot_iterator()
try:
    start = time.time()
    while True:
        iter2.get_next()
except tf.errors.OutOfRangeError:
    print(time.time() - start)
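
# A minimal alternative sketch for the same measurement, assuming TF2 eager
# execution is active (tf.disable_v2_behavior() has not been called): a
# tf.data.Dataset is directly iterable, which avoids the manual iterator and
# OutOfRangeError handling above. time_one_pass is a hypothetical helper, not
# part of the original script; running it twice on the same dataset object
# relies on cache()'s documented behavior that iterations after the first
# read from the cached data, which is also what the iter1/iter2 comparison
# above assumes.
def time_one_pass(ds):
    start = time.time()
    for _ in ds:  # drain the dataset without touching the elements
        pass
    return time.time() - start

eval_ds = input_fn(DATA_DIR + 'eval_samples.csv', 1, False, 2 ** 15)
print(time_one_pass(eval_ds))  # cold pass: parses every line and fills the cache
print(time_one_pass(eval_ds))  # warm pass: expected to read from the cache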