Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
PerfFlat.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 #include "../../IndexFlat.h"
11 #include "../../utils.h"
12 #include "../GpuIndexFlat.h"
13 #include "IndexWrapper.h"
14 #include "../test/TestUtils.h"
15 #include "../utils/DeviceTensor.cuh"
16 #include "../utils/DeviceUtils.h"
17 #include "../utils/HostTensor.cuh"
18 #include "../utils/Timer.h"
19 #include <gflags/gflags.h>
20 #include <map>
21 #include <memory>
22 #include <vector>
23 
24 #include <cuda_profiler_api.h>
25 
26 DEFINE_bool(l2, true, "L2 or inner product");
27 DEFINE_int32(k, 3, "final number of closest results returned");
28 DEFINE_int32(num, 128, "# of vecs");
29 DEFINE_int32(dim, 128, "# of dimensions");
30 DEFINE_int32(num_queries, 3, "number of query vectors");
31 DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
32 DEFINE_bool(use_float16, false, "use encodings in float16");
33 DEFINE_bool(use_float16_math, false, "perform math in float16");
34 DEFINE_bool(transposed, false, "store vectors transposed");
35 DEFINE_int64(seed, -1, "specify random seed");
36 DEFINE_int32(num_gpus, 1, "number of gpus to use");
37 DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
38 DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
39 DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
40 
41 using namespace faiss::gpu;
42 
43 int main(int argc, char** argv) {
44  gflags::ParseCommandLineFlags(&argc, &argv, true);
45 
46  cudaProfilerStop();
47 
48  auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr);
49  printf("using seed %ld\n", seed);
50 
51  auto numQueries = FLAGS_num_queries;
52 
53  auto index = std::unique_ptr<faiss::IndexFlat>(
54  new faiss::IndexFlat(FLAGS_dim, FLAGS_l2 ?
55  faiss::METRIC_L2 : faiss::METRIC_INNER_PRODUCT));
56 
57  HostTensor<float, 2, true> vecs({FLAGS_num, FLAGS_dim});
58  faiss::float_rand(vecs.data(), vecs.numElements(), seed);
59 
60  index->add(FLAGS_num, vecs.data());
61 
62  printf("Database: dim %d num vecs %d\n", FLAGS_dim, FLAGS_num);
63  printf("%s lookup: %d queries, total k %d\n",
64  FLAGS_l2 ? "L2" : "IP",
65  numQueries, FLAGS_k);
66  printf("float16 encoding %s\n", FLAGS_use_float16 ? "enabled" : "disabled");
67  printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled");
68 
69  // Convert to GPU index
70  printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
71 
72  auto initFn = [&index](faiss::gpu::GpuResources* res, int dev) ->
73  std::unique_ptr<faiss::gpu::GpuIndexFlat> {
74  ((faiss::gpu::StandardGpuResources*) res)->setPinnedMemory(
75  FLAGS_pinned_mem);
76 
77  GpuIndexFlatConfig config;
78  config.device = dev;
79  config.useFloat16 = FLAGS_use_float16;
80  config.useFloat16Accumulator = FLAGS_use_float16_math;
81  config.storeTransposed = FLAGS_transposed;
82  config.memorySpace = FLAGS_use_unified_mem ?
83  MemorySpace::Unified : MemorySpace::Device;
84 
85  auto p = std::unique_ptr<faiss::gpu::GpuIndexFlat>(
86  new faiss::gpu::GpuIndexFlat(res, index.get(), config));
87  return p;
88  };
89 
90  IndexWrapper<faiss::gpu::GpuIndexFlat> gpuIndex(FLAGS_num_gpus, initFn);
91  printf("copy done\n");
92 
93  // Build query vectors
94  HostTensor<float, 2, true> cpuQuery({numQueries, FLAGS_dim});
95  faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
96 
97  // Time faiss CPU
98  HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
99  HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
100 
101  if (FLAGS_cpu) {
102  float cpuTime = 0.0f;
103 
104  CpuTimer timer;
105  index->search(numQueries,
106  cpuQuery.data(),
107  FLAGS_k,
108  cpuDistances.data(),
109  cpuIndices.data());
110 
111  cpuTime = timer.elapsedMilliseconds();
112  printf("CPU time %.3f ms\n", cpuTime);
113  }
114 
115  HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
116  HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
117 
118  CUDA_VERIFY(cudaProfilerStart());
119  faiss::gpu::synchronizeAllDevices();
120 
121  float gpuTime = 0.0f;
122 
123  // Time GPU
124  {
125  CpuTimer timer;
126 
127  gpuIndex.getIndex()->search(cpuQuery.getSize(0),
128  cpuQuery.data(),
129  FLAGS_k,
130  gpuDistances.data(),
131  gpuIndices.data());
132 
133  // There is a device -> host copy above, so no need to time
134  // additional synchronization with the GPU
135  gpuTime = timer.elapsedMilliseconds();
136  }
137 
138  CUDA_VERIFY(cudaProfilerStop());
139  printf("GPU time %.3f ms\n", gpuTime);
140 
141  if (FLAGS_cpu) {
142  compareLists(cpuDistances.data(), cpuIndices.data(),
143  gpuDistances.data(), gpuIndices.data(),
144  numQueries, FLAGS_k,
145  "", true, FLAGS_diff, false);
146  }
147 
148  CUDA_VERIFY(cudaDeviceSynchronize());
149 
150  return 0;
151 }
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
Definition: Timer.cpp:51
CpuTimer: CPU wallclock elapsed timer.
Definition: Timer.h:41
bool useFloat16
Whether or not data is stored as float16.
Definition: GpuIndexFlat.h:34
int device
GPU device on which the index is resident.
Definition: GpuIndex.h:26
MemorySpace memorySpace
Definition: GpuIndex.h:31