1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cuda_profiler_api.h>
#include "../../IndexFlat.h"
#include "../../IndexIVFPQ.h"
#include "../GpuIndexIVFPQ.h"
#include "../StandardGpuResources.h"
#include "../test/TestUtils.h"
#include "../utils/DeviceUtils.h"
#include "../utils/Timer.h"
#include <gflags/gflags.h>
#include <cstdio>
#include <map>
#include <vector>
// Command-line flags (gflags) for the IVFPQ add benchmark in main().

// Workload size: main() adds `batches` batches of `batch_size` random vectors.
DEFINE_int32(batches, 10, "number of batches of vectors to add");
DEFINE_int32(batch_size, 10000, "number of vectors in each batch");

// Index shape: these four values are passed to the constructors of both the
// CPU IndexIVFPQ and the GpuIndexIVFPQ.
DEFINE_int32(dim, 256, "dimension of vectors");
DEFINE_int32(centroids, 4096, "num coarse centroids to use");
DEFINE_int32(bytes_per_vec, 32, "bytes per encoded vector");
DEFINE_int32(bits_per_code, 8, "bits per PQ code");

// NOTE(review): main() casts this value directly to
// faiss::gpu::IndicesOptions; confirm that the numbering in the help text
// matches the numeric values of that enum.
DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");

// Which indexes get trained and timed. Each index is only trained when its
// flag is set.
DEFINE_bool(time_gpu, true, "time add to GPU");
DEFINE_bool(time_cpu, false, "time add to CPU");

// When set, a timing line is printed for every batch and the periodic
// "Adding batch" progress line is suppressed.
DEFINE_bool(per_batch_time, false, "print per-batch times");

// Only consulted when time_gpu is set: reserves GPU index memory for
// batches * batch_size vectors before the timed loop starts.
DEFINE_bool(reserve_memory, false, "whether or not to pre-reserve memory");
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
cudaProfilerStop();
int dim = FLAGS_dim;
int numCentroids = FLAGS_centroids;
int bytesPerVec = FLAGS_bytes_per_vec;
int bitsPerCode = FLAGS_bits_per_code;
faiss::gpu::StandardGpuResources res;
// IndexIVFPQ will complain, but just give us enough to get through this
int numTrain = 4 * numCentroids;
std::vector<float> trainVecs = faiss::gpu::randVecs(numTrain, dim);
faiss::IndexFlatL2 coarseQuantizer(dim);
faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, dim, numCentroids,
bytesPerVec, bitsPerCode);
if (FLAGS_time_cpu) {
cpuIndex.train(numTrain, trainVecs.data());
}
faiss::gpu::GpuIndexIVFPQConfig config;
config.device = 0;
config.indicesOptions = (faiss::gpu::IndicesOptions) FLAGS_index;
faiss::gpu::GpuIndexIVFPQ gpuIndex(
&res, dim, numCentroids, bytesPerVec, bitsPerCode,
faiss::METRIC_L2, config);
if (FLAGS_time_gpu) {
gpuIndex.train(numTrain, trainVecs.data());
if (FLAGS_reserve_memory) {
size_t numVecs = (size_t) FLAGS_batches * (size_t) FLAGS_batch_size;
gpuIndex.reserveMemory(numVecs);
}
}
cudaDeviceSynchronize();
CUDA_VERIFY(cudaProfilerStart());
float totalGpuTime = 0.0f;
float totalCpuTime = 0.0f;
for (int i = 0; i < FLAGS_batches; ++i) {
if (!FLAGS_per_batch_time) {
if (i % 10 == 0) {
printf("Adding batch %d\n", i + 1);
}
}
auto addVecs = faiss::gpu::randVecs(FLAGS_batch_size, dim);
if (FLAGS_time_gpu) {
faiss::gpu::CpuTimer timer;
gpuIndex.add(FLAGS_batch_size, addVecs.data());
CUDA_VERIFY(cudaDeviceSynchronize());
auto time = timer.elapsedMilliseconds();
totalGpuTime += time;
if (FLAGS_per_batch_time) {
printf("Batch %d | GPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
i + 1, FLAGS_batch_size, time, time / (float) FLAGS_batch_size);
}
}
if (FLAGS_time_cpu) {
faiss::gpu::CpuTimer timer;
cpuIndex.add(FLAGS_batch_size, addVecs.data());
auto time = timer.elapsedMilliseconds();
totalCpuTime += time;
if (FLAGS_per_batch_time) {
printf("Batch %d | CPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
i + 1, FLAGS_batch_size, time, time / (float) FLAGS_batch_size);
}
}
}
CUDA_VERIFY(cudaProfilerStop());
int total = FLAGS_batch_size * FLAGS_batches;
if (FLAGS_time_gpu) {
printf("%d dim, %d centroids, %d x %d encoding\n"
"GPU time to add %d vectors (%d batches, %d per batch): "
"%.3f ms (%.3f us per)\n",
dim, numCentroids, bytesPerVec, bitsPerCode,
total, FLAGS_batches, FLAGS_batch_size,
totalGpuTime, totalGpuTime * 1000.0f / (float) total);
}
if (FLAGS_time_cpu) {
printf("%d dim, %d centroids, %d x %d encoding\n"
"CPU time to add %d vectors (%d batches, %d per batch): "
"%.3f ms (%.3f us per)\n",
dim, numCentroids, bytesPerVec, bitsPerCode,
total, FLAGS_batches, FLAGS_batch_size,
totalCpuTime, totalCpuTime * 1000.0f / (float) total);
}
return 0;
}