13 #include <cuda_profiler_api.h> 
   14 #include "../../IndexFlat.h" 
   15 #include "../../IndexIVFPQ.h" 
   16 #include "../GpuIndexIVFPQ.h" 
   17 #include "../StandardGpuResources.h" 
   18 #include "../test/TestUtils.h" 
   19 #include "../utils/DeviceUtils.h" 
   20 #include "../utils/Timer.h" 
   21 #include <gflags/gflags.h> 
   25 DEFINE_int32(batches, 10, 
"number of batches of vectors to add");
 
   26 DEFINE_int32(batch_size, 10000, 
"number of vectors in each batch");
 
   27 DEFINE_int32(dim, 256, 
"dimension of vectors");
 
   28 DEFINE_int32(centroids, 4096, 
"num coarse centroids to use");
 
   29 DEFINE_int32(bytes_per_vec, 32, 
"bytes per encoded vector");
 
   30 DEFINE_int32(bits_per_code, 8, 
"bits per PQ code");
 
   31 DEFINE_int32(index, 2, 
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
 
   32 DEFINE_bool(time_gpu, 
true, 
"time add to GPU");
 
   33 DEFINE_bool(time_cpu, 
false, 
"time add to CPU");
 
   34 DEFINE_bool(per_batch_time, 
false, 
"print per-batch times");
 
   35 DEFINE_bool(reserve_memory, 
false, 
"whether or not to pre-reserve memory");
 
   37 int main(
int argc, 
char** argv) {
 
   38   google::ParseCommandLineFlags(&argc, &argv, 
true);
 
   43   int numCentroids = FLAGS_centroids;
 
   44   int bytesPerVec = FLAGS_bytes_per_vec;
 
   45   int bitsPerCode = FLAGS_bits_per_code;
 
   50   int numTrain = 4 * numCentroids;
 
   51   std::vector<float> trainVecs = faiss::gpu::randVecs(numTrain, dim);
 
   55                              bytesPerVec, bitsPerCode);
 
   57     cpuIndex.train(numTrain, trainVecs.data());
 
   62     dim, numCentroids, bytesPerVec, bitsPerCode,
 
   64     (faiss::gpu::IndicesOptions) FLAGS_index,
 
   69     gpuIndex.train(numTrain, trainVecs.data());
 
   70     if (FLAGS_reserve_memory) {
 
   71       size_t numVecs = (size_t) FLAGS_batches * (
size_t) FLAGS_batch_size;
 
   72       gpuIndex.reserveMemory(numVecs);
 
   76   cudaDeviceSynchronize();
 
   77   CUDA_VERIFY(cudaProfilerStart());
 
   79   float totalGpuTime = 0.0f;
 
   80   float totalCpuTime = 0.0f;
 
   82   for (
int i = 0; i < FLAGS_batches; ++i) {
 
   83     if (!FLAGS_per_batch_time) {
 
   85         printf(
"Adding batch %d\n", i + 1);
 
   89     auto addVecs = faiss::gpu::randVecs(FLAGS_batch_size, dim);
 
   93       gpuIndex.add(FLAGS_batch_size, addVecs.data());
 
   94       CUDA_VERIFY(cudaDeviceSynchronize());
 
   99       if (FLAGS_per_batch_time) {
 
  100       printf(
"Batch %d | GPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
 
  101              i + 1, FLAGS_batch_size, time, time / (
float) FLAGS_batch_size);
 
  105     if (FLAGS_time_cpu) {
 
  107       cpuIndex.add(FLAGS_batch_size, addVecs.data());
 
  110       totalCpuTime += time;
 
  112       if (FLAGS_per_batch_time) {
 
  113         printf(
"Batch %d | CPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
 
  114                i + 1, FLAGS_batch_size, time, time / (
float) FLAGS_batch_size);
 
  119   CUDA_VERIFY(cudaProfilerStop());
 
  121   int total = FLAGS_batch_size * FLAGS_batches;
 
  123   if (FLAGS_time_gpu) {
 
  124     printf(
"%d dim, %d centroids, %d x %d encoding\n" 
  125            "GPU time to add %d vectors (%d batches, %d per batch): " 
  126            "%.3f ms (%.3f us per)\n",
 
  127            dim, numCentroids, bytesPerVec, bitsPerCode,
 
  128            total, FLAGS_batches, FLAGS_batch_size,
 
  129            totalGpuTime, totalGpuTime * 1000.0f / (
float) total);
 
  132   if (FLAGS_time_cpu) {
 
  133     printf(
"%d dim, %d centroids, %d x %d encoding\n" 
  134            "CPU time to add %d vectors (%d batches, %d per batch): " 
  135            "%.3f ms (%.3f us per)\n",
 
  136            dim, numCentroids, bytesPerVec, bitsPerCode,
 
  137            total, FLAGS_batches, FLAGS_batch_size,
 
  138            totalCpuTime, totalCpuTime * 1000.0f / (
float) total);
 
// float elapsedMilliseconds()
// Returns elapsed time in milliseconds.
// CPU wall-clock elapsed timer.