/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "../../IndexFlat.h"
#include "../GpuIndexFlat.h"
#include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <limits>
#include <sstream>
#include <vector>

// FIXME: figure out a better way to test fp16
constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f;
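
// Illustrative only: a minimal sketch of the kind of relative-error
// measure the thresholds above are meant to bound. The actual comparison
// is performed by faiss::gpu::compareIndices (see ../test/TestUtils.h);
// this helper is a local assumption and is not used by the tests below.
inline float sketchRelativeError(float ref, float test) {
  float diff = ref > test ? ref - test : test - ref;
  float magRef = ref > 0.0f ? ref : -ref;
  float magTest = test > 0.0f ? test : -test;
  float mag = std::max(magRef, magTest);

  return mag > 0.0f ? diff / mag : 0.0f;
}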

struct TestFlatOptions {
  TestFlatOptions()
      : useL2(true),
        useFloat16(false),
        useTransposed(false),
        numVecsOverride(-1),
        numQueriesOverride(-1),
        kOverride(-1) {
  }

  bool useL2;
  bool useFloat16;
  bool useTransposed;
  int numVecsOverride;
  int numQueriesOverride;
  int kOverride;
};

void testFlat(const TestFlatOptions& opt) {
  int numVecs = opt.numVecsOverride > 0 ?
    opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
  int dim = faiss::gpu::randVal(50, 800);
  int numQuery = opt.numQueriesOverride > 0 ?
    opt.numQueriesOverride : faiss::gpu::randVal(1, 512);

  // Due to loss of precision in the float16 accumulator, the number of
  // results that differ from the CPU reference grows quickly with k.
  // Restrict ourselves to a fairly small `k` for float16
  int k = opt.useFloat16 ?
    std::min(faiss::gpu::randVal(1, 50), numVecs) :
    std::min(faiss::gpu::randVal(1, 1024), numVecs);
  if (opt.kOverride > 0) {
    k = opt.kOverride;
  }

  faiss::IndexFlatIP cpuIndexIP(dim);
  faiss::IndexFlatL2 cpuIndexL2(dim);

  faiss::IndexFlat* cpuIndex =
    opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
    (faiss::IndexFlat*) &cpuIndexIP;

  // Construct on a random device to test multi-device, if we have
  // multiple devices
  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);

  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = device;
  config.useFloat16 = opt.useFloat16;
  config.storeTransposed = opt.useTransposed;

  faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
  faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);

  faiss::gpu::GpuIndexFlat* gpuIndex =
    opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
    (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;

  std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
  cpuIndex->add(numVecs, vecs.data());
  gpuIndex->add(numVecs, vecs.data());

  std::stringstream str;
  str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
      << " dim " << dim
      << " useFloat16 " << opt.useFloat16
      << " transposed " << opt.useTransposed
      << " numQuery " << numQuery
      << " k " << k;

  // For float16, the test depends to some extent on the relative-error
  // tolerance
  faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
                             opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
                             // FIXME: the fp16 bounds are
                             // useless when math (the accumulator) is
                             // in fp16. Figure out another way to test
                             opt.useFloat16 ? 0.99f : 0.1f,
                             opt.useFloat16 ? 0.65f : 0.015f);
}
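
// Illustrative only: a minimal sketch of the CPU-vs-GPU comparison pattern
// that testFlat relies on via faiss::gpu::compareIndices. It runs the same
// queries through both indexes and checks that the top-1 neighbor agrees;
// the real helper applies relative-error and difference-rate thresholds
// rather than exact equality. This function is a local assumption and is
// not called by any test in this file.
inline void sketchCompareTopOne(faiss::IndexFlat& cpuIndex,
                                faiss::gpu::GpuIndexFlat& gpuIndex,
                                int numQuery, int dim, int k) {
  std::vector<float> queries = faiss::gpu::randVecs(numQuery, dim);

  std::vector<float> cpuDist(numQuery * k);
  std::vector<faiss::Index::idx_t> cpuInd(numQuery * k);
  cpuIndex.search(numQuery, queries.data(), k, cpuDist.data(), cpuInd.data());

  std::vector<float> gpuDist(numQuery * k);
  std::vector<faiss::Index::idx_t> gpuInd(numQuery * k);
  gpuIndex.search(numQuery, queries.data(), k, gpuDist.data(), gpuInd.data());

  for (int q = 0; q < numQuery; ++q) {
    // In float32 the nearest neighbor typically matches exactly; float16
    // and transposed storage may legitimately reorder near-ties
    EXPECT_EQ(cpuInd[q * k], gpuInd[q * k]);
  }
}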

TEST(TestGpuIndexFlat, IP_Float32) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = false;
    opt.useFloat16 = false;
    opt.useTransposed = false;

    testFlat(opt);

    opt.useTransposed = true;
    testFlat(opt);
  }
}

TEST(TestGpuIndexFlat, L2_Float32) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = true;
    opt.useFloat16 = false;
    opt.useTransposed = false;

    testFlat(opt);

    opt.useTransposed = true;
    testFlat(opt);
  }
}

// test specialized k == 1 codepath
TEST(TestGpuIndexFlat, L2_Float32_K1) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = true;
    opt.useFloat16 = false;
    opt.useTransposed = false;
    opt.kOverride = 1;

    testFlat(opt);
  }
}

TEST(TestGpuIndexFlat, IP_Float16) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = false;
    opt.useFloat16 = true;
    opt.useTransposed = false;

    testFlat(opt);

    opt.useTransposed = true;
    testFlat(opt);
  }
}

TEST(TestGpuIndexFlat, L2_Float16) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = true;
    opt.useFloat16 = true;
    opt.useTransposed = false;

    testFlat(opt);

    opt.useTransposed = true;
    testFlat(opt);
  }
}

// test specialized k == 1 codepath
TEST(TestGpuIndexFlat, L2_Float16_K1) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = true;
    opt.useFloat16 = true;
    opt.useTransposed = false;
    opt.kOverride = 1;

    testFlat(opt);
  }
}

// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
  for (int tries = 0; tries < 3; ++tries) {
    faiss::gpu::newTestSeed();

    TestFlatOptions opt;
    opt.useL2 = true;
    opt.useFloat16 = false;
    opt.useTransposed = false;
    opt.numVecsOverride = 1000000;
    opt.numQueriesOverride = 8;

    testFlat(opt);

    opt.useTransposed = true;
    testFlat(opt);
  }
}
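
// Illustrative only: the test above exercises the index's internal tiling
// over a very large vector set. For contrast, this is a caller-side sketch
// of batching a large query set instead; the helper and its batchSize
// parameter are local assumptions and are not used by the tests.
inline void sketchSearchInQueryBatches(faiss::gpu::GpuIndexFlat& index,
                                       int numQuery, int dim, int k,
                                       const float* queries,
                                       float* distances,
                                       faiss::Index::idx_t* labels,
                                       int batchSize) {
  for (int i = 0; i < numQuery; i += batchSize) {
    int cur = std::min(batchSize, numQuery - i);

    index.search(cur,
                 queries + (size_t) i * dim,
                 k,
                 distances + (size_t) i * k,
                 labels + (size_t) i * k);
  }
}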

TEST(TestGpuIndexFlat, QueryEmpty) {
  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = 0;
  config.useFloat16 = false;
  config.storeTransposed = false;

  int dim = 128;
  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);

  // Querying an empty index should not blow up, and just return
  // (FLT_MAX, -1)
  int numQuery = 10;
  int k = 50;
  std::vector<float> queries(numQuery * dim, 1.0f);

  std::vector<float> dist(numQuery * k, 0);
  std::vector<faiss::Index::idx_t> ind(numQuery * k);

  gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data());

  for (auto d : dist) {
    EXPECT_EQ(d, std::numeric_limits<float>::max());
  }

  for (auto i : ind) {
    EXPECT_EQ(i, -1);
  }
}

TEST(TestGpuIndexFlat, CopyFrom) {
  faiss::gpu::newTestSeed();

  int numVecs = faiss::gpu::randVal(100, 200);
  int dim = faiss::gpu::randVal(1, 1000);

  faiss::IndexFlatL2 cpuIndex(dim);

  std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
  cpuIndex.add(numVecs, vecs.data());

  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();

  // Construct on a random device to test multi-device, if we have
  // multiple devices
  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = device;
  config.useFloat16 = false;
  config.storeTransposed = false;

  // Construct with a garbage dimension; copyFrom should overwrite it
  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config);
  gpuIndex.copyFrom(&cpuIndex);

  EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal);
  EXPECT_EQ(gpuIndex.ntotal, numVecs);

  EXPECT_EQ(cpuIndex.d, gpuIndex.d);
  EXPECT_EQ(cpuIndex.d, dim);

  int idx = faiss::gpu::randVal(0, numVecs - 1);

  std::vector<float> gpuVals(dim);
  gpuIndex.reconstruct(idx, gpuVals.data());

  std::vector<float> cpuVals(dim);
  cpuIndex.reconstruct(idx, cpuVals.data());

  EXPECT_EQ(gpuVals, cpuVals);
}

TEST(TestGpuIndexFlat, CopyTo) {
  faiss::gpu::newTestSeed();

  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();

  int numVecs = faiss::gpu::randVal(100, 200);
  int dim = faiss::gpu::randVal(1, 1000);

  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = device;
  config.useFloat16 = false;
  config.storeTransposed = false;

  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);

  std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
  gpuIndex.add(numVecs, vecs.data());

  // Construct with a garbage dimension; copyTo should overwrite it
  faiss::IndexFlatL2 cpuIndex(2000);
  gpuIndex.copyTo(&cpuIndex);

  EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal);
  EXPECT_EQ(gpuIndex.ntotal, numVecs);

  EXPECT_EQ(cpuIndex.d, gpuIndex.d);
  EXPECT_EQ(cpuIndex.d, dim);

  int idx = faiss::gpu::randVal(0, numVecs - 1);

  std::vector<float> gpuVals(dim);
  gpuIndex.reconstruct(idx, gpuVals.data());

  std::vector<float> cpuVals(dim);
  cpuIndex.reconstruct(idx, cpuVals.data());

  EXPECT_EQ(gpuVals, cpuVals);
}

TEST(TestGpuIndexFlat, UnifiedMemory) {
  // Construct on a random device to test multi-device, if we have
  // multiple devices
  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);

  if (!faiss::gpu::getFullUnifiedMemSupport(device)) {
    return;
  }

  int dim = 256;

  // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to
  // kernel indexing, so we can't test unified memory for memory
  // oversubscription.
  size_t numVecs = 50000;
  int numQuery = 10;
  int k = 10;

  faiss::IndexFlatL2 cpuIndexL2(dim);

  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = device;
  config.memorySpace = faiss::gpu::MemorySpace::Unified;

  faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);

  std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
  cpuIndexL2.add(numVecs, vecs.data());
  gpuIndexL2.add(numVecs, vecs.data());

  // This test uses float32 storage, so compare with the float32 tolerances
  faiss::gpu::compareIndices(cpuIndexL2, gpuIndexL2,
                             numQuery, dim, k, "Unified Memory",
                             kF32MaxRelErr,
                             0.1f,
                             0.015f);
}