Commit fe446e41 authored by matthijs

bugfixes

parent 250a3d3f
@@ -431,7 +431,8 @@ void ParameterSpace::set_index_parameter (
// and fall through to also enable it on sub-indexes // and fall through to also enable it on sub-indexes
} }
if (DC (IndexPreTransform)) { if (DC (IndexPreTransform)) {
index = ix->index; set_index_parameter (ix->index, name, val);
return;
} }
if (DC (IndexShards)) { if (DC (IndexShards)) {
// call on all sub-indexes // call on all sub-indexes
@@ -440,30 +441,28 @@ void ParameterSpace::set_index_parameter (
} }
return; return;
} }
if (name == "verbose") {
index->verbose = int(val);
// in case it was an IndexPreTransform
}
if (DC (IndexRefineFlat)) { if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") { if (name == "k_factor_rf") {
ix->k_factor = int(val); ix->k_factor = int(val);
return; return;
} }
index = ix->base_index; // otherwise it is for the sub-index
} set_index_parameter (&ix->refine_index, name, val);
if (DC (IndexPreTransform)) { return;
index = ix->index;
} }
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
return; // last verbose that we could find return; // last verbose that we could find
} }
if (name == "nprobe") { if (name == "nprobe") {
if ( DC(IndexIVF)) { if ( DC(IndexIVF)) {
ix->nprobe = int(val); ix->nprobe = int(val);
return; return;
} }
} }
if (name == "ht") { if (name == "ht") {
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
@@ -685,7 +684,6 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok; tok;
tok = strtok_r (nullptr, " ,", &ptr)) { tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2; int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok); std::string stok(tok);
// to avoid mem leaks with exceptions: // to avoid mem leaks with exceptions:
@@ -776,10 +774,9 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) { } else if (!index && (sscanf (tok, "PQ%d", &M) == 1 ||
std::string soption = option; sscanf (tok, "PQ%dnp", &M) == 1)) {
// np to disable polysemous trainign bool do_polysemous_training = stok.find("np") == std::string::npos;
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) { if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ ( IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8); coarse_quantizer, d, ncentroids, M, 8);
@@ -789,11 +786,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_ivf->do_polysemous_training = soption != "np"; index_ivf->do_polysemous_training = do_polysemous_training;
index_1 = index_ivf; index_1 = index_ivf;
} else { } else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric); IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = soption != "np"; index_pq->do_polysemous_training = do_polysemous_training;
index_1 = index_pq; index_1 = index_pq;
} }
......
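The two AutoTune.cpp changes above make set_index_parameter recurse into IndexPreTransform and IndexRefineFlat sub-indexes, and restrict the PQ factory token to "PQ%d" with an optional "np" suffix that disables polysemous training. A minimal sketch of how this behaves, mirroring the Python tests added at the end of this commit; the flat header names (and IndexPreTransform living in VectorTransform.h) are assumptions about this revision's layout:

#include <cassert>
#include "AutoTune.h"        // index_factory, ParameterSpace
#include "IndexPQ.h"
#include "IndexIVFPQ.h"
#include "VectorTransform.h" // IndexPreTransform (assumed location)

int main() {
    // the "np" suffix turns polysemous training off, plain "PQ4" keeps it on
    faiss::Index *pq = faiss::index_factory(12, "PQ4np");
    assert(!dynamic_cast<faiss::IndexPQ*>(pq)->do_polysemous_training);

    // with the recursion fix, "nprobe" reaches the IVF index even when it is
    // wrapped in the IndexPreTransform produced by "PCAR8"
    faiss::Index *index = faiss::index_factory(12, "PCAR8,IVF10,PQ4");
    faiss::ParameterSpace().set_index_parameter(index, "nprobe", 3);
    faiss::IndexPreTransform *pre =
        dynamic_cast<faiss::IndexPreTransform*>(index);
    assert(dynamic_cast<faiss::IndexIVFPQ*>(pre->index)->nprobe == 3);

    delete pq;
    delete index;
    return 0;
}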
@@ -260,7 +260,6 @@ static size_t polysemous_inner_loop (
void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k, void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const float *distances, idx_t *labels) const
{ {
FAISS_THROW_IF_NOT (pq.code_size % 8 == 0);
FAISS_THROW_IF_NOT (pq.byte_per_idx == 1); FAISS_THROW_IF_NOT (pq.byte_per_idx == 1);
// PQ distance tables // PQ distance tables
@@ -319,12 +318,17 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
break; break;
default: default:
if (pq.code_size % 8 == 0) if (pq.code_size % 8 == 0) {
n_pass += polysemous_inner_loop<HammingComputerM8> n_pass += polysemous_inner_loop<HammingComputerM8>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
else } else if (pq.code_size % 4 == 0) {
n_pass += polysemous_inner_loop<HammingComputerM4> n_pass += polysemous_inner_loop<HammingComputerM4>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
} else {
FAISS_THROW_FMT(
"code size %zd not supported for polysemous",
pq.code_size);
}
break; break;
} }
} else { } else {
@@ -342,8 +346,14 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
break; break;
default: default:
n_pass += polysemous_inner_loop<GenHammingComputerM8> if (pq.code_size % 8 == 0) {
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); n_pass += polysemous_inner_loop<GenHammingComputerM8>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
} else {
FAISS_THROW_FMT(
"code size %zd not supported for polysemous",
pq.code_size);
}
break; break;
} }
} }
......
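The polysemous dispatch above now falls back to HammingComputerM4 when the code size is a multiple of 4 but not of 8, and raises FAISS_THROW_FMT otherwise. As a rough illustration of what such a 4-byte-stride computer amounts to (a sketch, not the actual FAISS class):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hamming distance over codes whose size is a multiple of 4 bytes:
// XOR the 32-bit words and accumulate their popcounts.
inline int hamming_m4(const uint8_t *a, const uint8_t *b, size_t code_size) {
    int dis = 0;
    for (size_t i = 0; i < code_size; i += 4) {
        uint32_t wa, wb;
        memcpy(&wa, a + i, 4);   // unaligned-safe loads
        memcpy(&wb, b + i, 4);
        dis += __builtin_popcount(wa ^ wb);  // GCC/Clang builtin
    }
    return dis;
}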
@@ -381,20 +381,20 @@ void GpuParameterSpace::set_index_parameter (
set_index_parameter (ix->at(i), name, val); set_index_parameter (ix->at(i), name, val);
return; return;
} }
if (DC (GpuIndexIVF)) { if (name == "nprobe") {
if (name == "nprobe") { if (DC (GpuIndexIVF)) {
ix->setNumProbes (int (val)); ix->setNumProbes (int (val));
return; return;
} }
} }
if(DC (GpuIndexIVFPQ)) { if (name == "use_precomputed_table") {
if (name == "use_precomputed_table") { if (DC (GpuIndexIVFPQ)) {
ix->setPrecomputedCodes(bool (val)); ix->setPrecomputedCodes(bool (val));
return; return;
} }
} }
// maybe norma lindex parameters apply? // maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val); ParameterSpace::set_index_parameter (index, name, val);
} }
......
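GpuParameterSpace::set_index_parameter now keys on the parameter name first and only then downcasts, so names it does not recognize fall through to the CPU-side ParameterSpace (which, with the fix above, also handles wrapped indexes). A host-side usage sketch; the gpu/ header paths and the options-free index_cpu_to_gpu overload are assumptions about this revision:

#include "AutoTune.h"
#include "gpu/StandardGpuResources.h"
#include "gpu/GpuAutoTune.h"   // GpuParameterSpace, index_cpu_to_gpu (assumed location)

int main() {
    faiss::Index *cpu_index = faiss::index_factory(64, "IVF256,PQ8");
    faiss::gpu::StandardGpuResources res;
    faiss::Index *gpu_index = faiss::gpu::index_cpu_to_gpu(&res, 0, cpu_index);

    faiss::gpu::GpuParameterSpace ps;
    ps.set_index_parameter(gpu_index, "nprobe", 16);               // GpuIndexIVF path
    ps.set_index_parameter(gpu_index, "use_precomputed_table", 1); // GpuIndexIVFPQ path
    ps.set_index_parameter(gpu_index, "verbose", 1);               // falls back to ParameterSpace

    delete gpu_index;
    delete cpu_index;
    return 0;
}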
@@ -29,8 +29,9 @@ GpuIndexIVF::GpuIndexIVF(GpuResources* resources,
nprobe_(1), nprobe_(1),
quantizer_(nullptr) { quantizer_(nullptr) {
#ifndef FAISS_USE_FLOAT16 #ifndef FAISS_USE_FLOAT16
FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16CoarseQuantizer, FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16 &&
"float16 unsupported; need CUDA SDK >= 7.5"); !ivfConfig_.flatConfig.useFloat16Accumulator,
"float16 unsupported; need CUDA SDK >= 7.5");
#endif #endif
init_(); init_();
......
@@ -60,7 +60,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
reserveMemoryVecs_(0), reserveMemoryVecs_(0),
index_(nullptr) { index_(nullptr) {
#ifndef FAISS_USE_FLOAT16 #ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16LookupTables_); FAISS_ASSERT(!config.useFloat16LookupTables);
#endif #endif
verifySettings_(); verifySettings_();
......
@@ -45,10 +45,6 @@ IVFFlat::IVFFlat(GpuResources* resources,
space), space),
l2Distance_(l2Distance), l2Distance_(l2Distance),
useFloat16_(useFloat16) { useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
useFloat16_ = false;
#endif
} }
IVFFlat::~IVFFlat() { IVFFlat::~IVFFlat() {
@@ -95,6 +91,9 @@ IVFFlat::addCodeVectorsFromCpu(int listId,
lengthInBytes, lengthInBytes,
stream, stream,
true /* exact reserved size */); true /* exact reserved size */);
#else
// we are not compiling with float16 support
FAISS_ASSERT(false);
#endif #endif
} else { } else {
listData->append((unsigned char*) vecs, listData->append((unsigned char*) vecs,
......
@@ -506,7 +506,6 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
} }
#else #else
FAISS_ASSERT(!useFloat16Lookup); FAISS_ASSERT(!useFloat16Lookup);
int codeSize = sizeof(float);
#endif #endif
int totalCodeDistancesSize = int totalCodeDistancesSize =
......
@@ -7,16 +7,12 @@
#! /usr/bin/env python2 #! /usr/bin/env python2
import time import time
import libfb.py.mkl # noqa import unittest
import numpy as np import numpy as np
from libfb import testutil
import faiss import faiss
class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase): class EvalIVFPQAccuracy(unittest.TestCase):
def get_dataset(self, small_one=False): def get_dataset(self, small_one=False):
if not small_one: if not small_one:
@@ -110,3 +106,9 @@ class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase):
def test_cpu_to_gpu_IVFFlat(self): def test_cpu_to_gpu_IVFFlat(self):
self.do_cpu_to_gpu('IVF128,Flat') self.do_cpu_to_gpu('IVF128,Flat')
def test_set_gpu_param(self):
index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
faiss.GpuParameterSpace().set_index_parameter(index, "nprobe", 3)
@@ -13,7 +13,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ <= 620 #if __CUDA_ARCH__ <= 700
constexpr int kWarpSize = 32; constexpr int kWarpSize = 32;
#else #else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability #error Unknown __CUDA_ARCH__; please define parameters for compute capability
@@ -25,37 +25,15 @@ constexpr int kWarpSize = 32;
constexpr int kWarpSize = 32; constexpr int kWarpSize = 32;
#endif // !__CUDA_ARCH__ #endif // !__CUDA_ARCH__
// This is a memory barrier for intra-warp writes to shared memory.
__forceinline__ __device__ void warpFence() { __forceinline__ __device__ void warpFence() {
// Technically, memory barriers are required via the CUDA
// programming model, since warp synchronous programming no longer
// is guaranteed.
//
// There are two components to it:
// -a barrier known to the compiler such that the compiler will not
// schedule loads and stores across the barrier;
// -a HW-level barrier that guarantees that writes are seen in the
// proper order
//
// However, __threadfence_block() is a stronger constraint than what
// we really want out of the hardware: a warp-wide barrier.
//
// In current hardware, it appears that warp synchronous programming
// is a reality; by all tests it appears safe and race-free.
//
// However, understandably it may not be in the future (based on
// what Nvidia says in the Kepler guide, it may change depending
// upon compiler/toolchain issues or future hardware).
//
// Removing the fence results in 10%+ faster performance.
// However, we are judicious as to where we insert the fence, so if
// this reality ever changes, uncommenting this will result in CUDA
// programming model-safe ordering again.
//
// FIXME: we should probably qualify as volatile as well, since the
// compiler could technically preserve values across loops? This
// seems very impractical for the compiler to do, however.
#if __CUDA_ARCH__ >= 700
__syncwarp();
#else
// For the time being, assume synchronicity.
// __threadfence_block(); // __threadfence_block();
#endif
} }
} } // namespace } } // namespace
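The comment block removed above explained why warpFence() used to rely on implicit warp-synchronous execution; with CUDA 9 and compute capability 7.0 it now issues __syncwarp(). A small sketch (not from the patch, names are illustrative) of the kind of intra-warp shared-memory exchange the fence is meant to order:

// Assumes one warp per block; uses kWarpSize and warpFence() from this header.
__device__ int warpBroadcastViaSmem(int v, int srcLane) {
  __shared__ int buf;
  int lane = threadIdx.x & (kWarpSize - 1);
  if (lane == srcLane) {
    buf = v;        // one lane publishes its value
  }
  warpFence();      // __syncwarp() on Volta+; lockstep assumption before that
  return buf;       // every lane of the warp reads the published value
}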
@@ -48,10 +48,16 @@ void runConvertToFloat32(float* out,
in, in + num, out, HalfToFloat()); in, in + num, out, HalfToFloat());
} }
half hostFloat2Half(float a) { __half hostFloat2Half(float a) {
half h; #if CUDA_VERSION >= 9000
__half_raw raw;
raw.x = cpu_float2half_rn(a).x;
return __half(raw);
#else
__half h;
h.x = cpu_float2half_rn(a).x; h.x = cpu_float2half_rn(a).x;
return h; return h;
#endif
} }
} } // namespace } } // namespace
......
@@ -149,7 +149,7 @@ DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
return out; return out;
} }
half hostFloat2Half(float v); __half hostFloat2Half(float v);
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
......
@@ -39,9 +39,15 @@ struct Limits<float> {
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
inline __device__ __host__ half kGetHalf(unsigned short v) { inline __device__ __host__ half kGetHalf(unsigned short v) {
#if CUDA_VERSION >= 9000
__half_raw h;
h.x = v;
return __half(h);
#else
half h; half h;
h.x = v; h.x = v;
return h; return h;
#endif
} }
template <> template <>
......
@@ -12,6 +12,12 @@
#include "Float16.cuh" #include "Float16.cuh"
#ifndef __HALF2_TO_UI
// cuda_fp16.hpp doesn't export this
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
#endif
// //
// Templated wrappers to express load/store for different scalar and vector // Templated wrappers to express load/store for different scalar and vector
// types, so kernels can have the same written form but can operate // types, so kernels can have the same written form but can operate
@@ -37,13 +43,23 @@ template <>
struct LoadStore<Half4> { struct LoadStore<Half4> {
static inline __device__ Half4 load(void* p) { static inline __device__ Half4 load(void* p) {
Half4 out; Half4 out;
#if CUDA_VERSION >= 9000
asm("ld.global.v2.u32 {%0, %1}, [%2];" :
"=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p));
#else
asm("ld.global.v2.u32 {%0, %1}, [%2];" : asm("ld.global.v2.u32 {%0, %1}, [%2];" :
"=r"(out.a.x), "=r"(out.b.x) : "l"(p)); "=r"(out.a.x), "=r"(out.b.x) : "l"(p));
#endif
return out; return out;
} }
static inline __device__ void store(void* p, const Half4& v) { static inline __device__ void store(void* p, Half4& v) {
#if CUDA_VERSION >= 9000
asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p),
"r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b)));
#else
asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x)); asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x));
#endif
} }
}; };
@@ -51,15 +67,27 @@ template <>
struct LoadStore<Half8> { struct LoadStore<Half8> {
static inline __device__ Half8 load(void* p) { static inline __device__ Half8 load(void* p) {
Half8 out; Half8 out;
#if CUDA_VERSION >= 9000
asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)),
"=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p));
#else
asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(out.a.a.x), "=r"(out.a.b.x), "=r"(out.a.a.x), "=r"(out.a.b.x),
"=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p)); "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p));
#endif
return out; return out;
} }
static inline __device__ void store(void* p, const Half8& v) { static inline __device__ void store(void* p, Half8& v) {
#if CUDA_VERSION >= 9000
asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
: : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)),
"r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b)));
#else
asm("st.v4.u32 [%0], {%1, %2, %3, %4};" asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
: : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x)); : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x));
#endif
} }
}; };
......
@@ -285,9 +285,13 @@ struct Math<half> {
} }
static inline __device__ half zero() { static inline __device__ half zero() {
#if CUDA_VERSION >= 9000
return 0;
#else
half h; half h;
h.x = 0; h.x = 0;
return h; return h;
#endif
} }
}; };
......
@@ -142,7 +142,14 @@ struct BlockSelect {
__device__ inline void checkThreadQ() { __device__ inline void checkThreadQ() {
bool needSort = (numVals == NumThreadQ); bool needSort = (numVals == NumThreadQ);
if (!__any(needSort)) { #if CUDA_VERSION >= 9000
needSort = __any_sync(0xffffffff, needSort);
#else
needSort = __any(needSort);
#endif
if (!needSort) {
// no lanes have triggered a sort
return; return;
} }
@@ -408,7 +415,14 @@ struct WarpSelect {
__device__ inline void checkThreadQ() { __device__ inline void checkThreadQ() {
bool needSort = (numVals == NumThreadQ); bool needSort = (numVals == NumThreadQ);
if (!__any(needSort)) { #if CUDA_VERSION >= 9000
needSort = __any_sync(0xffffffff, needSort);
#else
needSort = __any(needSort);
#endif
if (!needSort) {
// no lanes have triggered a sort
return; return;
} }
......
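Both selection structures above replace the implicit warp vote __any() with the CUDA 9 __any_sync() and a full-warp mask. The same dispatch, isolated as a hypothetical helper (not part of the patch):

// All lanes named in the mask must reach the call under CUDA 9 semantics.
__device__ inline bool warpAny(bool pred) {
#if CUDA_VERSION >= 9000
  return __any_sync(0xffffffff, pred);
#else
  return __any(pred);
#endif
}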
@@ -27,6 +27,37 @@ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
} }
} }
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
this->operator=(t);
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
this->operator=(std::move(t));
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
data_ = t.data_;
for (int i = 0; i < Dim; ++i) {
size_[i] = t.size_[i];
stride_[i] = t.stride_[i];
}
return *this;
}
template <typename T, int Dim, bool InnerContig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
......
@@ -87,16 +87,14 @@ class Tensor {
__host__ __device__ Tensor(); __host__ __device__ Tensor();
/// Copy constructor /// Copy constructor
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);
= default;
/// Move constructor /// Move constructor
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
= default;
/// Assignment /// Assignment
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) = default; operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);
/// Move assignment /// Move assignment
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
......
@@ -19,7 +19,11 @@ namespace faiss { namespace gpu {
template <typename T> template <typename T>
inline __device__ T shfl(const T val, inline __device__ T shfl(const T val,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_sync(0xffffffff, val, srcLane, width);
#else
return __shfl(val, srcLane, width); return __shfl(val, srcLane, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -28,13 +32,18 @@ inline __device__ T* shfl(T* const val,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl(v, srcLane, width);
return (T*) shfl(v, srcLane, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_up(const T val, inline __device__ T shfl_up(const T val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_up_sync(0xffffffff, val, delta, width);
#else
return __shfl_up(val, delta, width); return __shfl_up(val, delta, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -43,13 +52,18 @@ inline __device__ T* shfl_up(T* const val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_up(v, delta, width);
return (T*) shfl_up(v, delta, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_down(const T val, inline __device__ T shfl_down(const T val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_down_sync(0xffffffff, val, delta, width);
#else
return __shfl_down(val, delta, width); return __shfl_down(val, delta, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -58,13 +72,17 @@ inline __device__ T* shfl_down(T* const val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_down(v, delta, width); return (T*) shfl_down(v, delta, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_xor(const T val, inline __device__ T shfl_xor(const T val,
int laneMask, int width = kWarpSize) { int laneMask, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
return __shfl_xor(val, laneMask, width); return __shfl_xor(val, laneMask, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -73,10 +91,12 @@ inline __device__ T* shfl_xor(T* const val,
int laneMask, int width = kWarpSize) { int laneMask, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_xor(v, laneMask, width); return (T*) shfl_xor(v, laneMask, width);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
// CUDA 9.0 has half shuffle
#if CUDA_VERSION < 9000
inline __device__ half shfl(half v, inline __device__ half shfl(half v,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
unsigned int vu = v.x; unsigned int vu = v.x;
@@ -96,6 +116,7 @@ inline __device__ half shfl_xor(half v,
h.x = (unsigned short) vu; h.x = (unsigned short) vu;
return h; return h;
} }
#endif #endif // CUDA_VERSION
#endif // FAISS_USE_FLOAT16
} } // namespace } } // namespace
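With the *_sync shuffles wired in above, warp-wide reductions keep their usual shape; only the participation mask is new. A self-contained sketch (not part of the patch) built on the shfl_down wrapper and kWarpSize from these headers:

// Lane 0 ends up holding the sum of val over the warp; all lanes must be active.
__device__ inline float warpReduceSum(float val) {
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += shfl_down(val, offset);
  }
  return val;
}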
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
#! /usr/bin/env python2
import numpy as np
import faiss
def fvecs_read(filename):
fv = np.fromfile(filename, dtype = 'float32')
if fv.size == 0:
return np.zeros((0, 0), dtype = 'float32')
dim = fv.view('int32')[0]
assert dim > 0
fv = fv.reshape(-1, 1 + dim)
if not all(fv.view('int32')[:,0]==dim):
raise IOError("non-uniform vector sizes in " + filename)
fv = fv[:, 1:]
return fv.copy() # to make contiguous
rootdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/sift1M'
print "loading database"
xb = fvecs_read(rootdir + '/sift_base.fvecs')
xt = fvecs_read(rootdir + '/sift_learn.fvecs')
xq = fvecs_read(rootdir + '/sift_query.fvecs')
d = xt.shape[1]
gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)
D, gt_nns = gt_index.search(xq, 1)
coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)
print "train"
index.train(xt)
print "add"
index.add(xb)
print "search"
index.nprobe = 5
D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]
print "n_ok=%d/%d" % (n_ok, nq)
@@ -11,9 +11,13 @@ import time
import numpy as np import numpy as np
import pdb import pdb
import matplotlib try:
matplotlib.use('Agg') import matplotlib
from matplotlib import pyplot matplotlib.use('Agg')
from matplotlib import pyplot
graphical_output = True
except ImportError:
graphical_output = False
import faiss import faiss
@@ -93,15 +97,15 @@ keys_mem_32 = [
# indexes that can run on the GPU # indexes that can run on the GPU
keys_gpu = [ keys_gpu = [
"PCA64,IVF4096,Flat",
"PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat", "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
"PCA64,IVF4096,Flat", "IVF4096,PQ32"] "IVF4096,PQ32"]
keys_to_test = unlimited_mem_keys keys_to_test = unlimited_mem_keys
use_gpu = False use_gpu = False
if use_gpu: if use_gpu:
# if this fails, it means that the GPU version was not comp # if this fails, it means that the GPU version was not comp
assert faiss.StandardGpuResources, \ assert faiss.StandardGpuResources, \
@@ -129,6 +133,8 @@ for index_key in keys_to_test:
# transfer to GPU (may be partial) # transfer to GPU (may be partial)
index = faiss.index_cpu_to_gpu(res, dev_no, index) index = faiss.index_cpu_to_gpu(res, dev_no, index)
params = faiss.GpuParameterSpace() params = faiss.GpuParameterSpace()
print "GGGG"
raw_input()
else: else:
params = faiss.ParameterSpace() params = faiss.ParameterSpace()
@@ -152,7 +158,7 @@ for index_key in keys_to_test:
op_per_key.append((index_key, opi)) op_per_key.append((index_key, opi))
if True: if graphical_output:
# graphical output (to tmp/ subdirectory) # graphical output (to tmp/ subdirectory)
fig = pyplot.figure(figsize=(12, 9)) fig = pyplot.figure(figsize=(12, 9))
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
#! /usr/bin/env python2
import numpy as np
import unittest
import faiss
class TestFactory(unittest.TestCase):
def test_factory_1(self):
index = faiss.index_factory(12, "IVF10,PQ4")
assert index.do_polysemous_training
index = faiss.index_factory(12, "IVF10,PQ4np")
assert not index.do_polysemous_training
index = faiss.index_factory(12, "PQ4")
assert index.do_polysemous_training
index = faiss.index_factory(12, "PQ4np")
assert not index.do_polysemous_training
try:
index = faiss.index_factory(10, "PQ4")
except RuntimeError:
pass
else:
assert False, "should do a runtime error"
def test_factory_2(self):
index = faiss.index_factory(12, "SQ8")
assert index.code_size == 12
def test_factory_3(self):
index = faiss.index_factory(12, "IVF10,PQ4")
faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
assert index.nprobe == 3
index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
assert faiss.downcast_index(index.index).nprobe == 3