Commit fe446e41 authored by matthijs

bugfixes

parent 250a3d3f
@@ -431,7 +431,8 @@ void ParameterSpace::set_index_parameter (
// and fall through to also enable it on sub-indexes // and fall through to also enable it on sub-indexes
} }
if (DC (IndexPreTransform)) { if (DC (IndexPreTransform)) {
index = ix->index; set_index_parameter (ix->index, name, val);
return;
} }
if (DC (IndexShards)) { if (DC (IndexShards)) {
// call on all sub-indexes // call on all sub-indexes
@@ -440,30 +441,28 @@ void ParameterSpace::set_index_parameter (
} }
return; return;
} }
if (name == "verbose") {
index->verbose = int(val);
// in case it was an IndexPreTransform
}
if (DC (IndexRefineFlat)) { if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") { if (name == "k_factor_rf") {
ix->k_factor = int(val); ix->k_factor = int(val);
return; return;
} }
index = ix->base_index; // otherwise it is for the sub-index
} set_index_parameter (&ix->refine_index, name, val);
if (DC (IndexPreTransform)) { return;
index = ix->index;
} }
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
return; // last verbose that we could find return; // last verbose that we could find
} }
if (name == "nprobe") { if (name == "nprobe") {
if ( DC(IndexIVF)) { if ( DC(IndexIVF)) {
ix->nprobe = int(val); ix->nprobe = int(val);
return; return;
} }
} }
if (name == "ht") { if (name == "ht") {
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
@@ -685,7 +684,6 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok; tok;
tok = strtok_r (nullptr, " ,", &ptr)) { tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2; int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok); std::string stok(tok);
// to avoid mem leaks with exceptions: // to avoid mem leaks with exceptions:
@@ -776,10 +774,9 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) { } else if (!index && (sscanf (tok, "PQ%d", &M) == 1 ||
std::string soption = option; sscanf (tok, "PQ%dnp", &M) == 1)) {
// np to disable polysemous trainign bool do_polysemous_training = stok.find("np") == std::string::npos;
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) { if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ ( IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8); coarse_quantizer, d, ncentroids, M, 8);
@@ -789,11 +786,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_ivf->do_polysemous_training = soption != "np"; index_ivf->do_polysemous_training = do_polysemous_training;
index_1 = index_ivf; index_1 = index_ivf;
} else { } else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric); IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = soption != "np"; index_pq->do_polysemous_training = do_polysemous_training;
index_1 = index_pq; index_1 = index_pq;
} }
......
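The two AutoTune.cpp changes above make set_index_parameter recurse into IndexPreTransform and IndexRefineFlat sub-indexes, and restrict the PQ factory token to "PQ%d" with an optional "np" suffix that disables polysemous training. A minimal sketch of how this behaves, mirroring the Python tests added at the end of this commit; the flat header names (and IndexPreTransform living in VectorTransform.h) are assumptions about this revision's layout:

#include <cassert>
#include "AutoTune.h"        // index_factory, ParameterSpace
#include "IndexPQ.h"
#include "IndexIVFPQ.h"
#include "VectorTransform.h" // IndexPreTransform (assumed location)

int main() {
    // the "np" suffix turns polysemous training off, plain "PQ4" keeps it on
    faiss::Index *pq = faiss::index_factory(12, "PQ4np");
    assert(!dynamic_cast<faiss::IndexPQ*>(pq)->do_polysemous_training);

    // with the recursion fix, "nprobe" reaches the IVF index even when it is
    // wrapped in the IndexPreTransform produced by "PCAR8"
    faiss::Index *index = faiss::index_factory(12, "PCAR8,IVF10,PQ4");
    faiss::ParameterSpace().set_index_parameter(index, "nprobe", 3);
    faiss::IndexPreTransform *pre =
        dynamic_cast<faiss::IndexPreTransform*>(index);
    assert(dynamic_cast<faiss::IndexIVFPQ*>(pre->index)->nprobe == 3);

    delete pq;
    delete index;
    return 0;
}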
@@ -260,7 +260,6 @@ static size_t polysemous_inner_loop (
void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k, void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const float *distances, idx_t *labels) const
{ {
FAISS_THROW_IF_NOT (pq.code_size % 8 == 0);
FAISS_THROW_IF_NOT (pq.byte_per_idx == 1); FAISS_THROW_IF_NOT (pq.byte_per_idx == 1);
// PQ distance tables // PQ distance tables
@@ -319,12 +318,17 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
break; break;
default: default:
if (pq.code_size % 8 == 0) if (pq.code_size % 8 == 0) {
n_pass += polysemous_inner_loop<HammingComputerM8> n_pass += polysemous_inner_loop<HammingComputerM8>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
else } else if (pq.code_size % 4 == 0) {
n_pass += polysemous_inner_loop<HammingComputerM4> n_pass += polysemous_inner_loop<HammingComputerM4>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
} else {
FAISS_THROW_FMT(
"code size %zd not supported for polysemous",
pq.code_size);
}
break; break;
} }
} else { } else {
@@ -342,8 +346,14 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
break; break;
default: default:
n_pass += polysemous_inner_loop<GenHammingComputerM8> if (pq.code_size % 8 == 0) {
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids); n_pass += polysemous_inner_loop<GenHammingComputerM8>
(*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
} else {
FAISS_THROW_FMT(
"code size %zd not supported for polysemous",
pq.code_size);
}
break; break;
} }
} }
......
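The polysemous dispatch above now falls back to HammingComputerM4 when the code size is a multiple of 4 but not of 8, and raises FAISS_THROW_FMT otherwise. As a rough illustration of what such a 4-byte-stride computer amounts to (a sketch, not the actual FAISS class):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hamming distance over codes whose size is a multiple of 4 bytes:
// XOR the 32-bit words and accumulate their popcounts.
inline int hamming_m4(const uint8_t *a, const uint8_t *b, size_t code_size) {
    int dis = 0;
    for (size_t i = 0; i < code_size; i += 4) {
        uint32_t wa, wb;
        memcpy(&wa, a + i, 4);   // unaligned-safe loads
        memcpy(&wb, b + i, 4);
        dis += __builtin_popcount(wa ^ wb);  // GCC/Clang builtin
    }
    return dis;
}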
@@ -381,20 +381,20 @@ void GpuParameterSpace::set_index_parameter (
set_index_parameter (ix->at(i), name, val); set_index_parameter (ix->at(i), name, val);
return; return;
} }
if (DC (GpuIndexIVF)) { if (name == "nprobe") {
if (name == "nprobe") { if (DC (GpuIndexIVF)) {
ix->setNumProbes (int (val)); ix->setNumProbes (int (val));
return; return;
} }
} }
if(DC (GpuIndexIVFPQ)) { if (name == "use_precomputed_table") {
if (name == "use_precomputed_table") { if (DC (GpuIndexIVFPQ)) {
ix->setPrecomputedCodes(bool (val)); ix->setPrecomputedCodes(bool (val));
return; return;
} }
} }
// maybe norma lindex parameters apply? // maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val); ParameterSpace::set_index_parameter (index, name, val);
} }
......
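GpuParameterSpace::set_index_parameter now keys on the parameter name first and only then downcasts, so names it does not recognize fall through to the CPU-side ParameterSpace (which, with the fix above, also handles wrapped indexes). A host-side usage sketch; the gpu/ header paths and the options-free index_cpu_to_gpu overload are assumptions about this revision:

#include "AutoTune.h"
#include "gpu/StandardGpuResources.h"
#include "gpu/GpuAutoTune.h"   // GpuParameterSpace, index_cpu_to_gpu (assumed location)

int main() {
    faiss::Index *cpu_index = faiss::index_factory(64, "IVF256,PQ8");
    faiss::gpu::StandardGpuResources res;
    faiss::Index *gpu_index = faiss::gpu::index_cpu_to_gpu(&res, 0, cpu_index);

    faiss::gpu::GpuParameterSpace ps;
    ps.set_index_parameter(gpu_index, "nprobe", 16);               // GpuIndexIVF path
    ps.set_index_parameter(gpu_index, "use_precomputed_table", 1); // GpuIndexIVFPQ path
    ps.set_index_parameter(gpu_index, "verbose", 1);               // falls back to ParameterSpace

    delete gpu_index;
    delete cpu_index;
    return 0;
}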
@@ -29,8 +29,9 @@ GpuIndexIVF::GpuIndexIVF(GpuResources* resources,
nprobe_(1), nprobe_(1),
quantizer_(nullptr) { quantizer_(nullptr) {
#ifndef FAISS_USE_FLOAT16 #ifndef FAISS_USE_FLOAT16
FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16CoarseQuantizer, FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16 &&
"float16 unsupported; need CUDA SDK >= 7.5"); !ivfConfig_.flatConfig.useFloat16Accumulator,
"float16 unsupported; need CUDA SDK >= 7.5");
#endif #endif
init_(); init_();
......
@@ -60,7 +60,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
reserveMemoryVecs_(0), reserveMemoryVecs_(0),
index_(nullptr) { index_(nullptr) {
#ifndef FAISS_USE_FLOAT16 #ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16LookupTables_); FAISS_ASSERT(!config.useFloat16LookupTables);
#endif #endif
verifySettings_(); verifySettings_();
......
@@ -45,10 +45,6 @@ IVFFlat::IVFFlat(GpuResources* resources,
space), space),
l2Distance_(l2Distance), l2Distance_(l2Distance),
useFloat16_(useFloat16) { useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
useFloat16_ = false;
#endif
} }
IVFFlat::~IVFFlat() { IVFFlat::~IVFFlat() {
@@ -95,6 +91,9 @@ IVFFlat::addCodeVectorsFromCpu(int listId,
lengthInBytes, lengthInBytes,
stream, stream,
true /* exact reserved size */); true /* exact reserved size */);
#else
// we are not compiling with float16 support
FAISS_ASSERT(false);
#endif #endif
} else { } else {
listData->append((unsigned char*) vecs, listData->append((unsigned char*) vecs,
......
@@ -506,7 +506,6 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
} }
#else #else
FAISS_ASSERT(!useFloat16Lookup); FAISS_ASSERT(!useFloat16Lookup);
int codeSize = sizeof(float);
#endif #endif
int totalCodeDistancesSize = int totalCodeDistancesSize =
......
@@ -7,16 +7,12 @@
#! /usr/bin/env python2 #! /usr/bin/env python2
import time import time
import libfb.py.mkl # noqa import unittest
import numpy as np import numpy as np
from libfb import testutil
import faiss import faiss
class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase): class EvalIVFPQAccuracy(unittest.TestCase):
def get_dataset(self, small_one=False): def get_dataset(self, small_one=False):
if not small_one: if not small_one:
@@ -110,3 +106,9 @@ class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase):
def test_cpu_to_gpu_IVFFlat(self): def test_cpu_to_gpu_IVFFlat(self):
self.do_cpu_to_gpu('IVF128,Flat') self.do_cpu_to_gpu('IVF128,Flat')
def test_set_gpu_param(self):
index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
faiss.GpuParameterSpace().set_index_parameter(index, "nprobe", 3)
@@ -13,7 +13,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ <= 620 #if __CUDA_ARCH__ <= 700
constexpr int kWarpSize = 32; constexpr int kWarpSize = 32;
#else #else
#error Unknown __CUDA_ARCH__; please define parameters for compute capability #error Unknown __CUDA_ARCH__; please define parameters for compute capability
@@ -25,37 +25,15 @@ constexpr int kWarpSize = 32;
constexpr int kWarpSize = 32; constexpr int kWarpSize = 32;
#endif // !__CUDA_ARCH__ #endif // !__CUDA_ARCH__
// This is a memory barrier for intra-warp writes to shared memory.
__forceinline__ __device__ void warpFence() { __forceinline__ __device__ void warpFence() {
// Technically, memory barriers are required via the CUDA
// programming model, since warp synchronous programming no longer
// is guaranteed.
//
// There are two components to it:
// -a barrier known to the compiler such that the compiler will not
// schedule loads and stores across the barrier;
// -a HW-level barrier that guarantees that writes are seen in the
// proper order
//
// However, __threadfence_block() is a stronger constraint than what
// we really want out of the hardware: a warp-wide barrier.
//
// In current hardware, it appears that warp synchronous programming
// is a reality; by all tests it appears safe and race-free.
//
// However, understandably it may not be in the future (based on
// what Nvidia says in the Kepler guide, it may change depending
// upon compiler/toolchain issues or future hardware).
//
// Removing the fence results in 10%+ faster performance.
// However, we are judicious as to where we insert the fence, so if
// this reality ever changes, uncommenting this will result in CUDA
// programming model-safe ordering again.
//
// FIXME: we should probably qualify as volatile as well, since the
// compiler could technically preserve values across loops? This
// seems very impractical for the compiler to do, however.
#if __CUDA_ARCH__ >= 700
__syncwarp();
#else
// For the time being, assume synchronicity.
// __threadfence_block(); // __threadfence_block();
#endif
} }
} } // namespace } } // namespace
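The comment block removed above explained why warpFence() used to rely on implicit warp-synchronous execution; with CUDA 9 and compute capability 7.0 it now issues __syncwarp(). A small sketch (not from the patch, names are illustrative) of the kind of intra-warp shared-memory exchange the fence is meant to order:

// Assumes one warp per block; uses kWarpSize and warpFence() from this header.
__device__ int warpBroadcastViaSmem(int v, int srcLane) {
  __shared__ int buf;
  int lane = threadIdx.x & (kWarpSize - 1);
  if (lane == srcLane) {
    buf = v;        // one lane publishes its value
  }
  warpFence();      // __syncwarp() on Volta+; lockstep assumption before that
  return buf;       // every lane of the warp reads the published value
}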
@@ -48,10 +48,16 @@ void runConvertToFloat32(float* out,
in, in + num, out, HalfToFloat()); in, in + num, out, HalfToFloat());
} }
half hostFloat2Half(float a) { __half hostFloat2Half(float a) {
half h; #if CUDA_VERSION >= 9000
__half_raw raw;
raw.x = cpu_float2half_rn(a).x;
return __half(raw);
#else
__half h;
h.x = cpu_float2half_rn(a).x; h.x = cpu_float2half_rn(a).x;
return h; return h;
#endif
} }
} } // namespace } } // namespace
......
@@ -149,7 +149,7 @@ DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
return out; return out;
} }
half hostFloat2Half(float v); __half hostFloat2Half(float v);
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
......
@@ -39,9 +39,15 @@ struct Limits<float> {
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
inline __device__ __host__ half kGetHalf(unsigned short v) { inline __device__ __host__ half kGetHalf(unsigned short v) {
#if CUDA_VERSION >= 9000
__half_raw h;
h.x = v;
return __half(h);
#else
half h; half h;
h.x = v; h.x = v;
return h; return h;
#endif
} }
template <> template <>
......
@@ -12,6 +12,12 @@
#include "Float16.cuh" #include "Float16.cuh"
#ifndef __HALF2_TO_UI
// cuda_fp16.hpp doesn't export this
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
#endif
// //
// Templated wrappers to express load/store for different scalar and vector // Templated wrappers to express load/store for different scalar and vector
// types, so kernels can have the same written form but can operate // types, so kernels can have the same written form but can operate
@@ -37,13 +43,23 @@ template <>
struct LoadStore<Half4> { struct LoadStore<Half4> {
static inline __device__ Half4 load(void* p) { static inline __device__ Half4 load(void* p) {
Half4 out; Half4 out;
#if CUDA_VERSION >= 9000
asm("ld.global.v2.u32 {%0, %1}, [%2];" :
"=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p));
#else
asm("ld.global.v2.u32 {%0, %1}, [%2];" : asm("ld.global.v2.u32 {%0, %1}, [%2];" :
"=r"(out.a.x), "=r"(out.b.x) : "l"(p)); "=r"(out.a.x), "=r"(out.b.x) : "l"(p));
#endif
return out; return out;
} }
static inline __device__ void store(void* p, const Half4& v) { static inline __device__ void store(void* p, Half4& v) {
#if CUDA_VERSION >= 9000
asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p),
"r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b)));
#else
asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x)); asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x));
#endif
} }
}; };
@@ -51,15 +67,27 @@ template <>
struct LoadStore<Half8> { struct LoadStore<Half8> {
static inline __device__ Half8 load(void* p) { static inline __device__ Half8 load(void* p) {
Half8 out; Half8 out;
#if CUDA_VERSION >= 9000
asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)),
"=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p));
#else
asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" : asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(out.a.a.x), "=r"(out.a.b.x), "=r"(out.a.a.x), "=r"(out.a.b.x),
"=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p)); "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p));
#endif
return out; return out;
} }
static inline __device__ void store(void* p, const Half8& v) { static inline __device__ void store(void* p, Half8& v) {
#if CUDA_VERSION >= 9000
asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
: : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)),
"r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b)));
#else
asm("st.v4.u32 [%0], {%1, %2, %3, %4};" asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
: : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x)); : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x));
#endif
} }
}; };
......
@@ -285,9 +285,13 @@ struct Math<half> {
} }
static inline __device__ half zero() { static inline __device__ half zero() {
#if CUDA_VERSION >= 9000
return 0;
#else
half h; half h;
h.x = 0; h.x = 0;
return h; return h;
#endif
} }
}; };
......
@@ -142,7 +142,14 @@ struct BlockSelect {
__device__ inline void checkThreadQ() { __device__ inline void checkThreadQ() {
bool needSort = (numVals == NumThreadQ); bool needSort = (numVals == NumThreadQ);
if (!__any(needSort)) { #if CUDA_VERSION >= 9000
needSort = __any_sync(0xffffffff, needSort);
#else
needSort = __any(needSort);
#endif
if (!needSort) {
// no lanes have triggered a sort
return; return;
} }
@@ -408,7 +415,14 @@ struct WarpSelect {
__device__ inline void checkThreadQ() { __device__ inline void checkThreadQ() {
bool needSort = (numVals == NumThreadQ); bool needSort = (numVals == NumThreadQ);
if (!__any(needSort)) { #if CUDA_VERSION >= 9000
needSort = __any_sync(0xffffffff, needSort);
#else
needSort = __any(needSort);
#endif
if (!needSort) {
// no lanes have triggered a sort
return; return;
} }
......
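Both selection structures above replace the implicit warp vote __any() with the CUDA 9 __any_sync() and a full-warp mask. The same dispatch, isolated as a hypothetical helper (not part of the patch):

// All lanes named in the mask must reach the call under CUDA 9 semantics.
__device__ inline bool warpAny(bool pred) {
#if CUDA_VERSION >= 9000
  return __any_sync(0xffffffff, pred);
#else
  return __any(pred);
#endif
}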
@@ -27,6 +27,37 @@ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
} }
} }
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
this->operator=(t);
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
this->operator=(std::move(t));
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
data_ = t.data_;
for (int i = 0; i < Dim; ++i) {
size_[i] = t.size_[i];
stride_[i] = t.stride_[i];
}
return *this;
}
template <typename T, int Dim, bool InnerContig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
......
@@ -87,16 +87,14 @@ class Tensor {
__host__ __device__ Tensor(); __host__ __device__ Tensor();
/// Copy constructor /// Copy constructor
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);
= default;
/// Move constructor /// Move constructor
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
= default;
/// Assignment /// Assignment
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) = default; operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);
/// Move assignment /// Move assignment
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
......
@@ -19,7 +19,11 @@ namespace faiss { namespace gpu {
template <typename T> template <typename T>
inline __device__ T shfl(const T val, inline __device__ T shfl(const T val,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_sync(0xffffffff, val, srcLane, width);
#else
return __shfl(val, srcLane, width); return __shfl(val, srcLane, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -28,13 +32,18 @@ inline __device__ T* shfl(T* const val,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl(v, srcLane, width);
return (T*) shfl(v, srcLane, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_up(const T val, inline __device__ T shfl_up(const T val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_up_sync(0xffffffff, val, delta, width);
#else
return __shfl_up(val, delta, width); return __shfl_up(val, delta, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -43,13 +52,18 @@ inline __device__ T* shfl_up(T* const val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_up(v, delta, width);
return (T*) shfl_up(v, delta, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_down(const T val, inline __device__ T shfl_down(const T val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_down_sync(0xffffffff, val, delta, width);
#else
return __shfl_down(val, delta, width); return __shfl_down(val, delta, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -58,13 +72,17 @@ inline __device__ T* shfl_down(T* const val,
unsigned int delta, int width = kWarpSize) { unsigned int delta, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_down(v, delta, width); return (T*) shfl_down(v, delta, width);
} }
template <typename T> template <typename T>
inline __device__ T shfl_xor(const T val, inline __device__ T shfl_xor(const T val,
int laneMask, int width = kWarpSize) { int laneMask, int width = kWarpSize) {
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(0xffffffff, val, laneMask, width);
#else
return __shfl_xor(val, laneMask, width); return __shfl_xor(val, laneMask, width);
#endif
} }
// CUDA SDK does not provide specializations for T* // CUDA SDK does not provide specializations for T*
@@ -73,10 +91,12 @@ inline __device__ T* shfl_xor(T* const val,
int laneMask, int width = kWarpSize) { int laneMask, int width = kWarpSize) {
static_assert(sizeof(T*) == sizeof(long long), "pointer size"); static_assert(sizeof(T*) == sizeof(long long), "pointer size");
long long v = (long long) val; long long v = (long long) val;
return (T*) __shfl_xor(v, laneMask, width); return (T*) shfl_xor(v, laneMask, width);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
// CUDA 9.0 has half shuffle
#if CUDA_VERSION < 9000
inline __device__ half shfl(half v, inline __device__ half shfl(half v,
int srcLane, int width = kWarpSize) { int srcLane, int width = kWarpSize) {
unsigned int vu = v.x; unsigned int vu = v.x;
@@ -96,6 +116,7 @@ inline __device__ half shfl_xor(half v,
h.x = (unsigned short) vu; h.x = (unsigned short) vu;
return h; return h;
} }
#endif #endif // CUDA_VERSION
#endif // FAISS_USE_FLOAT16
} } // namespace } } // namespace
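With the *_sync shuffles wired in above, warp-wide reductions keep their usual shape; only the participation mask is new. A self-contained sketch (not part of the patch) built on the shfl_down wrapper and kWarpSize from these headers:

// Lane 0 ends up holding the sum of val over the warp; all lanes must be active.
__device__ inline float warpReduceSum(float val) {
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += shfl_down(val, offset);
  }
  return val;
}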
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
#! /usr/bin/env python2
import numpy as np
import faiss
def fvecs_read(filename):
fv = np.fromfile(filename, dtype = 'float32')
if fv.size == 0:
return np.zeros((0, 0), dtype = 'float32')
dim = fv.view('int32')[0]
assert dim > 0
fv = fv.reshape(-1, 1 + dim)
if not all(fv.view('int32')[:,0]==dim):
raise IOError("non-uniform vector sizes in " + filename)
fv = fv[:, 1:]
return fv.copy() # to make contiguous
rootdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/sift1M'
print "loading database"
xb = fvecs_read(rootdir + '/sift_base.fvecs')
xt = fvecs_read(rootdir + '/sift_learn.fvecs')
xq = fvecs_read(rootdir + '/sift_query.fvecs')
d = xt.shape[1]
gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)
D, gt_nns = gt_index.search(xq, 1)
coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)
print "train"
index.train(xt)
print "add"
index.add(xb)
print "search"
index.nprobe = 5
D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]
print "n_ok=%d/%d" % (n_ok, nq)
@@ -11,9 +11,13 @@ import time
import numpy as np import numpy as np
import pdb import pdb
import matplotlib try:
matplotlib.use('Agg') import matplotlib
from matplotlib import pyplot matplotlib.use('Agg')
from matplotlib import pyplot
graphical_output = True
except ImportError:
graphical_output = False
import faiss import faiss
@@ -93,15 +97,15 @@ keys_mem_32 = [
# indexes that can run on the GPU # indexes that can run on the GPU
keys_gpu = [ keys_gpu = [
"PCA64,IVF4096,Flat",
"PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat", "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
"PCA64,IVF4096,Flat", "IVF4096,PQ32"] "IVF4096,PQ32"]
keys_to_test = unlimited_mem_keys keys_to_test = unlimited_mem_keys
use_gpu = False use_gpu = False
if use_gpu: if use_gpu:
# if this fails, it means that the GPU version was not comp # if this fails, it means that the GPU version was not comp
assert faiss.StandardGpuResources, \ assert faiss.StandardGpuResources, \
@@ -129,6 +133,8 @@ for index_key in keys_to_test:
# transfer to GPU (may be partial) # transfer to GPU (may be partial)
index = faiss.index_cpu_to_gpu(res, dev_no, index) index = faiss.index_cpu_to_gpu(res, dev_no, index)
params = faiss.GpuParameterSpace() params = faiss.GpuParameterSpace()
print "GGGG"
raw_input()
else: else:
params = faiss.ParameterSpace() params = faiss.ParameterSpace()
@@ -152,7 +158,7 @@ for index_key in keys_to_test:
op_per_key.append((index_key, opi)) op_per_key.append((index_key, opi))
if True: if graphical_output:
# graphical output (to tmp/ subdirectory) # graphical output (to tmp/ subdirectory)
fig = pyplot.figure(figsize=(12, 9)) fig = pyplot.figure(figsize=(12, 9))
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
#! /usr/bin/env python2
import numpy as np
import unittest
import faiss
class TestFactory(unittest.TestCase):
def test_factory_1(self):
index = faiss.index_factory(12, "IVF10,PQ4")
assert index.do_polysemous_training
index = faiss.index_factory(12, "IVF10,PQ4np")
assert not index.do_polysemous_training
index = faiss.index_factory(12, "PQ4")
assert index.do_polysemous_training
index = faiss.index_factory(12, "PQ4np")
assert not index.do_polysemous_training
try:
index = faiss.index_factory(10, "PQ4")
except RuntimeError:
pass
else:
assert False, "should do a runtime error"
def test_factory_2(self):
index = faiss.index_factory(12, "SQ8")
assert index.code_size == 12
def test_factory_3(self):
index = faiss.index_factory(12, "IVF10,PQ4")
faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
assert index.nprobe == 3
index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
assert faiss.downcast_index(index.index).nprobe == 3