Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bugfixes from GitHub issues
k-means with support for frozen centroids
better GPU tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
@@ -345,6 +345,7 @@ void ParameterSpace::initialize (const Index * index)
        }
    }
    if (DC (IndexIVF)) {
+       {
        ParameterRange & pr = add_range("nprobe");
        for (int i = 0; i < 13; i++) {
            size_t nprobe = 1 << i;
@@ -352,6 +353,7 @@ void ParameterSpace::initialize (const Index * index)
            pr.values.push_back (nprobe);
        }
    }
+   }
    if (DC (IndexPQ)) {
        ParameterRange & pr = add_range("ht");
        init_pq_ParameterRange (ix->pq, pr);
@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
        }
    }
    if (DC (IndexIVFPQR)) {
-       assert (ix);
        ParameterRange & pr = add_range("k_factor");
        for (int i = 0; i <= 6; i++) {
            pr.values.push_back (1 << i);
@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
    if (name == "verbose") {
        index->verbose = int(val);
+       // and fall through to also enable it on sub-indexes
    }
    if (DC (IndexPreTransform)) {
        index = ix->index;
    }
+   if (DC (IndexShards)) {
+       // call on all sub-indexes
+       for (auto & shard_index : ix->shard_indexes) {
+           set_index_parameter (shard_index, name, val);
+       }
+       return;
+   }
    if (name == "verbose") {
        index->verbose = int(val);
+       // in case it was an IndexPreTransform
    }
    if (DC (IndexRefineFlat)) {
        if (name == "k_factor_rf") {
@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
        return; // last verbose that we could find
    }
    if (name == "nprobe") {
-       DC(IndexIVF);
-       ix->nprobe = int(val);
-   } else if (name == "ht") {
+       if (DC (IndexIVF)) {
+           ix->nprobe = int(val);
+           return;
+       }
+   }
+   if (name == "ht") {
        if (DC (IndexPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->search_type = IndexPQ::ST_PQ;
@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
                ix->search_type = IndexPQ::ST_polysemous;
                ix->polysemous_ht = int(val);
            }
+           return;
        } else if (DC (IndexIVFPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->polysemous_ht = 0;
            } else {
                ix->polysemous_ht = int(val);
            }
+           return;
+       }
    }
-   } else if (name == "k_factor") {
-       DC (IndexIVFPQR);
+   if (name == "k_factor") {
+       if (DC (IndexIVFPQR)) {
            ix->k_factor = val;
-   } else if (name == "max_codes") {
-       DC (IndexIVFPQ);
+           return;
+       }
+   }
+   if (name == "max_codes") {
+       if (DC (IndexIVFPQ)) {
            ix->max_codes = finite(val) ? size_t(val) : 0;
-   } else {
-       FAISS_THROW_FMT (
-           "ParameterSpace::set_index_parameter:"
+           return;
+       }
+   }
+   FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
            "could not set parameter %s",
            name.c_str());
-   }
}

void ParameterSpace::display () const
@@ -634,6 +654,15 @@ struct VTChain {
    }
};

+/// what kind of training does this coarse quantizer require?
+char get_trains_alone(const Index *coarse_quantizer) {
+    return
+        dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
+        0;
+}
+
}

Index *index_factory (int d, const char *description_in, MetricType metric)
@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         tok;
         tok = strtok_r (nullptr, " ,", &ptr)) {
        int d_out, opq_M, nbit, M, M2;
+       char option[100];
        std::string stok(tok);

        // to avoid mem leaks with exceptions:
@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        } else if (stok == "L2norm") {
            vt_1 = new NormalizationTransform (d, 2.0);

+       // coarse quantizers
        } else if (!coarse_quantizer &&
                   sscanf (tok, "IVF%d", &ncentroids) == 1) {
            if (metric == METRIC_L2) {
@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVF *index_ivf = new IndexIVFFlat (
                coarse_quantizer, d, ncentroids, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                new IndexIVFScalarQuantizer (
                    coarse_quantizer, d, ncentroids, qt, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVFPQR *index_ivf = new IndexIVFPQR (
                coarse_quantizer, d, ncentroids, M, 8, M2, 8);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
-       } else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
+       } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
+           std::string soption = option;
+           // np to disable polysemous training
+           FAISS_THROW_IF_NOT(soption == "" || soption == "np");
            if (coarse_quantizer) {
                IndexIVFPQ *index_ivf = new IndexIVFPQ (
                    coarse_quantizer, d, ncentroids, M, 8);
                index_ivf->quantizer_trains_alone =
-                   dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-                   != nullptr;
+                   get_trains_alone (coarse_quantizer);
                index_ivf->metric_type = metric;
                index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
                del_coarse_quantizer.release ();
                index_ivf->own_fields = true;
-               index_ivf->do_polysemous_training = true;
+               index_ivf->do_polysemous_training = soption != "np";
                index_1 = index_ivf;
            } else {
                IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
-               index_pq->do_polysemous_training = true;
+               index_pq->do_polysemous_training = soption != "np";
                index_1 = index_pq;
            }
        } else if (stok == "RFlat") {
            make_IndexRefineFlat = true;
        } else {
......
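The reworked ParameterSpace::set_index_parameter above now tries each parameter name against a chain of dynamic casts and returns as soon as one index type accepts it, recursing into IndexShards and unwrapping IndexPreTransform first. A minimal sketch of how this is driven from the Python bindings (random data; the "IVF256,Flat" factory string and the sizes are only an example):

import numpy as np
import faiss

d = 64
xb = np.random.rand(20000, d).astype('float32')
index = faiss.index_factory(d, "IVF256,Flat")
index.train(xb)
index.add(xb)

ps = faiss.ParameterSpace()
ps.set_index_parameter(index, "nprobe", 16)  # handled by the IndexIVF branch
ps.set_index_parameter(index, "verbose", 1)  # also propagated to sub-indexes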
@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
 *  minimizing the number of mem copies in the result. The method
 *  do_allocation can be overloaded to allocate the result tables in
- *  the matrix type of a srcipting language like Lua or Python. */
+ *  the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
    size_t nq;    ///< nb of queries
    size_t *lims; ///< size (nq + 1)
......
@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
    nredo(1),
    verbose(false), spherical(false),
    update_index(false),
+   frozen_centroids(false),
    min_points_per_centroid(39),
    max_points_per_centroid(256),
    seed(1234)
@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    float * dis = new float[nx];
    ScopeDeleter<float> del2(dis);

+   // for redo
    float best_err = 1e50;
+   std::vector<float> best_obj;
+   std::vector<float> best_centroids;
+
+   // support input centroids
+   FAISS_THROW_IF_NOT_MSG (
+       centroids.size() % d == 0,
+       "size of provided input centroids not a multiple of dimension");
+   size_t n_input_centroids = centroids.size() / d;
+
+   if (verbose && n_input_centroids > 0) {
+       printf ("  Using %zd centroids provided as input (%sfrozen)\n",
+               n_input_centroids, frozen_centroids ? "" : "not ");
+   }
+
    double t_search_tot = 0;
    if (verbose) {
        printf("  Preprocessing in %.2f s\n",
@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    for (int redo = 0; redo < nredo; redo++) {

-       std::vector<float> buf_centroids;
-       std::vector<float> &cur_centroids =
-           nredo == 1 ? centroids : buf_centroids;
-
        if (verbose && nredo > 1) {
            printf("Outer iteration %d / %d\n", redo, nredo);
        }

-       if (cur_centroids.size() == 0) {
-           // initialize centroids with random points from the dataset
-           cur_centroids.resize (d * k);
-           std::vector<int> perm (nx);
-           rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
-#pragma omp parallel for
-           for (int i = 0; i < k ; i++)
-               memcpy (&cur_centroids[i * d], x + perm[i] * d,
-                       d * sizeof (float));
-       } else { // assume user provides some meaningful initialization
-           FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
-           FAISS_THROW_IF_NOT_MSG (nredo == 1,
-                                   "will redo with same initialization");
-       }
+       // initialize remaining centroids with random points from the dataset
+       centroids.resize (d * k);
+       std::vector<int> perm (nx);
+       rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
+       for (int i = n_input_centroids; i < k ; i++)
+           memcpy (&centroids[i * d], x + perm[i] * d,
+                   d * sizeof (float));

        if (spherical)
-           fvec_renorm_L2 (d, k, cur_centroids.data());
+           fvec_renorm_L2 (d, k, centroids.data());

        if (!index.is_trained)
-           index.train (k, cur_centroids.data());
+           index.train (k, centroids.data());

        FAISS_THROW_IF_NOT (index.ntotal == 0);
-       index.add (k, cur_centroids.data());
+       index.add (k, centroids.data());
        float err = 0;
        for (int i = 0; i < niter; i++) {
            double t0s = getmillisecs();
@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                err += dis[j];
            obj.push_back (err);

-           int nsplit = km_update_centroids (x, cur_centroids.data(),
-                                             assign, d, k, nx);
+           int nsplit = km_update_centroids (
+                 x, centroids.data(),
+                 assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);

            if (verbose) {
                printf ("  Iteration %d (%.2f s, search %.2f s): "
@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            }

            if (spherical)
-               fvec_renorm_L2 (d, k, cur_centroids.data());
+               fvec_renorm_L2 (d, k, centroids.data());

            index.reset ();
            if (update_index)
-               index.train (k, cur_centroids.data());
+               index.train (k, centroids.data());

            assert (index.ntotal == 0);
-           index.add (k, cur_centroids.data());
+           index.add (k, centroids.data());
        }
        if (verbose) printf("\n");
        if (nredo > 1) {
            if (err < best_err) {
                if (verbose)
                    printf ("Objective improved: keep new clusters\n");
-               centroids = buf_centroids;
+               best_centroids = centroids;
+               best_obj = obj;
                best_err = err;
            }
            index.reset ();
        }
    }
+   if (nredo > 1) {
+       centroids = best_centroids;
+       obj = best_obj;
+   }
}
......
@@ -28,6 +28,7 @@ struct ClusteringParameters {
    bool verbose;
    bool spherical;     ///< do we want normalized centroids?
    bool update_index;  ///< update index after each iteration?
+   bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
    int min_points_per_centroid; ///< otherwise you get a warning
    int max_points_per_centroid; ///< to limit size of dataset
......
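frozen_centroids works together with pre-loaded centroids: Clustering::train treats the first centroids.size() / d entries as provided by the caller, initializes only the remaining ones randomly, and, when the flag is set, passes their count to km_update_centroids so they are never moved. A sketch in Python (random data, arbitrary sizes; copy_array_to_vector is the helper added to python/faiss.py further down in this commit):

import numpy as np
import faiss

d, k = 32, 100
x = np.random.rand(10000, d).astype('float32')
fixed = x[:10].copy()            # 10 centroids to keep frozen

clus = faiss.Clustering(d, k)
clus.frozen_centroids = True
faiss.copy_array_to_vector(fixed.ravel(), clus.centroids)

index = faiss.IndexFlatL2(d)
clus.train(x.shape[0], faiss.swig_ptr(x), index)

centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
# centroids[:10] still equal `fixed`; the other 90 were trained normally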
@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
-   FAISS_THROW_MSG ("Can not compute reconstruct without "
-                    "knowing how to do so");
+   FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
    nlist (nlist),
    nprobe (1),
    quantizer (quantizer),
-   quantizer_trains_alone (false),
+   quantizer_trains_alone (0),
    own_fields (false),
+   clustering_index (nullptr),
    ids (nlist),
    maintain_direct_map (false)
{
@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
    nlist (0), nprobe (1), quantizer (nullptr),
-   quantizer_trains_alone (false), own_fields (false),
+   quantizer_trains_alone (0), own_fields (false),
+   clustering_index (nullptr),
    maintain_direct_map (false)
{}
@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
    if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
        if (verbose)
            printf ("IVF quantizer does not need training.\n");
-   } else if (quantizer_trains_alone) {
+   } else if (quantizer_trains_alone == 1) {
        if (verbose)
            printf ("IVF quantizer trains alone...\n");
        quantizer->train (n, x);
+       quantizer->verbose = verbose;
        FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
                                "nlist not consistent with quantizer size");
-   } else {
+   } else if (quantizer_trains_alone == 0) {
        if (verbose)
            printf ("Training IVF quantizer on %ld vectors in %dD\n",
                    n, d);

        Clustering clus (d, nlist, cp);
        quantizer->reset();
+       if (clustering_index) {
+           clus.train (n, x, *clustering_index);
+           quantizer->add (nlist, clus.centroids.data());
+       } else {
            clus.train (n, x, *quantizer);
+       }
        quantizer->is_trained = true;
+   } else if (quantizer_trains_alone == 2) {
+       if (verbose)
+           printf (
+               "Training L2 quantizer on %ld vectors in %dD%s\n",
+               n, d,
+               clustering_index ? "(user provided index)" : "");
+       FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
+       Clustering clus (d, nlist, cp);
+       if (!clustering_index) {
+           IndexFlatL2 assigner (d);
+           clus.train(n, x, assigner);
+       } else {
+           clus.train(n, x, *clustering_index);
+       }
+       if (verbose)
+           printf ("Adding centroids to quantizer\n");
+       quantizer->add (nlist, clus.centroids.data());
    }
    if (verbose)
        printf ("Training IVF residual\n");
@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
    FAISS_THROW_IF_NOT (nlist == other.nlist);
    FAISS_THROW_IF_NOT (!other.maintain_direct_map);
-   FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
-                           "this subset type is not implemented");
+   FAISS_THROW_IF_NOT_FMT (
+       subset_type == 0 || subset_type == 1 || subset_type == 2,
+       "subset type %d not implemented", subset_type);

    size_t accu_n = 0;
    size_t accu_a1 = 0;
@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                other.ntotal++;
            }
        }
+   } else if (subset_type == 1) {
+       for (long i = 0; i < n; i++) {
+           idx_t id = ids_in[i];
+           if (id % a1 == a2) {
+               ids_out.push_back (id);
+               codes_out.insert (codes_out.end(),
+                                 codes_in.begin() + i * code_size,
+                                 codes_in.begin() + (i + 1) * code_size);
+               other.ntotal++;
+           }
+       }
    } else if (subset_type == 2) {
        // see what is allocated to a1 and to a2
        size_t next_accu_n = accu_n + n;
        size_t next_accu_a1 = next_accu_n * a1 / ntotal;
        size_t i1 = next_accu_a1 - accu_a1;
-       accu_a1 = next_accu_a1;
        size_t next_accu_a2 = next_accu_n * a2 / ntotal;
        size_t i2 = next_accu_a2 - accu_a2;
-       accu_a2 = next_accu_a2;
        ids_out.insert(ids_out.end(),
                       ids_in.begin() + i1,
                       ids_in.begin() + i2);
@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                       codes_in.begin() + i1 * code_size,
                       codes_in.begin() + i2 * code_size);
        other.ntotal += i2 - i1;
+       accu_a1 = next_accu_a1;
+       accu_a2 = next_accu_a2;
    }
    accu_n += n;
}
......
@@ -47,10 +47,17 @@ struct IndexIVF: Index {
    size_t nprobe;     ///< number of probes at query time

    Index * quantizer; ///< quantizer that maps vectors to inverted lists
-   bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
+
+   /**
+    * = 0: use the quantizer as index in a kmeans training
+    * = 1: just pass on the training set to the train() of the quantizer
+    * = 2: kmeans training on a flat index + add the centroids to the quantizer
+    */
+   char quantizer_trains_alone;
    bool own_fields;   ///< whether object owns the quantizer

    ClusteringParameters cp; ///< to override default clustering params
+   Index *clustering_index; ///< to override index used during clustering

    std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
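A sketch (Python, hypothetical sizes, requires at least one GPU) of the two new training hooks declared above: clustering_index redirects the k-means assignment step to another index, while quantizer_trains_alone == 2 clusters on a temporary IndexFlatL2 and only adds the centroids to the actual quantizer:

import numpy as np
import faiss

d, nlist = 64, 1024
xt = np.random.rand(200000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# do the assignment step of k-means on the GPU(s)
clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
index.clustering_index = clustering_index  # IndexIVF does not own it:
                                           # keep the Python reference alive
index.train(xt)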
@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
            for (int j = 0; j < d; j++) {
                r[j] += centroid[j];
            }
-       }
-       else {
+       } else {
            pq.decode (code_line + ofs * pq.code_size, r);
        }
    }
@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
    FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
+
    int list_no = direct_map[key] >> 32;
    int ofs = direct_map[key] & 0xffffffff;
@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}

+void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                         float *distances, idx_t *labels,
+                                         float *reconstructed)
+{
+    long * idx = new long [n * nprobe];
+    ScopeDeleter<long> del (idx);
+    float * coarse_dis = new float [n * nprobe];
+    ScopeDeleter<float> del2 (coarse_dis);
+
+    quantizer->search (n, x, nprobe, coarse_dis, idx);
+
+    search_preassigned (n, x, k, idx, coarse_dis,
+                        distances, labels, true);
+
+    for (long i = 0; i < n; i++) {
+        for (long j = 0; j < k; j++) {
+            long ij = i * k + j;
+            idx_t res = labels[ij];
+            float *recons = reconstructed + d * (ij);
+            if (res < 0) {
+                // fill with NaNs
+                memset(recons, -1, sizeof(*recons) * d);
+            } else {
+                int list_no = res >> 32;
+                int ofs = res & 0xffffffff;
+                labels[ij] = ids[list_no][ofs];
+
+                quantizer->reconstruct (list_no, recons);
+                const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
+
+                for (size_t m = 0; m < pq.M; m++) {
+                    float * out = recons + m * pq.dsub;
+                    const float * cent = pq.get_centroids (m, code[m]);
+                    for (size_t l = 0; l < pq.dsub; l++) {
+                        out[l] += cent[l];
+                    }
+                }
+            }
+        }
+    }
+}
+
IndexIVFPQ::IndexIVFPQ ()
......
@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
                    float *distances, idx_t *labels,
                    bool store_pairs) const override;

+   /** Same as the search function, but also reconstruct approximate
+    *  vectors for the search results
+    *
+    *  @param reconstructed      size (n, k, d)
+    **/
+   void search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                float *distances, idx_t *labels,
+                                float *reconstructed);
+
    /// build precomputed table
    void precompute_table ();
......
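Usage sketch of the new call through the Python wrapper added to python/faiss.py below (random data, arbitrary sizes):

import numpy as np
import faiss

d = 32
xb = np.random.rand(20000, d).astype('float32')
xq = np.random.rand(5, d).astype('float32')

index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, 256, 4, 8)
index.train(xb)
index.add(xb)
index.nprobe = 8

D, I, R = index.search_and_reconstruct(xq, 10)
# R has shape (5, 10, d); entries for missing results (label -1) are
# filled with NaNs (the memset with -1 in the implementation above)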
@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
    const float *y, *yi;
-   explicit SimilarityL2 (const float * y): y(y) {}

+   explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/
@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
-   delete squant;
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
-   delete squant;
}

/*******************************************************************
@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           minheap_reorder (k, simi, idxi);
        }
    } else {
#pragma omp parallel for
@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           maxheap_reorder (k, simi, idxi);
        }
    }
}
@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

+       // each thread takes care of a subset of lists
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
    ntotal += nadd;
}

+namespace {

void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
        maxheap_reorder (k, simi, idxi);
    }
}

+} // anonymous namespace
+
void IndexIVFScalarQuantizer::search_preassigned (
    idx_t n, const float *x, idx_t k,
    const idx_t *idx,
......
@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
	cp python/_swigfaiss.so python/swigfaiss.py .

#############################
-# Dependencies
+# Dependencies.

-# make dep > x
-# then copy/paste from x by hand below
-# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
+dep:
+	for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
+	    cpp -MM -std=gnu++0x $$i ; \
+	done

-AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
- FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
- IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
- IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
-AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
-Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
- FaissAssert.h FaissException.h IndexFlat.h
-FaissException.o: FaissException.cpp FaissException.h
 hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
-Heap.o: Heap.cpp Heap.h
-Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
+utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
+ FaissAssert.h FaissException.h
 IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
  FaissAssert.h FaissException.h AuxIndexStructures.h
-index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
- IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
- ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
- IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
  hamming.h FaissAssert.h FaissException.h IndexFlat.h \
  AuxIndexStructures.h
-IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
- Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
- IndexFlat.h hamming.h FaissAssert.h FaissException.h \
- AuxIndexStructures.h
 IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
  Heap.h hamming.h FaissAssert.h FaissException.h
 IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
  Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
-IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
- IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
- FaissException.h
-MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
- FaissException.h Heap.h AuxIndexStructures.h
+IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
+ Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
+ IndexFlat.h hamming.h FaissAssert.h FaissException.h \
+ AuxIndexStructures.h
+Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
+ FaissAssert.h FaissException.h IndexFlat.h
+Heap.o: Heap.cpp Heap.h
+VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
+ Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
+ Clustering.h PolysemousTraining.h
+index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
+ IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
+ ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
+ IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
  ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
  FaissAssert.h FaissException.h
+MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
+ FaissException.h Heap.h AuxIndexStructures.h
+Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
 ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
  Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
  IndexFlat.h utils.h
-utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
- FaissAssert.h FaissException.h
-VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
- Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
- Clustering.h PolysemousTraining.h
+AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
+ FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
+ IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
+ IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
+AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
+IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
+ IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
+ FaissException.h
+FaissException.o: FaissException.cpp FaissException.h

clean:
......
@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
    }
}

+void IndexIDMap::range_search (idx_t n, const float *x, float radius,
+                               RangeSearchResult *result) const
+{
+    index->range_search(n, x, radius, result);
+    for (idx_t i = 0; i < result->lims[result->nq]; i++) {
+        result->labels[i] = result->labels[i] < 0 ?
+            result->labels[i] : id_map[result->labels[i]];
+    }
+}
+
namespace {

struct IDTranslatedSelector: IDSelector {
@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
    }
    FAISS_ASSERT (j == index->ntotal);
    ntotal = j;
+   id_map.resize(ntotal);
    return nremove;
}
......
@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
    /// remove ids adapted to IndexFlat
    long remove_ids(const IDSelector& sel) override;

+   void range_search (idx_t n, const float *x, float radius,
+                      RangeSearchResult *result) const override;
+
    ~IndexIDMap() override;
    IndexIDMap () {own_fields=false; index=nullptr; }
};
......
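With the override declared above, range_search results are now mapped back to the user-provided ids, as search already was. A small sketch (Python wrapper, random data):

import numpy as np
import faiss

d = 16
xb = np.random.rand(1000, d).astype('float32')
ids = np.arange(1000, dtype='int64') + 100000

index = faiss.IndexIDMap(faiss.IndexFlatL2(d))
index.add_with_ids(xb, ids)

lims, D, I = index.range_search(xb[:5], 0.3)
# I contains values >= 100000 rather than positions in the sub-index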
@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
    const float *prev_x = x;
    ScopeDeleter<float> del;

+   if (verbose) {
+       printf("IndexPreTransform::train: training chain 0 to %d\n",
+              last_untrained);
+   }
+
    for (int i = 0; i <= last_untrained; i++) {
        if (i < chain.size()) {
            VectorTransform *ltrans = chain [i];
-           if (!ltrans->is_trained)
-               ltrans->train(n, prev_x);
+           if (!ltrans->is_trained) {
+               if (verbose) {
+                   printf("   Training chain component %d/%zd\n",
+                          i, chain.size());
+                   if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
+                       opqm->verbose = true;
+                   }
+               }
+               ltrans->train (n, prev_x);
+           }
        } else {
+           if (verbose) {
+               printf("   Training sub-index\n");
+           }
            index->train (n, prev_x);
        }
        if (i == last_untrained) break;
+       if (verbose) {
+           printf("   Applying transform %d/%zd\n",
+                  i, chain.size());
+       }

        float * xt = chain[i]->apply (n, prev_x);
-       if (prev_x != x) delete prev_x;
+       if (prev_x != x) delete [] prev_x;
        prev_x = xt;
        del.set(xt);
    }
......
@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
    co.verbose = True
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True
+   assert co.shard_type in (0, 1, 2)
    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall, co)
......
@@ -121,6 +121,18 @@ def handle_Index(the_class):
                              swig_ptr(labels))
        return distances, labels

+   def replacement_search_and_reconstruct(self, x, k):
+       n, d = x.shape
+       assert d == self.d
+       distances = np.empty((n, k), dtype=np.float32)
+       labels = np.empty((n, k), dtype=np.int64)
+       recons = np.empty((n, k, d), dtype=np.float32)
+       self.search_and_reconstruct_c(n, swig_ptr(x),
+                                     k, swig_ptr(distances),
+                                     swig_ptr(labels),
+                                     swig_ptr(recons))
+       return distances, labels, recons
+
    def replacement_remove_ids(self, x):
        if isinstance(x, IDSelector):
            sel = x
@@ -167,6 +179,8 @@ def handle_Index(the_class):
    replace_method(the_class, 'range_search', replacement_range_search)
    replace_method(the_class, 'update_vectors', replacement_update_vectors,
                   ignore_missing=True)
+   replace_method(the_class, 'search_and_reconstruct',
+                  replacement_search_and_reconstruct, ignore_missing=True)

def handle_VectorTransform(the_class):
@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
    return index_cpu_to_gpu_multiple(vres, vdev, index, co)

-def vector_float_to_array(v):
-   a = np.empty(v.size(), dtype='float32')
-   memcpy(swig_ptr(a), v.data(), 4 * v.size())
+def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
+   if ngpu == -1:
+       ngpu = get_num_gpus()
+   res = [StandardGpuResources() for i in range(ngpu)]
+   index2 = index_cpu_to_gpu_multiple_py(res, index, co)
+   index2.dont_dealloc = res
+   return index2
+
+# mapping from vector names in swigfaiss.swig and the numpy dtype names
+vector_name_map = {
+   'Float': 'float32',
+   'Byte': 'uint8',
+   'Uint64': 'uint64',
+   'Long': 'int64',
+   'Int': 'int32',
+   'Double': 'float64'
+   }
+
+def vector_to_array(v):
+   """ convert a C++ vector to a numpy array """
+   classname = v.__class__.__name__
+   assert classname.endswith('Vector')
+   dtype = np.dtype(vector_name_map[classname[:-6]])
+   a = np.empty(v.size(), dtype=dtype)
+   memcpy(swig_ptr(a), v.data(), a.nbytes)
    return a

+def vector_float_to_array(v):
+   return vector_to_array(v)
+
+def copy_array_to_vector(a, v):
+   """ copy a numpy array to a vector """
+   n, = a.shape
+   classname = v.__class__.__name__
+   assert classname.endswith('Vector')
+   dtype = np.dtype(vector_name_map[classname[:-6]])
+   assert dtype == a.dtype, (
+       'cannot copy a %s array to a %s (should be %s)' % (
+           a.dtype, classname, dtype))
+   v.resize(n)
+   memcpy(v.data(), swig_ptr(a), a.nbytes)
+
class Kmeans:

    def __init__(self, d, k, niter=25, verbose=False, spherical = False):
@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):

def normalize_L2(x):
    fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
+
+def replacement_map_add(self, keys, vals):
+   n, = keys.shape
+   assert (n,) == vals.shape
+   self.add_c(n, swig_ptr(keys), swig_ptr(vals))
+
+def replacement_map_search_multiple(self, keys):
+   n, = keys.shape
+   vals = np.empty(n, dtype='uint64')
+   self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
+   return vals
+
+replace_method(MapLong2Long, 'add', replacement_map_add)
+replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
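A quick round trip with the new typed helpers (arbitrary values): vector_to_array infers the numpy dtype from the C++ vector class name, and copy_array_to_vector checks the dtype on the way in:

import numpy as np
import faiss

a = np.arange(10, dtype='int64')
v = faiss.LongVector()
faiss.copy_array_to_vector(a, v)  # 'Long' maps to int64, so this is accepted
b = faiss.vector_to_array(v)
assert np.array_equal(a, b)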
@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuAutoTune.h"
+#include <typeinfo>

#include "GpuIndex.h"
#include "../FaissAssert.h"
@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)

-GpuClonerOptions::GpuClonerOptions():
-    indicesOptions(INDICES_64_BIT),
-    useFloat16CoarseQuantizer(false),
-    useFloat16(false),
-    usePrecomputed(true),
-    reserveVecs(0),
-    storeTransposed(false),
-    verbose(0)
-{}
-
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
    GpuResources *resources;
    int device;
@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
    return cl.clone_Index(index);
}

-GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
-{}
-
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    std::vector<ToGpuCloner> sub_cloners;
@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    {}

+   void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
+                        long n, long i) {
+       if (shard_type == 2) {
+           long i0 = i * index_ivf->ntotal / n;
+           long i1 = (i + 1) * index_ivf->ntotal / n;
+
+           if(verbose)
+               printf("IndexShards shard %ld indices %ld:%ld\n",
+                      i, i0, i1);
+           index_ivf->copy_subset_to(*idx2, 2, i0, i1);
+           FAISS_ASSERT(idx2->ntotal == i1 - i0);
+       } else if (shard_type == 1) {
+           if(verbose)
+               printf("IndexShards shard %ld select modulo %ld = %ld\n",
+                      i, n, i);
+           index_ivf->copy_subset_to(*idx2, 1, n, i);
+       } else {
+           FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
+       }
+   }
+
    Index *clone_Index(const Index *index) override {
        long n = sub_cloners.size();
        if (n == 1)
@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                dynamic_cast<const faiss::IndexIVFPQ *>(index);
            auto index_ivfflat =
                dynamic_cast<const faiss::IndexIVFFlat *>(index);
-           FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
+           FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
                              "IndexShards implemented only for "
                              "IndexIVFFlat or IndexIVFPQ");
            std::vector<faiss::Index*> shards(n);

            for(long i = 0; i < n; i++) {
                // make a shallow copy
-               long i0 = i * index->ntotal / n;
-               long i1 = (i + 1) * index->ntotal / n;
-               if(verbose)
-                   printf("IndexShards shard %ld indices %ld:%ld\n",
-                          i, i0, i1);
                if(reserveVecs)
                    sub_cloners[i].reserveVecs =
                        (reserveVecs + n - 1) / n;
@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                    idx2.nprobe = index_ivfpq->nprobe;
                    idx2.use_precomputed_table = 0;
                    idx2.is_trained = index->is_trained;
-                   index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
-                   FAISS_ASSERT(idx2.ntotal == i1 - i0);
+                   copy_ivf_shard (index_ivfpq, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                } else if (index_ivfflat) {
                    faiss::IndexIVFFlat idx2(
                        index_ivfflat->quantizer, index->d,
                        index_ivfflat->nlist, index_ivfflat->metric_type);
                    idx2.nprobe = index_ivfflat->nprobe;
-                   index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
                    idx2.nprobe = index_ivfflat->nprobe;
+                   copy_ivf_shard (index_ivfflat, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                }
            }
            faiss::IndexShards *res =
                new faiss::IndexShards(index->d, true, false);
@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
        Index * index, const std::string & name, double val) const
{
-   if (DC (IndexPreTransform)) {
-       index = ix->index;
-   }
    if (DC (IndexProxy)) {
        for (int i = 0; i < ix->count(); i++)
            set_index_parameter (ix->at(i), name, val);
        return;
    }
-   if (DC (faiss::IndexShards)) {
-       for (auto sub_index : ix->shard_indexes)
-           set_index_parameter (sub_index, name, val);
-       return;
-   }
-   if (name == "nprobe") {
-       DC (GpuIndexIVF);
-       FAISS_ASSERT(ix);
-       ix->setNumProbes (int (val));
-       return;
-   }
-   if (name == "use_precomputed_table") {
-       DC (GpuIndexIVFPQ);
-       FAISS_ASSERT(ix);
-       ix->setPrecomputedCodes(bool (val));
-       return;
-   }
-   FAISS_ASSERT_MSG (false, "unknown parameter");
+   if (DC (GpuIndexIVF)) {
+       if (name == "nprobe") {
+           ix->setNumProbes (int (val));
+           return;
+       }
+   }
+   if(DC (GpuIndexIVFPQ)) {
+       if (name == "use_precomputed_table") {
+           ix->setPrecomputedCodes(bool (val));
+           return;
+       }
+   }
+
+   // maybe normal index parameters apply?
+   ParameterSpace::set_index_parameter (index, name, val);
}
......
@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}

GpuMultipleClonerOptions::GpuMultipleClonerOptions()
-   : shard(false) {
+   : shard(false),
+     shard_type(1)
+{
}

} } // namespace
@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
  /// Whether to shard the index across GPUs, versus replication
  /// across GPUs
  bool shard;
+
+ /// IndexIVF::copy_subset_to subset type
+ int shard_type;
};

} } // namespace
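How the new option is meant to be used (sketch, assumes several GPUs and arbitrary sizes): shard_type follows the subset_type convention of IndexIVF::copy_subset_to, so 1 assigns vectors by id modulo the number of shards and 2 splits the id range into contiguous chunks:

import numpy as np
import faiss

d = 64
xb = np.random.rand(100000, d).astype('float32')
cpu_index = faiss.index_factory(d, "IVF1024,Flat")
cpu_index.train(xb)
cpu_index.add(xb)

co = faiss.GpuMultipleClonerOptions()
co.shard = True    # shard across GPUs instead of replicating
co.shard_type = 2  # default is 1 (split by id modulo)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co)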
@@ -26,7 +26,7 @@ struct GpuIndexConfig {
  /// GPU device on which the index is resident
  int device;

- /// What memory space to use for primary storae.
+ /// What memory space to use for primary storage.
  /// On Pascal and above (CC 6+) architectures, allows GPUs to use
  /// more memory than is available on the GPU.
  MemorySpace memorySpace;
......
@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
  }

  index->quantizer = q;
- index->quantizer_trains_alone = false;
+ index->quantizer_trains_alone = 0;
  index->own_fields = true;
  index->cp = this->cp;
  index->ids.clear();
......
@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
  FAISS_ASSERT(index->pq.byte_per_idx == 1);
  FAISS_ASSERT(index->by_residual);
  FAISS_ASSERT(index->polysemous_ht == 0);
- ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;

  verifySettings_();
......
...@@ -23,6 +23,7 @@ CPPOBJ= GpuResources.o \ ...@@ -23,6 +23,7 @@ CPPOBJ= GpuResources.o \
IndexProxy.o \ IndexProxy.o \
StandardGpuResources.o \ StandardGpuResources.o \
GpuAutoTune.o \ GpuAutoTune.o \
GpuClonerOptions.o \
impl/RemapIndices.o \ impl/RemapIndices.o \
utils/DeviceMemory.o \ utils/DeviceMemory.o \
utils/StackDeviceMemory.o \ utils/StackDeviceMemory.o \
...@@ -123,23 +124,24 @@ clean: ...@@ -123,23 +124,24 @@ clean:
dep: dep:
for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \ for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \
$(patsubst %.o,%.cu,$(CUOBJ)); do \ $(patsubst %.o,%.cu,$(CUOBJ)); do \
echo -n $${i%/*}/ ; \ echo -n $$( dirname $$i )/ ; \
cpp -MM -std=gnu++0x $$i; \ cpp -MM -std=gnu++0x $$i; \
done done
GpuResources.cpp/GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
./GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \ utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h utils/../../FaissException.h
IndexProxy.cpp/IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \ ./IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \
../FaissAssert.h ../FaissException.h ../Clustering.h ../Index.h \ ../FaissAssert.h ../FaissException.h ../Clustering.h ../Index.h \
GpuIndexFlat.h GpuIndex.h utils/MemorySpace.h utils/../../FaissAssert.h \ GpuIndexFlat.h GpuIndex.h utils/MemorySpace.h utils/../../FaissAssert.h \
StandardGpuResources.h GpuResources.h utils/DeviceMemory.h \ StandardGpuResources.h GpuResources.h utils/DeviceMemory.h \
utils/StackDeviceMemory.h utils/DeviceUtils.h utils/StackDeviceMemory.h utils/DeviceUtils.h
StandardGpuResources.cpp/StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \ ./StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \
GpuResources.h utils/DeviceMemory.h utils/StackDeviceMemory.h \ GpuResources.h utils/DeviceMemory.h utils/StackDeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \ utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h utils/../../FaissException.h ../FaissAssert.h
GpuAutoTune.cpp/GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \ ./GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \
../Index.h GpuClonerOptions.h GpuIndicesOptions.h GpuIndex.h \ ../Index.h GpuClonerOptions.h GpuIndicesOptions.h GpuIndex.h \
utils/MemorySpace.h utils/../../FaissAssert.h \ utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h ../index_io.h \ utils/../../FaissException.h ../FaissAssert.h ../index_io.h \
...@@ -161,6 +163,8 @@ utils/DeviceUtils.o: utils/DeviceUtils.cpp utils/DeviceUtils.h \ ...@@ -161,6 +163,8 @@ utils/DeviceUtils.o: utils/DeviceUtils.cpp utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
utils/Timer.o: utils/Timer.cpp utils/Timer.h utils/DeviceUtils.h \ utils/Timer.o: utils/Timer.cpp utils/Timer.h utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
utils/MemorySpace.o: utils/MemorySpace.cpp utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h
utils/WorkerThread.o: utils/WorkerThread.cpp utils/WorkerThread.h \ utils/WorkerThread.o: utils/WorkerThread.cpp utils/WorkerThread.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
...@@ -169,12 +173,14 @@ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \ ...@@ -169,12 +173,14 @@ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \ impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \ impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \ impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \ impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/StaticUtils.h impl/../utils/StaticUtils.h
impl/Distance.o: impl/Distance.cu impl/Distance.cuh \ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \ impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \ impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \ impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \ impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/Float16.cuh \ impl/../utils/DeviceTensor-inl.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/BroadcastSum.cuh impl/L2Norm.cuh \ impl/../utils/../GpuResources.h impl/BroadcastSum.cuh impl/L2Norm.cuh \
...@@ -189,8 +195,9 @@ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \ ...@@ -189,8 +195,9 @@ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/ReductionOperators.cuh impl/../utils/ReductionOperators.cuh
impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \ impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \ impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \ impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \ impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \ impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \ impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
...@@ -200,8 +207,9 @@ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \ ...@@ -200,8 +207,9 @@ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/InvertedListAppend.o: impl/InvertedListAppend.cu \
impl/InvertedListAppend.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../../FaissAssert.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
@@ -211,7 +219,8 @@ impl/IVFBase.o: impl/IVFBase.cu impl/IVFBase.cuh impl/../GpuIndicesOptions.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/MemorySpace.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/DeviceMemory.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor-inl.cuh impl/../GpuResources.h \
impl/FlatIndex.cuh impl/../utils/Float16.cuh impl/InvertedListAppend.cuh \
impl/RemapIndices.h impl/../utils/DeviceDefs.cuh \
@@ -222,6 +231,7 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../GpuResources.h impl/FlatIndex.cuh impl/../utils/Float16.cuh \
impl/InvertedListAppend.cuh impl/IVFFlatScan.cuh impl/RemapIndices.h \
@@ -230,8 +240,9 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/Transpose.cuh
impl/IVFFlatScan.o: impl/IVFFlatScan.cu impl/IVFFlatScan.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
@@ -247,6 +258,7 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/BroadcastSum.cuh impl/Distance.cuh impl/FlatIndex.cuh \
@@ -258,42 +270,46 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/MatrixMult.cuh impl/../utils/Transpose.cuh
impl/IVFUtils.o: impl/IVFUtils.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/StaticUtils.h impl/../utils/ThrustAllocator.cuh
impl/IVFUtilsSelect1.o: impl/IVFUtilsSelect1.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
-impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
-impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
-impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
-impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
-impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
+impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
+impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
+impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
+impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
+impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
+impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
+impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
-impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
-impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
-impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
-impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
+impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
+impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/IVFUtilsSelect2.o: impl/IVFUtilsSelect2.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
-impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
-impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
-impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
-impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
-impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
+impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
+impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
+impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
+impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
+impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
+impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
+impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
-impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
-impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
-impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
-impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
+impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
+impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/ConversionOperators.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/MathOperators.cuh \
@@ -304,8 +320,9 @@ impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/MathOperators.cuh \
impl/../utils/Pair.cuh impl/../utils/WarpShuffles.cuh \
@@ -317,8 +334,9 @@ impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/MergeNetworkWarp.cuh
impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/BroadcastSum.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
@@ -329,8 +347,9 @@ impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassNoPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../GpuResources.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../GpuResources.h \
impl/../utils/DeviceMemory.h impl/PQCodeDistances.cuh \
impl/../utils/NoTypeTensor.cuh impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
@@ -342,8 +361,9 @@ impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
@@ -352,33 +372,36 @@ impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/../utils/MathOperators.cuh impl/../utils/StaticUtils.h
impl/VectorResidual.o: impl/VectorResidual.cu impl/VectorResidual.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/Float16.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../../FaissAssert.h \
impl/../utils/ConversionOperators.cuh impl/../utils/StaticUtils.h
-GpuIndex.cu/GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
+./GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h ../FaissAssert.h \
GpuResources.h utils/DeviceMemory.h utils/DeviceUtils.h
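# Note on the GpuIndex* rules: the targets were previously emitted as
# "GpuIndex.cu/GpuIndex.o"-style paths, which name a directory that does not
# exist and therefore never match the objects make actually builds; this
# rule and the four below are corrected to "./GpuIndex.o". A minimal sketch
# of regenerating one such rule, assuming nvcc's gcc-style -M flag plus a
# sed rewrite of the emitted target; this is an illustration, not the repo's
# actual recipe, and NVCCFLAGS is a placeholder variable:
#   nvcc $(NVCCFLAGS) -M GpuIndex.cu | sed 's|^[^:]*\.o *:|./GpuIndex.o:|' > GpuIndex.d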
-GpuIndexFlat.cu/GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
+./GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../IndexFlat.h ../Index.h GpuResources.h \
utils/DeviceMemory.h impl/FlatIndex.cuh impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
utils/CopyUtils.cuh utils/HostTensor.cuh utils/HostTensor-inl.cuh
-GpuIndexIVF.cu/GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
+./GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../FaissAssert.h ../IndexFlat.h ../IndexIVF.h \
../Clustering.h ../Heap.h utils/DeviceUtils.h utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
+utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
utils/DeviceTensor-inl.cuh
-GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
+./GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVF.h ../Clustering.h \
@@ -387,9 +410,10 @@ GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIn
impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh utils/CopyUtils.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh utils/Float16.cuh
-GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
+./GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVFPQ.h ../IndexIVF.h \
@@ -399,19 +423,22 @@ GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h
impl/../utils/DeviceVector.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceUtils.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/DeviceTensor-inl.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh utils/CopyUtils.cuh utils/HostTensor.cuh \
utils/HostTensor-inl.cuh
utils/Float16.o: utils/Float16.cu utils/Float16.cuh utils/../GpuResources.h \
utils/../utils/DeviceMemory.h utils/DeviceTensor.cuh utils/Tensor.cuh \
-utils/Tensor-inl.cuh utils/../../FaissAssert.h \
-utils/../../FaissException.h utils/DeviceUtils.h utils/MemorySpace.h \
-utils/DeviceTensor-inl.cuh utils/nvidia/fp16_emu.cuh
+utils/Tensor-inl.cuh utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
+utils/../../FaissException.h utils/DeviceUtils.h \
+utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
+utils/nvidia/fp16_emu.cuh
utils/MatrixMult.o: utils/MatrixMult.cu utils/MatrixMult.cuh utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
-utils/../../FaissAssert.h utils/../../FaissException.h \
-utils/DeviceUtils.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
+utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
+utils/../../FaissException.h utils/DeviceUtils.h \
+utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh
utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/BlockSelectImpl.cuh \
@@ -420,9 +447,12 @@ utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -442,9 +472,12 @@ utils/BlockSelectHalf.o: utils/BlockSelectHalf.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -464,9 +497,12 @@ utils/WarpSelectFloat.o: utils/WarpSelectFloat.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -485,9 +521,12 @@ utils/WarpSelectHalf.o: utils/WarpSelectHalf.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -507,9 +546,12 @@ utils/blockselect/BlockSelectHalf1.o: utils/blockselect/BlockSelectHalf1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -529,9 +571,12 @@ utils/blockselect/BlockSelectFloat1.o: utils/blockselect/BlockSelectFloat1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -551,9 +596,12 @@ utils/warpselect/WarpSelectHalf1.o: utils/warpselect/WarpSelectHalf1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -572,9 +620,12 @@ utils/warpselect/WarpSelectFloat1.o: utils/warpselect/WarpSelectFloat1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -593,9 +644,12 @@ utils/blockselect/BlockSelectHalf32.o: utils/blockselect/BlockSelectHalf32.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -615,9 +669,12 @@ utils/blockselect/BlockSelectFloat32.o: utils/blockselect/BlockSelectFloat32.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -637,9 +694,12 @@ utils/warpselect/WarpSelectHalf32.o: utils/warpselect/WarpSelectHalf32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -658,9 +718,12 @@ utils/warpselect/WarpSelectFloat32.o: utils/warpselect/WarpSelectFloat32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -679,9 +742,12 @@ utils/blockselect/BlockSelectHalf64.o: utils/blockselect/BlockSelectHalf64.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -701,9 +767,12 @@ utils/blockselect/BlockSelectFloat64.o: utils/blockselect/BlockSelectFloat64.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -723,9 +792,12 @@ utils/warpselect/WarpSelectHalf64.o: utils/warpselect/WarpSelectHalf64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -744,9 +816,12 @@ utils/warpselect/WarpSelectFloat64.o: utils/warpselect/WarpSelectFloat64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -765,9 +840,12 @@ utils/blockselect/BlockSelectHalf128.o: utils/blockselect/BlockSelectHalf128.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -787,9 +865,12 @@ utils/blockselect/BlockSelectFloat128.o: utils/blockselect/BlockSelectFloat128.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -809,9 +890,12 @@ utils/warpselect/WarpSelectHalf128.o: utils/warpselect/WarpSelectHalf128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -830,9 +914,12 @@ utils/warpselect/WarpSelectFloat128.o: utils/warpselect/WarpSelectFloat128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -851,9 +938,12 @@ utils/blockselect/BlockSelectHalf256.o: utils/blockselect/BlockSelectHalf256.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -873,9 +963,12 @@ utils/blockselect/BlockSelectFloat256.o: utils/blockselect/BlockSelectFloat256.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -895,9 +988,12 @@ utils/warpselect/WarpSelectHalf256.o: utils/warpselect/WarpSelectHalf256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -916,9 +1012,12 @@ utils/warpselect/WarpSelectFloat256.o: utils/warpselect/WarpSelectFloat256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -937,9 +1036,12 @@ utils/blockselect/BlockSelectHalfF512.o: utils/blockselect/BlockSelectHalfF512.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -959,9 +1061,12 @@ utils/blockselect/BlockSelectFloatF512.o: utils/blockselect/BlockSelectFloatF512
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
...@@ -981,9 +1086,12 @@ utils/warpselect/WarpSelectHalfF512.o: utils/warpselect/WarpSelectHalfF512.cu \ ...@@ -981,9 +1086,12 @@ utils/warpselect/WarpSelectHalfF512.o: utils/warpselect/WarpSelectHalfF512.cu \
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1002,9 +1110,12 @@ utils/warpselect/WarpSelectFloatF512.o: utils/warpselect/WarpSelectFloatF512.cu ...@@ -1002,9 +1110,12 @@ utils/warpselect/WarpSelectFloatF512.o: utils/warpselect/WarpSelectFloatF512.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1023,9 +1134,12 @@ utils/blockselect/BlockSelectHalfT512.o: utils/blockselect/BlockSelectHalfT512.c ...@@ -1023,9 +1134,12 @@ utils/blockselect/BlockSelectHalfT512.o: utils/blockselect/BlockSelectHalfT512.c
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1045,9 +1159,12 @@ utils/blockselect/BlockSelectFloatT512.o: utils/blockselect/BlockSelectFloatT512 ...@@ -1045,9 +1159,12 @@ utils/blockselect/BlockSelectFloatT512.o: utils/blockselect/BlockSelectFloatT512
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1067,9 +1184,12 @@ utils/warpselect/WarpSelectHalfT512.o: utils/warpselect/WarpSelectHalfT512.cu \ ...@@ -1067,9 +1184,12 @@ utils/warpselect/WarpSelectHalfT512.o: utils/warpselect/WarpSelectHalfT512.cu \
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1088,9 +1208,12 @@ utils/warpselect/WarpSelectFloatT512.o: utils/warpselect/WarpSelectFloatT512.cu ...@@ -1088,9 +1208,12 @@ utils/warpselect/WarpSelectFloatT512.o: utils/warpselect/WarpSelectFloatT512.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1109,9 +1232,12 @@ utils/blockselect/BlockSelectHalfF1024.o: utils/blockselect/BlockSelectHalfF1024 ...@@ -1109,9 +1232,12 @@ utils/blockselect/BlockSelectHalfF1024.o: utils/blockselect/BlockSelectHalfF1024
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1131,9 +1257,12 @@ utils/blockselect/BlockSelectFloatF1024.o: utils/blockselect/BlockSelectFloatF10 ...@@ -1131,9 +1257,12 @@ utils/blockselect/BlockSelectFloatF1024.o: utils/blockselect/BlockSelectFloatF10
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1153,9 +1282,12 @@ utils/warpselect/WarpSelectHalfF1024.o: utils/warpselect/WarpSelectHalfF1024.cu ...@@ -1153,9 +1282,12 @@ utils/warpselect/WarpSelectHalfF1024.o: utils/warpselect/WarpSelectHalfF1024.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1174,9 +1306,12 @@ utils/warpselect/WarpSelectFloatF1024.o: utils/warpselect/WarpSelectFloatF1024.c ...@@ -1174,9 +1306,12 @@ utils/warpselect/WarpSelectFloatF1024.o: utils/warpselect/WarpSelectFloatF1024.c
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1195,9 +1330,12 @@ utils/blockselect/BlockSelectHalfT1024.o: utils/blockselect/BlockSelectHalfT1024 ...@@ -1195,9 +1330,12 @@ utils/blockselect/BlockSelectHalfT1024.o: utils/blockselect/BlockSelectHalfT1024
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1217,9 +1355,12 @@ utils/blockselect/BlockSelectFloatT1024.o: utils/blockselect/BlockSelectFloatT10 ...@@ -1217,9 +1355,12 @@ utils/blockselect/BlockSelectFloatT1024.o: utils/blockselect/BlockSelectFloatT10
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1239,9 +1380,12 @@ utils/warpselect/WarpSelectHalfT1024.o: utils/warpselect/WarpSelectHalfT1024.cu ...@@ -1239,9 +1380,12 @@ utils/warpselect/WarpSelectHalfT1024.o: utils/warpselect/WarpSelectHalfT1024.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1260,9 +1404,12 @@ utils/warpselect/WarpSelectFloatT1024.o: utils/warpselect/WarpSelectFloatT1024.c ...@@ -1260,9 +1404,12 @@ utils/warpselect/WarpSelectFloatT1024.o: utils/warpselect/WarpSelectFloatT1024.c
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
......
...@@ -29,44 +29,107 @@ namespace faiss { namespace gpu { ...@@ -29,44 +29,107 @@ namespace faiss { namespace gpu {
namespace { namespace {
constexpr int kDefaultTileSize = 256; template <typename T>
Tensor<T, 2, true> sliceCentroids(Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
int startCentroid,
int num) {
if (startCentroid == 0 && num == centroids.getSize(0)) {
if (centroidsTransposed) {
return *centroidsTransposed;
} else {
return centroids;
}
}
if (centroidsTransposed) {
// (dim, num)
return centroidsTransposed->narrow(1, startCentroid, num);
} else {
return centroids.narrow(0, startCentroid, num);
}
}
// For each chunk of k indices, increment the index by chunk * increment
template <typename T> template <typename T>
int chooseTileSize(int tileSizeOverride, __global__ void incrementIndex(Tensor<T, 2, true> indices,
size_t numCentroids, int k,
size_t tempMemAvailable) { int increment) {
if (tileSizeOverride > 0) { for (int i = threadIdx.x; i < k; i += blockDim.x) {
return tileSizeOverride; indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment;
} }
}
size_t tileSize = // Used to update result indices in the distance computation when the number
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize; // of centroids is large and the computation is tiled over them
template <typename T>
void runIncrementIndex(Tensor<T, 2, true>& indices,
int k,
int increment,
cudaStream_t stream) {
dim3 grid(indices.getSize(1) / k, indices.getSize(0));
int block = std::min(k, 512);
while (tileSize > 64) { // should be exact
size_t memRequirement = 2 * tileSize * numCentroids * sizeof(T); FAISS_ASSERT(grid.x * k == indices.getSize(1));
if (memRequirement <= tempMemAvailable) { incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);
// This fits entirely into our temporary memory
return tileSize;
}
// Otherwise, halve the tile size cudaDeviceSynchronize();
tileSize /= 2; }
// If the inner size (dim) of the vectors is small, we want a larger query tile
// size, like 1024
void chooseTileSize(int numQueries,
int numCentroids,
int dim,
int elementSize,
size_t tempMemAvailable,
int& tileRows,
int& tileCols) {
// The matrix multiplication should be large enough to be efficient, but if it
// is too large, we seem to lose efficiency as opposed to double-streaming.
// Each tile size here defines 1/2 of the memory use due to double streaming.
// We ignore available temporary memory, as that is adjusted independently by
// the user and can thus meet these requirements (or not).
// For <= 4 GB GPUs, prefer 512 MB of usage.
// For <= 8 GB GPUs, prefer 768 MB of usage.
// Otherwise, prefer 1 GB of usage.
auto totalMem = getCurrentDeviceProperties().totalGlobalMem;
int targetUsage = 0;
if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
targetUsage = 512 * 1024 * 1024;
} else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
targetUsage = 768 * 1024 * 1024;
} else {
targetUsage = 1024 * 1024 * 1024;
} }
// We use 64 as the minimum acceptable tile size targetUsage /= 2 * elementSize;
FAISS_ASSERT(tileSize >= 64);
// FIXME: if we're running with no available temp memory, do we try // 512 seems to be a batch-size sweet spot for float32.
// and go larger based on free memory available on the device? // If the k size (vec dim) of the matrix multiplication is small (<= 32),
// increase the preferred row tile to 1024.
int preferredTileRows = 512;
if (dim <= 32) {
preferredTileRows = 1024;
}
return tileSize; tileRows = std::min(preferredTileRows, numQueries);
// tileCols is the remainder size
tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
} }
} }
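
A note on the tile-size heuristic above: it reduces to a few integer divisions. The sketch below mirrors that arithmetic on the host (standalone C++; the totalMem parameter stands in for getCurrentDeviceProperties().totalGlobalMem, and the function name is ours, not part of the library):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Sketch of the heuristic in Distance.cu's chooseTileSize: pick a target
    // temp-memory budget from total GPU memory, halve it for double-buffering,
    // then derive the centroid (column) tile from the preferred query (row) tile.
    void chooseTileSizeSketch(int numQueries, int numCentroids, int dim,
                              int elementSize, size_t totalMem,
                              int& tileRows, int& tileCols) {
      int targetUsage = 0;
      if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
        targetUsage = 512 * 1024 * 1024;   // <= 4 GB GPUs: prefer 512 MB
      } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
        targetUsage = 768 * 1024 * 1024;   // <= 8 GB GPUs: prefer 768 MB
      } else {
        targetUsage = 1024 * 1024 * 1024;  // otherwise: prefer 1 GB
      }
      targetUsage /= 2 * elementSize;      // two tile buffers are in flight

      // Small inner dim: use a taller query tile so the GEMM stays efficient
      int preferredTileRows = (dim <= 32) ? 1024 : 512;

      tileRows = std::min(preferredTileRows, numQueries);
      tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
    }

    int main() {
      int rows = 0, cols = 0;
      // 10k float32 queries vs 1M centroids, dim 128, on a 16 GB device
      chooseTileSizeSketch(10000, 1000000, 128, 4, ((size_t) 16) << 30,
                           rows, cols);
      printf("tile: %d x %d\n", rows, cols);  // prints: tile: 512 x 262144
      return 0;
    }
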
template <typename T> template <typename T>
void runL2Distance(GpuResources* resources, void runDistance(bool computeL2,
GpuResources* resources,
Tensor<T, 2, true>& centroids, Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed, Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms, Tensor<T, 1, true>* centroidNorms,
...@@ -75,8 +138,7 @@ void runL2Distance(GpuResources* resources, ...@@ -75,8 +138,7 @@ void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& outDistances, Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances) {
int tileSizeOverride = -1) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outDistances.getSize(1) == k); FAISS_ASSERT(outDistances.getSize(1) == k);
...@@ -98,9 +160,9 @@ void runL2Distance(GpuResources* resources, ...@@ -98,9 +160,9 @@ void runL2Distance(GpuResources* resources,
return; return;
} }
// If ||c||^2 is not pre-computed, calculate it // L2: If ||c||^2 is not pre-computed, calculate it
DeviceTensor<T, 1, true> cNorms; DeviceTensor<T, 1, true> cNorms;
if (!centroidNorms) { if (computeL2 && !centroidNorms) {
cNorms = std::move(DeviceTensor<T, 1, true>( cNorms = std::move(DeviceTensor<T, 1, true>(
mem, mem,
{centroids.getSize(0)}, defaultStream)); {centroids.getSize(0)}, defaultStream));
...@@ -115,68 +177,111 @@ void runL2Distance(GpuResources* resources, ...@@ -115,68 +177,111 @@ void runL2Distance(GpuResources* resources,
DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream); DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);
// ||q||^2 // ||q||^2
if (computeL2) {
runL2Norm(queries, queryNorms, true, defaultStream); runL2Norm(queries, queryNorms, true, defaultStream);
}
// // By default, aim to use 512 MB to 1 GB of memory for the processing
// Handle the problem in row tiles, to avoid excessive temporary // (scaled by total GPU memory), tiling over both queries and centroids.
// memory requests int tileRows = 0;
// int tileCols = 0;
chooseTileSize(queries.getSize(0),
centroids.getSize(0),
queries.getSize(1),
sizeof(T),
mem.getSizeAvailable(),
tileRows,
tileCols);
int numColTiles = utils::divUp(centroids.getSize(0), tileCols);
FAISS_ASSERT(k <= centroids.getSize(0)); FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use // Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1( DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream); mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2( DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream); mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] = DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2}; {&distanceBuf1, &distanceBuf2};
DeviceTensor<T, 2, true> outDistanceBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true> outDistanceBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true>* outDistanceBufs[2] =
{&outDistanceBuf1, &outDistanceBuf2};
DeviceTensor<int, 2, true> outIndexBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true> outIndexBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true>* outIndexBufs[2] =
{&outIndexBuf1, &outIndexBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice(); auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream}); streamWait(streams, {defaultStream});
int curStream = 0; int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) { // Tile over the input queries
int numQueriesForIteration = std::min(maxQueriesPerIteration, for (int i = 0; i < queries.getSize(0); i += tileRows) {
queries.getSize(0) - i); int curQuerySize = std::min(tileRows, queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView = auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration); outDistances.narrow(0, i, curQuerySize);
auto outIndexView = auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration); outIndices.narrow(0, i, curQuerySize);
auto queryView =
queries.narrow(0, i, curQuerySize);
auto queryNormView = auto queryNormView =
queryNorms.narrowOutermost(i, numQueriesForIteration); queryNorms.narrow(0, i, curQuerySize);
auto outDistanceBufRowView =
outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
auto outIndexBufRowView =
outIndexBufs[curStream]->narrow(0, 0, curQuerySize);
// Tile over the centroids
for (int j = 0; j < centroids.getSize(0); j += tileCols) {
int curCentroidSize = std::min(tileCols, centroids.getSize(0) - j);
int curColTile = j / tileCols;
// L2 distance is ||c||^2 - 2qc + ||q||^2 auto centroidsView =
sliceCentroids(centroids, centroidsTransposed, j, curCentroidSize);
// -2qc auto distanceBufView = distanceBufs[curStream]->
narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);
auto outDistanceBufColView =
outDistanceBufRowView.narrow(1, k * curColTile, k);
auto outIndexBufColView =
outIndexBufRowView.narrow(1, k * curColTile, k);
// L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
// IP: just compute qc
// (query id x dim) x (centroid id, dim)' = (query id, centroid id) // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false, runMatrixMult(distanceBufView, false,
queryView, false, queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids, centroidsView,
centroidsTransposed ? false : true, centroidsTransposed ? false : true,
-2.0f, 0.0f, useHgemm, computeL2 ? -2.0f : 1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(), resources->getBlasHandleCurrentDevice(),
streams[curStream]); streams[curStream]);
if (computeL2) {
// For L2 distance, we use this fused kernel that performs both // For L2 distance, we use this fused kernel that performs both
// adding ||c||^2 to -2qc and k-selection, so we only need two // adding ||c||^2 to -2qc and k-selection, so we only need two
// passes (one write by the gemm, one read here) over the huge // passes (one write by the gemm, one read here) over the huge
// region of output memory // region of output memory
//
// If we aren't tiling along the number of centroids, we can perform the
// output work directly
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runL2SelectMin(distanceBufView, runL2SelectMin(distanceBufView,
*centroidNorms, *centroidNorms,
outDistanceView, outDistanceView,
...@@ -189,6 +294,57 @@ void runL2Distance(GpuResources* resources, ...@@ -189,6 +294,57 @@ void runL2Distance(GpuResources* resources,
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k) // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView, outDistanceView, streams[curStream]); runSumAlongRows(queryNormView, outDistanceView, streams[curStream]);
} }
} else {
auto centroidNormsView =
centroidNorms->narrow(0, j, curCentroidSize);
// Write into our intermediate output
runL2SelectMin(distanceBufView,
centroidNormsView,
outDistanceBufColView,
outIndexBufColView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView,
outDistanceBufColView,
streams[curStream]);
}
}
} else {
// For IP, just k-select the output for this tile
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
} else {
// Write into the intermediate output
runBlockSelect(distanceBufView,
outDistanceBufColView,
outIndexBufColView,
true, k, streams[curStream]);
}
}
}
// As we're finished with processing a full set of centroids, perform the
// final k-selection
if (tileCols != centroids.getSize(0)) {
// The indices are tile-relative; for each tile of k, we need to add
// tileCols to the index
runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);
runBlockSelectPair(outDistanceBufRowView,
outIndexBufRowView,
outDistanceView,
outIndexView,
computeL2 ? false : true, k, streams[curStream]);
}
curStream = (curStream + 1) % 2; curStream = (curStream + 1) % 2;
} }
...@@ -198,98 +354,49 @@ void runL2Distance(GpuResources* resources, ...@@ -198,98 +354,49 @@ void runL2Distance(GpuResources* resources,
} }
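
To make the tiled k-selection at the end of runDistance concrete: each column tile contributes its own top-k with tile-local indices, runIncrementIndex shifts each chunk of k indices by its tile's offset, and the final runBlockSelectPair selects over the concatenated numColTiles * k candidates. A scalar sketch of that merge (plain C++, no CUDA; names and sizes are illustrative):

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      const int k = 2, tileCols = 4;

      // One query row: per-tile top-k (distance, tile-local index) pairs,
      // from two centroid tiles of tileCols = 4 centroids each
      std::vector<std::pair<float, int>> cand = {
          {0.5f, 3}, {0.9f, 0},   // tile 0 results, local ids in [0, 4)
          {0.2f, 1}, {0.7f, 2}};  // tile 1 results, local ids in [0, 4)

      // Equivalent of runIncrementIndex: chunk i of k indices gets shifted by
      // (i / k) * tileCols, turning tile-local ids into global centroid ids
      for (int i = 0; i < (int) cand.size(); ++i) {
        cand[i].second += (i / k) * tileCols;
      }

      // Equivalent of the final runBlockSelectPair (min-select for L2):
      // k smallest over all numColTiles * k candidates
      std::partial_sort(cand.begin(), cand.begin() + k, cand.end());

      for (int i = 0; i < k; ++i) {
        printf("d=%.1f id=%d\n", cand[i].first, cand[i].second);
      }
      // prints: d=0.2 id=5
      //         d=0.5 id=3
      return 0;
    }
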
template <typename T> template <typename T>
void runIPDistance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& centroids, Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed, Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms,
Tensor<T, 2, true>& queries, Tensor<T, 2, true>& queries,
int k, int k,
Tensor<T, 2, true>& outDistances, Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
int tileSizeOverride = -1) { bool ignoreOutDistances = false) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); runDistance<T>(true, // L2
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); resources,
FAISS_ASSERT(outDistances.getSize(1) == k); centroids,
FAISS_ASSERT(outIndices.getSize(1) == k); centroidsTransposed,
centroidNorms,
auto& mem = resources->getMemoryManagerCurrentDevice(); queries,
auto defaultStream = resources->getDefaultStreamCurrentDevice(); k,
outDistances,
// If we're querying against a 0-sized set, just return empty results outIndices,
if (centroids.numElements() == 0) { useHgemm,
thrust::fill(thrust::cuda::par.on(defaultStream), ignoreOutDistances);
outDistances.data(), outDistances.end(), }
Limits<T>::getMax());
thrust::fill(thrust::cuda::par.on(defaultStream),
outIndices.data(), outIndices.end(),
-1);
return;
}
//
// Handle the problem in row tiles, to avoid excessive temporary
// memory requests
//
FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
int numQueriesForIteration = std::min(maxQueriesPerIteration,
queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration);
auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration);
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false,
queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids,
centroidsTransposed ? false : true,
1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
// top-k of dot products
// (query id, top k centroids)
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
curStream = (curStream + 1) % 2;
}
streamWait({defaultStream}, streams); template <typename T>
void runIPDistance(GpuResources* resources,
Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 2, true>& queries,
int k,
Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm) {
runDistance<T>(false, // IP
resources,
centroids,
centroidsTransposed,
nullptr,
queries,
k,
outDistances,
outIndices,
useHgemm,
false);
} }
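
Both wrappers above now forward to the shared runDistance. The L2 path relies on the expansion ||q - c||^2 = ||q||^2 - 2*q.c + ||c||^2: the GEMM produces -2*q.c, runL2SelectMin folds in ||c||^2, and runSumAlongRows adds ||q||^2. A tiny standalone check of the identity (plain C++, toy values):

    #include <cstdio>

    int main() {
      // 2-d example: query q, centroid c
      float q[2] = {1.0f, 2.0f};
      float c[2] = {3.0f, -1.0f};

      // Direct squared L2 distance
      float direct = 0.0f;
      for (int i = 0; i < 2; ++i) {
        float d = q[i] - c[i];
        direct += d * d;
      }

      // Decomposed form: ||c||^2 - 2 q.c + ||q||^2, as assembled by the GEMM
      // (-2 q.c), runL2SelectMin (+ ||c||^2) and runSumAlongRows (+ ||q||^2)
      float qq = q[0] * q[0] + q[1] * q[1];    // 5
      float cc = c[0] * c[0] + c[1] * c[1];    // 10
      float qc = q[0] * c[0] + q[1] * c[1];    // 1
      float decomposed = cc - 2.0f * qc + qq;  // 13

      printf("%f == %f\n", direct, decomposed);  // prints: 13 == 13
      return 0;
    }
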
// //
...@@ -303,8 +410,7 @@ runIPDistance(GpuResources* resources, ...@@ -303,8 +410,7 @@ runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices) {
int tileSizeOverride) {
runIPDistance<float>(resources, runIPDistance<float>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -312,8 +418,7 @@ runIPDistance(GpuResources* resources, ...@@ -312,8 +418,7 @@ runIPDistance(GpuResources* resources,
k, k,
outDistances, outDistances,
outIndices, outIndices,
false, false);
tileSizeOverride);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -325,8 +430,7 @@ runIPDistance(GpuResources* resources, ...@@ -325,8 +430,7 @@ runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm) {
int tileSizeOverride) {
runIPDistance<half>(resources, runIPDistance<half>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -334,8 +438,7 @@ runIPDistance(GpuResources* resources, ...@@ -334,8 +438,7 @@ runIPDistance(GpuResources* resources,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useHgemm, useHgemm);
tileSizeOverride);
} }
#endif #endif
...@@ -348,8 +451,7 @@ runL2Distance(GpuResources* resources, ...@@ -348,8 +451,7 @@ runL2Distance(GpuResources* resources,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool ignoreOutDistances, bool ignoreOutDistances) {
int tileSizeOverride) {
runL2Distance<float>(resources, runL2Distance<float>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -359,8 +461,7 @@ runL2Distance(GpuResources* resources, ...@@ -359,8 +461,7 @@ runL2Distance(GpuResources* resources,
outDistances, outDistances,
outIndices, outIndices,
false, false,
ignoreOutDistances, ignoreOutDistances);
tileSizeOverride);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -374,8 +475,7 @@ runL2Distance(GpuResources* resources, ...@@ -374,8 +475,7 @@ runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances, bool ignoreOutDistances) {
int tileSizeOverride) {
runL2Distance<half>(resources, runL2Distance<half>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -385,8 +485,7 @@ runL2Distance(GpuResources* resources, ...@@ -385,8 +485,7 @@ runL2Distance(GpuResources* resources,
outDistances, outDistances,
outIndices, outIndices,
useHgemm, useHgemm,
ignoreOutDistances, ignoreOutDistances);
tileSizeOverride);
} }
#endif #endif
......
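
The two distance buffers plus the curStream flip in runDistance implement classic double-buffering: tile t is queued on one stream while tile t+1 is queued on the other, and each buffer is only ever reused by the same stream, so stream ordering alone guarantees the previous tile has drained. A minimal CUDA sketch of the same rotation (illustrative kernel and sizes, not the library's code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void processTile(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * 2.0f;  // stand-in for the GEMM + select work
    }

    int main() {
      const int kTile = 1 << 20;  // elements per tile
      const int kNumTiles = 8;

      float* in;
      float* out;
      float* buf[2];  // double-buffered scratch, one per stream
      cudaMalloc(&in, (size_t) kTile * kNumTiles * sizeof(float));
      cudaMalloc(&out, (size_t) kTile * kNumTiles * sizeof(float));
      cudaMalloc(&buf[0], kTile * sizeof(float));
      cudaMalloc(&buf[1], kTile * sizeof(float));

      cudaStream_t streams[2];
      cudaStreamCreate(&streams[0]);
      cudaStreamCreate(&streams[1]);

      int curStream = 0;
      for (int t = 0; t < kNumTiles; ++t) {
        // All work for tile t is queued on streams[curStream]; buf[curStream]
        // was last touched two iterations ago on this same stream, so no
        // extra synchronization is needed before reusing it
        processTile<<<kTile / 256, 256, 0, streams[curStream]>>>(
            in + (size_t) t * kTile, buf[curStream], kTile);
        cudaMemcpyAsync(out + (size_t) t * kTile, buf[curStream],
                        kTile * sizeof(float), cudaMemcpyDeviceToDevice,
                        streams[curStream]);
        curStream = (curStream + 1) % 2;
      }

      // Analogous to the final streamWait back onto the default stream
      cudaDeviceSynchronize();
      printf("done\n");
      return 0;
    }
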
...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources, ...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can // Do we care about `outDistances`? If not, we can
// take shortcuts. // take shortcuts.
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
/// Calculates brute-force inner product distance between `vectors` /// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen /// and `queries`, returning the k closest results seen
...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources, ...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources, void runIPDistance(GpuResources* resources,
...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources, ...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm);
int tileSizeOverride = -1);
void runL2Distance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors, Tensor<half, 2, true>& vectors,
...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources, ...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
int tileSizeOverride = -1);
#endif #endif
} } // namespace } } // namespace
...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
auto stream = resources_->getDefaultStreamCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice(); auto& mem = resources_->getMemoryManagerCurrentDevice();
...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf( DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize); query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) { if (exactDistance) {
// Convert outDistances back // Convert outDistances back
...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances, outDistances,
outIndices, outIndices,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectors_, vectors_,
...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input, input,
k, k,
outDistances, outDistances,
outIndices, outIndices);
tileSize);
} }
} }
} }
...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
FAISS_ASSERT(useFloat16_); FAISS_ASSERT(useFloat16_);
if (l2Distance_) { if (l2Distance_) {
...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectorsHalf_, vectorsHalf_,
...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_);
tileSize);
} }
} }
#endif #endif
...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { ...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(), rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(), devDataHalf.getSizeInBytes(),
stream); stream,
true /* reserve exactly */);
#endif #endif
} else { } else {
rawData_.append((char*) data, rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float), (size_t) dim_ * numVecs * sizeof(float),
stream); stream,
true /* reserve exactly */);
} }
num_ += numVecs; num_ += numVecs;
......
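
On the `reserve exactly` flag passed to rawData_.append above: amortized-doubling growth can leave a large flat index holding nearly twice the memory its vectors need, which matters on a GPU. A host-side sketch of the two growth policies (illustrative only; this is not the DeviceVector API):

    #include <cstddef>
    #include <cstdio>

    // Two growth policies for a buffer of capacity cap that must hold needed bytes
    size_t growDoubling(size_t cap, size_t needed) {
      while (cap < needed) cap = (cap == 0) ? 1 : cap * 2;  // amortized O(1) appends
      return cap;                                           // may overshoot ~2x
    }

    size_t growExact(size_t /*cap*/, size_t needed) {
      return needed;  // "reserve exactly": no slack, one realloc per append
    }

    int main() {
      size_t needed = (size_t) 1536 * 1024 * 1024;  // e.g. 1.5 GB of vector data
      printf("doubling: %zu MB\n", growDoubling(0, needed) >> 20);  // 2048 MB
      printf("exact:    %zu MB\n", growExact(0, needed) >> 20);     // 1536 MB
      return 0;
    }
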
...@@ -61,16 +61,14 @@ class FlatIndex { ...@@ -61,16 +61,14 @@ class FlatIndex {
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs, void query(Tensor<half, 2, true>& vecs,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#endif #endif
/// Add vectors to ourselves; the pointer passed can be on the host /// Add vectors to ourselves; the pointer passed can be on the host
......
...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs, ...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView, closestSubQDistanceView,
closestSubQIndexView, closestSubQIndexView,
// We don't care about distances // We don't care about distances
true, true);
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
} }
// Now, we have the nearest sub-q centroid for each slice of the // Now, we have the nearest sub-q centroid for each slice of the
......
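
For context, the surrounding loop in classifyAndAddVectors is the product-quantization encoding step: each (residual) vector is split into subQuantizers slices, and each slice is assigned the id of its nearest sub-quantizer centroid, with the distances themselves discarded. A scalar sketch of that assignment (plain C++, toy sizes; the common 8-bit case uses 256 sub-centroids per sub-quantizer):

    #include <cstdio>

    int main() {
      const int dim = 4, numSubQ = 2, subDim = dim / numSubQ, numSubCentroids = 2;
      float vec[dim] = {0.9f, 0.1f, 0.2f, 0.8f};
      // centroids[q][c][d]: one small codebook per sub-quantizer
      float centroids[numSubQ][numSubCentroids][subDim] = {
          {{1.0f, 0.0f}, {0.0f, 1.0f}},
          {{1.0f, 0.0f}, {0.0f, 1.0f}}};

      unsigned char code[numSubQ];  // the PQ code: one byte per slice
      for (int q = 0; q < numSubQ; ++q) {
        int best = 0;
        float bestDist = 1e30f;
        for (int c = 0; c < numSubCentroids; ++c) {
          float d2 = 0.0f;
          for (int d = 0; d < subDim; ++d) {
            float diff = vec[q * subDim + d] - centroids[q][c][d];
            d2 += diff * diff;
          }
          if (d2 < bestDist) { bestDist = d2; best = c; }
        }
        code[q] = (unsigned char) best;
      }
      printf("code = [%d, %d]\n", code[0], code[1]);  // prints: code = [0, 1]
      return 0;
    }
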
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir> template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void __global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, ...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
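
Beyond moving the constants into Limits.cuh, this change fixes the sentinel values themselves: std::numeric_limits<float>::min() is the smallest positive normal float, not the most negative one, so a max-selecting heap seeded with it would never admit negative candidates (which legitimately occur for inner-product scores). The distinction, in two lines of C++:

    #include <cstdio>
    #include <limits>

    int main() {
      // min() is the smallest positive normal; lowest() is the most negative value
      printf("min()    = %g\n", std::numeric_limits<float>::min());     // 1.17549e-38
      printf("lowest() = %g\n", std::numeric_limits<float>::lowest());  // -3.40282e+38
      return 0;
    }
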
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step // This is warp divergence central, but this is really a final step
// and happening a small number of times // and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets, inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances, ...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
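
pass2SelectLists needs to map a winning candidate's position in the flattened per-query result array back to the IVF list it came from, which is what binarySearchForBucket does over the prefix-sum offsets. A host-side sketch of that lookup (plain C++; the device code may differ in detail):

    #include <cstdio>

    // Find the bucket whose [prefixSum[b], prefixSum[b + 1]) range contains pos
    int bucketFor(const int* prefixSum, int numBuckets, int pos) {
      int lo = 0, hi = numBuckets - 1;
      while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        if (prefixSum[mid + 1] <= pos) lo = mid + 1;
        else hi = mid;
      }
      return lo;
    }

    int main() {
      // Three probed lists with 4, 2 and 5 candidates: offsets 0, 4, 6, 11
      int prefixSum[4] = {0, 4, 6, 11};
      printf("%d %d %d\n",
             bucketFor(prefixSum, 3, 3),    // 0
             bucketFor(prefixSum, 3, 4),    // 1
             bucketFor(prefixSum, 3, 10));  // 2
      return 0;
    }
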
...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu { ...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half) // T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g., // TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2) // float4, half2)
template <typename T, typename TVec, template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared> int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input, __global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true> output) { Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte; T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize); TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId(); TIndex laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize; TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x; TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize]; T rowNorm[RowTileSize];
if (lastRowTile) { if (lastRowTile) {
// We are handling the very end of the input matrix rows // We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) { for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) { if (NormLoop) {
rowNorm[0] = Math<T>::zero(); rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col]; TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val); val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val)); rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero(); rowNorm[row] = Math<T>::zero();
} }
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll #pragma unroll
for (int row = 0; row < RowTileSize; ++row) { for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col]; tmp[row] = input[rowStart + row][col];
...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
} }
} }
template <typename T, typename TVec> template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true>& input, void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true>& output, Tensor<T, 1, true, TIndex>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice(); TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8; constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \ #define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \ do { \
if (normLoop) { \ if (normLoop) { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} else { \ } else { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} \ } \
...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
// Can load using the vectorized type // Can load using the vectorized type
auto inputV = input.template castResize<TVec>(); auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1); auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else { } else {
// Can't load using the vectorized type // Can't load using the vectorized type
int dim = input.getSize(1); auto dim = input.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input, ...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output, Tensor<float, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input, ...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output, Tensor<half, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
} }
#endif #endif
......
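
The new TIndex template parameter exists because linear element offsets overflow a 32-bit int once a tensor holds more than 2^31 elements; canUseIndexType<int> presumably checks exactly this, and the callers above fall back to long indexing when it fails. The threshold is easy to hit with large flat datasets:

    #include <climits>
    #include <cstdio>

    int main() {
      // 20M vectors of dim 128 -> 2.56e9 elements: offsets no longer fit in int
      long numVecs = 20 * 1000 * 1000;
      long dim = 128;
      long elements = numVecs * dim;
      printf("%ld elements, INT_MAX = %d -> need %s indexing\n",
             elements, INT_MAX, elements > INT_MAX ? "long" : "int");
      return 0;
    }
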
...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs"); ...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions"); DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors"); DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32"); DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed"); DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed"); DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use"); DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu; using namespace faiss::gpu;
...@@ -72,7 +75,10 @@ int main(int argc, char** argv) { ...@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config; GpuIndexFlatConfig config;
config.device = dev; config.device = dev;
config.useFloat16 = FLAGS_use_float16; config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed; config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>( auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config)); new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
...@@ -90,9 +96,9 @@ int main(int argc, char** argv) { ...@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
if (FLAGS_cpu) {
float cpuTime = 0.0f; float cpuTime = 0.0f;
{
CpuTimer timer; CpuTimer timer;
index->search(numQueries, index->search(numQueries,
cpuQuery.data(), cpuQuery.data(),
...@@ -101,9 +107,8 @@ int main(int argc, char** argv) { ...@@ -101,9 +107,8 @@ int main(int argc, char** argv) {
cpuIndices.data()); cpuIndices.data());
cpuTime = timer.elapsedMilliseconds(); cpuTime = timer.elapsedMilliseconds();
}
printf("CPU time %.3f ms\n", cpuTime); printf("CPU time %.3f ms\n", cpuTime);
}
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
...@@ -131,14 +136,14 @@ int main(int argc, char** argv) { ...@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop()); CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime); printf("GPU time %.3f ms\n", gpuTime);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(), compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(), gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k, numQueries, FLAGS_k,
"", true, FLAGS_diff, false); "", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize()); CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0; return 0;
} }
...@@ -21,29 +21,47 @@ ...@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f; constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f; constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2, struct TestFlatOptions {
bool useFloat16, TestFlatOptions()
bool useTransposed, : useL2(true),
int kOverride = -1) { useFloat16(false),
int numVecs = faiss::gpu::randVal(1000, 20000); useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800); int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512); int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k, // Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a // the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16 // fairly small `k` for float16
int k = useFloat16 ? int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) : std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs); std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) { if (opt.kOverride > 0) {
k = kOverride; k = opt.kOverride;
} }
faiss::IndexFlatIP cpuIndexIP(dim); faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim); faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex = faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP; opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have // Construct on a random device to test multi-device, if we have
// multiple devices // multiple devices
...@@ -55,14 +73,14 @@ void testFlat(bool useL2, ...@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config; faiss::gpu::GpuIndexFlatConfig config;
config.device = device; config.device = device;
config.useFloat16 = useFloat16; config.useFloat16 = opt.useFloat16;
config.storeTransposed = useTransposed; config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex = faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP; (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim); std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
...@@ -70,37 +88,53 @@ void testFlat(bool useL2, ...@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data()); gpuIndex->add(numVecs, vecs.data());
std::stringstream str; std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim << " dim " << dim
<< " useFloat16 " << useFloat16 << " useFloat16 " << opt.useFloat16
<< " transposed " << useTransposed << " transposed " << opt.useTransposed
<< " numQuery " << numQuery << " numQuery " << numQuery
<< " k " << k; << " k " << k;
// To some extent, we depend upon the relative error for the test // To some extent, we depend upon the relative error for the test
// for float16 // for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are // FIXME: the fp16 bounds are
// useless when math (the accumulator) is // useless when math (the accumulator) is
// in fp16. Figure out another way to test // in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f, opt.useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f); opt.useFloat16 ? 0.65f : 0.015f);
} }
TEST(TestGpuIndexFlat, IP_Float32) { TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) { ...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) { ...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
......
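For a sense of scale in the tiling test above (a rough estimate, assuming fp32 and the dim range drawn in testFlat): a million vectors at several hundred dimensions occupy gigabytes, so the distance computation must proceed in tiles rather than as one monolithic gemm, while the output distances stay small. A quick back-of-envelope:

#include <cstdio>

int main() {
  long long numVecs = 1000000, dim = 800, numQueries = 8;
  long long dbBytes   = numVecs * dim * 4;        // ~3.2 GB of vectors
  long long distBytes = numQueries * numVecs * 4; // ~32 MB of distances
  std::printf("db: %lld MB, distances: %lld MB\n",
              dbBytes >> 20, distBytes >> 20);
}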
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h" #include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h" #include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { ...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false); copyToTest(false, false);
} }
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
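The point of this test, as far as the diff shows: with database vectors confined to the negative orthant and queries to the positive orthant, every inner product is negative, so the best match is merely the least negative one, and the GPU IP path must still agree with the CPU index in that regime. A tiny self-contained illustration:

#include <cstdio>
#include <vector>

float ip(const std::vector<float>& a, const std::vector<float>& b) {
  float s = 0.f;
  for (size_t i = 0; i < a.size(); ++i) {
    s += a[i] * b[i];
  }
  return s;
}

int main() {
  std::vector<float> db = {-0.9f, -0.1f};  // negative side
  std::vector<float> q  = { 0.5f,  0.5f};  // positive side
  std::printf("IP = %f\n", ip(db, q));     // negative: -0.5
}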
// //
// NaN tests // NaN tests
// //
......
...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) { ...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v; return v;
} }
void compareIndices(faiss::Index& refIndex, void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
const std::string& configMsg, const std::string& configMsg,
float maxRelativeError, float maxRelativeError,
float pctMaxDiff1, float pctMaxDiff1,
float pctMaxDiffN) { float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare // Compare
std::vector<float> refDistance(numQuery * k, 0); std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(), refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data()); k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0); std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(), testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data()); k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(), faiss::gpu::compareLists(refDistance.data(),
...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex, ...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN); maxRelativeError, pctMaxDiff1, pctMaxDiffN);
} }
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T> template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) { inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j]; return p[i * dim2 + j];
......
...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) { ...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1] /// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim); std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity /// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex, void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
......
...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None): ...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor) assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k) assert I.size() == (n, k)
assert I.is_contiguous() assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr() xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr() Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr() Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr), index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr), k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr)) faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I return D, I
......
...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in, ...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace } } // namespace
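The runtime k is bucketed to the next supported queue size because the warp-select queue lengths are compile-time template parameters; each BLOCK_SELECT_PAIR_CALL presumably instantiates one (type, direction, queue size) kernel, though the macro body is not shown in this diff. A standalone sketch of the same bucketing, with illustrative names:

#include <cassert>
#include <cstdio>

template <int NumWarpQ>
void selectImpl(int k) {
  // A real kernel would size its shared-memory queues with NumWarpQ.
  std::printf("k=%d handled by NumWarpQ=%d\n", k, NumWarpQ);
}

void dispatch(int k) {
  assert(k >= 1 && k <= 1024);
  if (k == 1)        selectImpl<1>(k);
  else if (k <= 32)  selectImpl<32>(k);
  else if (k <= 64)  selectImpl<64>(k);
  else if (k <= 128) selectImpl<128>(k);
  else if (k <= 256) selectImpl<256>(k);
  else if (k <= 512) selectImpl<512>(k);
  else               selectImpl<1024>(k);
}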
...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in, ...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif #endif
} } // namespace } } // namespace
...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
} }
} }
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle the trailing elements that don't fill a whole warp
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
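The loop structure above splits each row into a warp-aligned body and a tail: heap.add() requires all lanes of a warp to be active, so the main loop only covers the largest multiple of kWarpSize, and the leftover lanes take the addThreadQ path. A small sketch of that split:

#include <cstdio>

int roundDown(int x, int multiple) { return (x / multiple) * multiple; }

int main() {
  const int kWarpSize = 32;
  int n = 100;                          // elements in a row
  int limit = roundDown(n, kWarpSize);  // 96
  std::printf("full warps cover [0, %d); tail handles [%d, %d)\n",
              limit, limit, n);
}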
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys, Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys, Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif #endif
} } // namespace } } // namespace
...@@ -12,37 +12,37 @@ ...@@ -12,37 +12,37 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
this->operator=(std::move(t)); this->operator=(std::move(t));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) { if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
} }
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t)); std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner; this->state_ = t.state_; t.state_ = AllocState::NotOwner;
...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { ...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation // destructor will return the reservation
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
const IndexT sizes[Dim], const IndexT sizes[Dim],
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory); reservation_ = std::move(memory);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim], const IndexT strides[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) { cudaStream_t stream) {
if (this->data_) { if (this->data_) {
// Region must be contiguous // Region must be contiguous
......
...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu { ...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor(); __host__ ~DeviceTensor();
/// Move constructor /// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); __host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment /// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it /// Constructs a tensor of the given size, allocating memory for it
/// locally /// locally
...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally /// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary /// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager. /// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m, __host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Call to zero out memory /// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream); zero(cudaStream_t stream);
private: private:
......
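The pervasive Contig to InnerContig rename tracks a relaxed guarantee: instead of requiring full contiguity, the template flag now only promises that the innermost dimension has stride 1, which is what the strided views used for tiling need. A self-contained illustration of a layout that is inner-contiguous but not fully contiguous:

#include <cstdio>

int main() {
  const int rows = 4, cols = 3, rowStride = 8;  // rowStride > cols: padded
  float buf[rows * rowStride] = {};

  // Element (i, j) lives at i * rowStride + j. The innermost stride is 1,
  // so the view is "inner contiguous", yet it is not fully contiguous:
  // only cols of every rowStride elements per row belong to the tensor.
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      buf[i * rowStride + j] = float(i * cols + j);
    }
  }
  std::printf("element (2, 1) = %f\n", buf[2 * rowStride + 1]);
}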
...@@ -43,7 +43,7 @@ void synchronizeAllDevices() { ...@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
} }
} }
cudaDeviceProp& getDeviceProperties(int device) { const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex; static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties; static std::unordered_map<int, cudaDeviceProp> properties;
...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) { ...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second; return it->second;
} }
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) { int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock; return getDeviceProperties(device).maxThreadsPerBlock;
} }
......
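getCurrentDeviceProperties simply composes two existing helpers; the underlying getDeviceProperties memoizes one record per device behind a mutex, so repeated queries skip the driver round-trip. A minimal standalone sketch of that caching pattern; Props and queryDriver are stand-ins for cudaDeviceProp and cudaGetDeviceProperties:

#include <mutex>
#include <unordered_map>

struct Props { int maxThreadsPerBlock; };

Props queryDriver(int device) {  // stand-in for the slow driver call
  (void) device;
  return Props{1024};
}

const Props& getProps(int device) {
  static std::mutex mutex;
  static std::unordered_map<int, Props> cache;

  std::lock_guard<std::mutex> guard(mutex);
  auto it = cache.find(device);
  if (it == cache.end()) {
    it = cache.emplace(device, queryDriver(device)).first;
  }
  return it->second;
}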
...@@ -31,7 +31,10 @@ int getNumDevices(); ...@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices(); void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device /// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device); const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU /// Returns the maximum number of threads available for the given GPU
/// device /// device
......
...@@ -10,18 +10,18 @@ ...@@ -10,18 +10,18 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() : HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_; delete[] this->data_;
...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { ...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim]) : const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) : cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) { state_(AllocState::Owner) {
// Only contiguous arrays handled for now // Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous()); FAISS_ASSERT(t.isContiguous());
...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( ...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
} }
/// Call to zero out memory /// Call to zero out memory
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous // Region must be contiguous
FAISS_ASSERT(this->isContiguous()); FAISS_ASSERT(this->isContiguous());
...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { ...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ T __host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements(); auto size = this->numElements();
FAISS_ASSERT(size == t.numElements()); FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0); FAISS_ASSERT(size > 0);
if (Contig) { if (InnerContig) {
auto a = this->data(); auto a = this->data();
auto b = t.data(); auto b = t.data();
......
...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu { ...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it /// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to /// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream. /// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Call to zero out memory /// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero(); __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors /// Returns the maximum difference seen between two tensors
__host__ T __host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const; maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal? /// Are the two tensors exactly equal?
__host__ bool __host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0); return (maxDiff(t) == (T) 0);
} }
......
...@@ -24,11 +24,12 @@ struct Limits { ...@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half // constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max(); constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <> template <>
struct Limits<float> { struct Limits<float> {
static __device__ __host__ inline float getMin() { static __device__ __host__ inline float getMin() {
return -kFloatMax; return kFloatMin;
} }
static __device__ __host__ inline float getMax() { static __device__ __host__ inline float getMax() {
return kFloatMax; return kFloatMax;
...@@ -55,8 +56,8 @@ struct Limits<half> { ...@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max(); constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <> template <>
struct Limits<int> { struct Limits<int> {
......
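The switch from -kFloatMax to lowest() is worth spelling out, since numeric_limits::min() means different things for integral and floating-point types: for float, min() is the smallest positive normal value, while lowest() is the most negative finite value (equal to -max()); for int the two coincide. A quick check:

#include <cstdio>
#include <limits>

int main() {
  std::printf("float min()    = %g\n", std::numeric_limits<float>::min());
  std::printf("float lowest() = %g\n", std::numeric_limits<float>::lowest());
  std::printf("int   min()    = %d\n", std::numeric_limits<int>::min());
  std::printf("int   lowest() = %d\n", std::numeric_limits<int>::lowest());
}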
...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK); FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN); FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in // Now, we have to represent the matrix multiplication in
// column-major layout // column-major layout
T* pA = transC ? a.data() : b.data(); T* pA = transC ? a.data() : b.data();
...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1); int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1); int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getSize(1) : a.getSize(1); int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getSize(1); int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2); int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2); int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getSize(2) : a.getSize(2); int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getSize(2); int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)}); HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)}); HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2); size_t aOffset = a.getStride(0);
size_t bOffset = b.getSize(1) * b.getSize(2); size_t bOffset = b.getStride(0);
size_t cOffset = c.getSize(1) * c.getSize(2); size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) { for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
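Taking lda/ldb/ldc (and the batch offsets) from getStride rather than getSize is what lets these gemm wrappers run on non-packed views, e.g. tiles carved out of a larger allocation: the BLAS leading dimension is the distance in elements between consecutive rows in memory, not the logical width. A hedged standalone sketch of the distinction (no cuBLAS call, just the addressing):

#include <cstdio>

int main() {
  const int bigCols = 100;           // parent allocation width
  const int m = 4, n = 3;            // logical tile shape (m x n)
  float parent[10 * bigCols] = {};

  // A tile starting at (2, 5): element (i, j) is at (2+i)*bigCols + (5+j).
  float* tile = parent + 2 * bigCols + 5;
  int ld = bigCols;                  // the tile's row stride, not n

  tile[1 * ld + 2] = 7.f;            // write tile(1, 2)
  std::printf("%f\n", parent[3 * bigCols + 7]);  // same element: 7
  (void) m; (void) n;
}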
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int> template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor { class NoTypeTensor {
public: public:
NoTypeTensor() NoTypeTensor()
...@@ -25,7 +25,7 @@ class NoTypeTensor { ...@@ -25,7 +25,7 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t) NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()), : mem_(t.data()),
typeSize_(sizeof(T)) { typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
...@@ -87,13 +87,14 @@ class NoTypeTensor { ...@@ -87,13 +87,14 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() { Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_); FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_); return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
} }
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) { NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_; char* newPtr = (char*) mem_;
if (start > 0) { if (start > 0) {
...@@ -110,7 +111,7 @@ class NoTypeTensor { ...@@ -110,7 +111,7 @@ class NoTypeTensor {
} }
} }
return NoTypeTensor<Dim, Contig, IndexT>( return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_); newPtr, typeSize_, newSize, stride_);
} }
......
...@@ -8,16 +8,16 @@ ...@@ -8,16 +8,16 @@
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#include "../../FaissAssert.h" #include "../GpuFaissAssert.h"
#include "DeviceUtils.h" #include "DeviceUtils.h"
#include <limits> #include <limits>
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor() Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
: data_(nullptr) { : data_(nullptr) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -27,12 +27,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor() ...@@ -27,12 +27,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor()
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>& Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
data_ = t.data_; t.data_ = nullptr; data_ = t.data_; t.data_ = nullptr;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
stride_[i] = t.stride_[i]; t.stride_[i] = 0; stride_[i] = t.stride_[i]; t.stride_[i] = 0;
...@@ -42,10 +42,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -42,10 +42,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>:: Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, const IndexT sizes[Dim]) Tensor(DataPtrType data, const IndexT sizes[Dim])
: data_(data) { : data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -60,13 +60,13 @@ Tensor(DataPtrType data, const IndexT sizes[Dim]) ...@@ -60,13 +60,13 @@ Tensor(DataPtrType data, const IndexT sizes[Dim])
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>:: Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, std::initializer_list<IndexT> sizes) Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
: data_(data) { : data_(data) {
assert(sizes.size() == Dim); GPU_FAISS_ASSERT(sizes.size() == Dim);
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
int i = 0; int i = 0;
...@@ -81,10 +81,10 @@ Tensor(DataPtrType data, std::initializer_list<IndexT> sizes) ...@@ -81,10 +81,10 @@ Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim]) DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
: data_(data) { : data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -95,22 +95,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor( ...@@ -95,22 +95,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ void __host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyFrom(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled"); // The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and // Size must be the same (since dimensions are checked and
// continuity is assumed, we need only check total number of // continuity is assumed, we need only check total number of
// elements) // elements)
FAISS_ASSERT(this->numElements() == t.numElements()); GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) { if (t.numElements() > 0) {
FAISS_ASSERT(this->data_); GPU_FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data()); GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_); int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data()); int tDev = getDeviceForAddress(t.data());
...@@ -133,22 +134,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom( ...@@ -133,22 +134,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ void __host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyTo(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled"); // The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and // Size must be the same (since dimensions are checked and
// continuity is assumed, we need only check total number of // continuity is assumed, we need only check total number of
// elements) // elements)
FAISS_ASSERT(this->numElements() == t.numElements()); GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) { if (t.numElements() > 0) {
FAISS_ASSERT(this->data_); GPU_FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data()); GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_); int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data()); int tDev = getDeviceForAddress(t.data());
...@@ -171,62 +173,79 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo( ...@@ -171,62 +173,79 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim> template <typename OtherT, int OtherDim>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isSame( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSame(
const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const { const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) { if (Dim != OtherDim) {
return false; return false;
} }
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) { if (this->getSize(i) != rhs.getSize(i)) {
return false; return false;
} }
if (!Contig) { if (this->getStride(i) != rhs.getStride(i)) {
if (stride_[i] != rhs.stride_[i]) {
return false; return false;
} }
} }
return true;
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename OtherT, int OtherDim>
__host__ __device__ bool
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSameSize(
const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (this->getSize(i) != rhs.getSize(i)) {
return false;
}
} }
return true; return true;
} }
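A hedged sketch of the distinction the new isSameSize() draws (ptrA and ptrC are hypothetical device buffers, not part of this diff):

// isSameSize() compares extents only; isSame() also compares strides.
Tensor<float, 2, true> a(ptrA, {8, 16});  // contiguous; strides {16, 1}
auto b = a.narrow(1, 0, 8);               // sizes {8, 8}, strides still {16, 1}
Tensor<float, 2, true> c(ptrC, {8, 8});   // sizes {8, 8}, strides {8, 1}
bool sameExtents = b.isSameSize(c);       // true: extents match
bool sameLayout  = b.isSame(c);           // false: strides differ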
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_); reinterpret_cast<U*>(data_), size_, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() const {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_); reinterpret_cast<U*>(data_), size_, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T); constexpr int kMultiple = sizeof(U) / sizeof(T);
assert(canCastResize<U>()); GPU_FAISS_ASSERT(canCastResize<U>());
IndexT newSize[Dim]; IndexT newSize[Dim];
IndexT newStride[Dim]; IndexT newStride[Dim];
...@@ -239,24 +258,24 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() { ...@@ -239,24 +258,24 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() {
newStride[Dim - 1] = 1; // this is the same as the old stride newStride[Dim - 1] = 1; // this is the same as the old stride
newSize[Dim - 1] = size_[Dim - 1] / kMultiple; newSize[Dim - 1] = size_[Dim - 1] / kMultiple;
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), newSize, newStride); reinterpret_cast<U*>(data_), newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() const {
return const_cast<Tensor<T, Dim, Contig, IndexT, PtrTraits>*>(this)-> return const_cast<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>*>(this)->
castResize<U>(); castResize<U>();
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canCastResize() const {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T); constexpr int kMultiple = sizeof(U) / sizeof(T);
...@@ -279,13 +298,13 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const { ...@@ -279,13 +298,13 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
return true; return true;
} }
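A minimal castResize() usage sketch, assuming a suitably aligned buffer whose innermost extent is a multiple of 4 (ptr is hypothetical):

Tensor<float, 2, true> t(ptr, {128, 256});  // strides {256, 1}
if (t.canCastResize<float4>()) {
  auto v = t.castResize<float4>();          // sizes become {128, 64}
  // each v[i][j] now covers four consecutive floats, e.g. for vectorized loads
}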
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT> template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits> __host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castIndexType() const {
if (sizeof(NewIndexT) < sizeof(IndexT)) { if (sizeof(NewIndexT) < sizeof(IndexT)) {
assert(this->canCastIndexType<NewIndexT>()); GPU_FAISS_ASSERT(this->canUseIndexType<NewIndexT>());
} }
NewIndexT newSize[Dim]; NewIndexT newSize[Dim];
...@@ -295,15 +314,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const { ...@@ -295,15 +314,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const {
newStride[i] = (NewIndexT) stride_[i]; newStride[i] = (NewIndexT) stride_[i];
} }
return Tensor<T, Dim, Contig, NewIndexT, PtrTraits>( return Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT> template <typename NewIndexT>
__host__ bool __host__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canUseIndexType() const {
static_assert(sizeof(size_t) >= sizeof(IndexT), static_assert(sizeof(size_t) >= sizeof(IndexT),
"index size too large"); "index size too large");
static_assert(sizeof(size_t) >= sizeof(NewIndexT), static_assert(sizeof(size_t) >= sizeof(NewIndexT),
...@@ -313,16 +332,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { ...@@ -313,16 +332,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
// FIXME: maybe also consider offset in bytes? multiply by sizeof(T)? // FIXME: maybe also consider offset in bytes? multiply by sizeof(T)?
size_t maxOffset = 0; size_t maxOffset = 0;
if (Contig) {
maxOffset = (size_t) size_[0] * (size_t) stride_[0];
} else {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i]; size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i];
if (curMaxOffset > maxOffset) { if (curMaxOffset > maxOffset) {
maxOffset = curMaxOffset; maxOffset = curMaxOffset;
} }
} }
}
if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) { if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) {
return false; return false;
...@@ -331,23 +346,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { ...@@ -331,23 +346,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ IndexT __host__ __device__ size_t
Tensor<T, Dim, Contig, IndexT, PtrTraits>::numElements() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::numElements() const {
long size = getSize(0); size_t size = (size_t) getSize(0);
for (int i = 1; i < Dim; ++i) { for (int i = 1; i < Dim; ++i) {
size *= getSize(i); size *= (size_t) getSize(i);
} }
return size; return size;
} }
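Worked example of why the return type widens from IndexT to size_t: with the default IndexT = int, a tensor of sizes {2048, 2048, 1024} holds 2048 * 2048 * 1024 = 2^32 elements, which wraps a 32-bit accumulator to 0; accumulating in size_t preserves the count, and the byte size (2^34 for float data) stays representable as well.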
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguous() const {
long prevSize = 1; long prevSize = 1;
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
...@@ -363,10 +378,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const { ...@@ -363,10 +378,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) { if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true; return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) && } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
...@@ -377,10 +392,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const { ...@@ -377,10 +392,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
return false; return false;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) { if (!isConsistentlySized(i)) {
return false; return false;
...@@ -390,23 +405,28 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const { ...@@ -390,23 +405,28 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguousDim(int i) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguousDim(int i) const {
return (i == Dim - 1) || // just in case return (i == Dim - 1) || // just in case
((i < Dim - 1) && ((i < Dim - 1) &&
((getStride(i) / getStride(i + 1)) == getSize(i + 1))); ((getStride(i) / getStride(i + 1)) == getSize(i + 1)));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const { int dim2) const {
assert(dim1 >= 0 && dim1 < Dim); GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim);
assert(dim1 >= 0 && dim2 < Dim); GPU_FAISS_ASSERT(dim2 >= 0 && dim2 < Dim);
static_assert(!Contig, "cannot transpose contiguous arrays");
// If a tensor is innermost contiguous, one cannot transpose the innermost
// dimension
if (InnerContig) {
GPU_FAISS_ASSERT(dim1 != Dim - 1 && dim2 != Dim - 1);
}
IndexT newSize[Dim]; IndexT newSize[Dim];
IndexT newStride[Dim]; IndexT newStride[Dim];
...@@ -424,14 +444,14 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1, ...@@ -424,14 +444,14 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1,
newStride[dim1] = newStride[dim2]; newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp; newStride[dim2] = tmp;
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(data_, newSize, newStride); return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data_, newSize, newStride);
} }
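A hedged sketch of the relaxed transpose rule (t and ptr hypothetical): any two non-innermost dimensions may now be swapped on an InnerContig tensor; only the stride-1 innermost dimension is pinned.

Tensor<float, 3, true> t(ptr, {4, 5, 6});  // innermost dimension is 2
auto u = t.transpose(0, 1);                // OK: sizes become {5, 4, 6}
// t.transpose(1, 2) would trip the GPU_FAISS_ASSERT above, since the
// innermost dimension must stay innermost for InnerContig tensors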
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension // Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim"); static_assert(NewDim > Dim, "Can only upcast to greater dim");
...@@ -452,15 +472,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() { ...@@ -452,15 +472,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension // Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim"); static_assert(NewDim > Dim, "Can only upcast to greater dim");
...@@ -479,15 +499,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() { ...@@ -479,15 +499,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension // Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim"); static_assert(NewDim < Dim, "Can only downcast to lesser dim");
...@@ -497,7 +517,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { ...@@ -497,7 +517,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
// them). // them).
for (int i = 0; i < Dim - NewDim; ++i) { for (int i = 0; i < Dim - NewDim; ++i) {
bool cont = isContiguousDim(i); bool cont = isContiguousDim(i);
assert(cont); GPU_FAISS_ASSERT(cont);
} }
IndexT newSize[NewDim]; IndexT newSize[NewDim];
...@@ -524,15 +544,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { ...@@ -524,15 +544,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension // Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim"); static_assert(NewDim < Dim, "Can only downcast to lesser dim");
...@@ -541,7 +561,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { ...@@ -541,7 +561,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
// in all of the dimensions we are collapsing (no padding in // in all of the dimensions we are collapsing (no padding in
// them). // them).
for (int i = NewDim; i < Dim; ++i) { for (int i = NewDim; i < Dim; ++i) {
assert(isContiguousDim(i)); GPU_FAISS_ASSERT(isContiguousDim(i));
} }
IndexT newSize[NewDim]; IndexT newSize[NewDim];
...@@ -567,15 +587,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { ...@@ -567,15 +587,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(DataPtrType at) {
static_assert(SubDim >= 1 && SubDim < Dim, static_assert(SubDim >= 1 && SubDim < Dim,
"can only create view of lesser dim"); "can only create view of lesser dim");
...@@ -587,89 +607,76 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) { ...@@ -587,89 +607,76 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) {
viewStrides[i] = stride_[Dim - SubDim + i]; viewStrides[i] = stride_[Dim - SubDim + i];
} }
return Tensor<T, SubDim, Contig, IndexT, PtrTraits>( return Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>(
at, viewSizes, viewStrides); at, viewSizes, viewStrides);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view() {
return view<SubDim>(data_); return view<SubDim>(data_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrowOutermost(IndexT start, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrowOutermost(IndexT start,
IndexT size) { IndexT size) {
DataPtrType newData = data_; return this->narrow(0, start, size);
if (start > 0) {
newData += start * stride_[0];
}
IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) {
if (i == 0) {
assert(start + size <= size_[0]);
newSize[i] = size;
} else {
newSize[i] = size_[i];
}
}
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(newData, newSize, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrow(int dim, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrow(int dim,
IndexT start, IndexT start,
IndexT size) { IndexT size) {
DataPtrType newData = data_; DataPtrType newData = data_;
GPU_FAISS_ASSERT(start >= 0 &&
start < size_[dim] &&
(start + size) <= size_[dim]);
if (start > 0) { if (start > 0) {
newData += start * stride_[dim]; newData += (size_t) start * stride_[dim];
} }
IndexT newSize[Dim]; IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (i == dim) { if (i == dim) {
assert(start + size <= size_[dim]); GPU_FAISS_ASSERT(start + size <= size_[dim]);
newSize[i] = size; newSize[i] = size;
} else { } else {
newSize[i] = size_[i]; newSize[i] = size_[i];
} }
} }
// The narrowed tensor is not necessarily contiguous // If we were innermost contiguous before, we are still innermost contiguous
return Tensor<T, Dim, false, IndexT, PtrTraits>(newData, newSize, stride_); return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(newData, newSize, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(
std::initializer_list<IndexT> sizes) { std::initializer_list<IndexT> sizes) {
static_assert(Contig, "on contiguous tensors only"); GPU_FAISS_ASSERT(this->isContiguous());
assert(sizes.size() == NewDim); GPU_FAISS_ASSERT(sizes.size() == NewDim);
// The total size of the new view must be the same as the total size // The total size of the new view must be the same as the total size
// of the old view // of the old view
size_t curSize = numElements(); size_t curSize = numElements();
size_t newSize = 1; size_t newSize = 1;
for (auto s : sizes) { for (auto s : sizes) {
newSize *= s; newSize *= s;
} }
assert(curSize == newSize); GPU_FAISS_ASSERT(curSize == newSize);
return Tensor<T, NewDim, true, IndexT, PtrTraits>(data(), sizes); return Tensor<T, NewDim, true, IndexT, PtrTraits>(data(), sizes);
} }
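A short sketch of the reshaping view() under the new runtime check (ptr hypothetical): element counts must agree, and the tensor must satisfy isContiguous() at runtime rather than a compile-time Contig flag.

Tensor<float, 2, true> t(ptr, {6, 4});  // 24 elements, fully contiguous
auto v = t.view<3>({2, 3, 4});          // also 24 elements; asserts otherwise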
......
...@@ -24,7 +24,7 @@ namespace faiss { namespace gpu { ...@@ -24,7 +24,7 @@ namespace faiss { namespace gpu {
/// Our tensor type /// Our tensor type
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig, bool InnerContig,
typename IndexT, typename IndexT,
template <typename U> class PtrTraits> template <typename U> class PtrTraits>
class Tensor; class Tensor;
...@@ -58,7 +58,7 @@ struct DefaultPtrTraits { ...@@ -58,7 +58,7 @@ struct DefaultPtrTraits {
- `T` is the contained type (e.g., `float`) - `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank - `Dim` is the tensor rank
- If `Contig` is true, then the tensor is assumed to be - If `InnerContig` is true, then the tensor is assumed to be innermost
- contiguous, and only operations that make sense on contiguous - contiguous, and only operations that make sense on contiguous
- arrays are allowed (e.g., no transpose). Strides are still - arrays are allowed (e.g., no transpose). Strides are still
- calculated, but innermost stride is assumed to be 1. - calculated, but innermost stride is assumed to be 1.
...@@ -71,7 +71,7 @@ struct DefaultPtrTraits { ...@@ -71,7 +71,7 @@ struct DefaultPtrTraits {
*/ */
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class Tensor { class Tensor {
...@@ -79,28 +79,28 @@ class Tensor { ...@@ -79,28 +79,28 @@ class Tensor {
enum { NumDim = Dim }; enum { NumDim = Dim };
typedef T DataType; typedef T DataType;
typedef IndexT IndexType; typedef IndexT IndexType;
enum { IsContig = Contig }; enum { IsInnerContig = InnerContig };
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef Tensor<T, Dim, Contig, IndexT, PtrTraits> TensorType; typedef Tensor<T, Dim, InnerContig, IndexT, PtrTraits> TensorType;
/// Default constructor /// Default constructor
__host__ __device__ Tensor(); __host__ __device__ Tensor();
/// Copy constructor /// Copy constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t)
= default; = default;
/// Move constructor /// Move constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t)
= default; = default;
/// Assignment /// Assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t) = default; operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) = default;
/// Move assignment /// Move assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructor that calculates strides with no padding /// Constructor that calculates strides with no padding
__host__ __device__ Tensor(DataPtrType data, __host__ __device__ Tensor(DataPtrType data,
...@@ -116,28 +116,33 @@ class Tensor { ...@@ -116,28 +116,33 @@ class Tensor {
const IndexT strides[Dim]); const IndexT strides[Dim]);
/// Copies a tensor into ourselves; sizes must match /// Copies a tensor into ourselves; sizes must match
__host__ void copyFrom(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ void copyFrom(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Copies ourselves into a tensor; sizes must match /// Copies ourselves into a tensor; sizes must match
__host__ void copyTo(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ void copyTo(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Returns true if the two tensors are of the same dimensionality, /// Returns true if the two tensors are of the same dimensionality,
/// size and stride. /// size and stride.
template <int OtherDim> template <typename OtherT, int OtherDim>
__host__ __device__ bool __host__ __device__ bool
isSame(const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const; isSame(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Returns true if the two tensors are of the same dimensionality and size
template <typename OtherT, int OtherDim>
__host__ __device__ bool
isSameSize(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and /// Cast to a tensor of a different type of the same size and
/// stride. U and our type T must be of the same size /// stride. U and our type T must be of the same size
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> cast(); __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast();
/// Const version of `cast` /// Const version of `cast`
template <typename U> template <typename U>
__host__ __device__ __host__ __device__
const Tensor<U, Dim, Contig, IndexT, PtrTraits> cast() const; const Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast() const;
/// Cast to a tensor of a different type which is potentially a /// Cast to a tensor of a different type which is potentially a
/// different size than our type T. Tensor must be aligned and the /// different size than our type T. Tensor must be aligned and the
...@@ -146,11 +151,11 @@ class Tensor { ...@@ -146,11 +151,11 @@ class Tensor {
/// must be contiguous. The stride of all outer dimensions must be a /// must be contiguous. The stride of all outer dimensions must be a
/// multiple of sizeof(U) / sizeof(T) as well. /// multiple of sizeof(U) / sizeof(T) as well.
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> castResize(); __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> castResize();
/// Const version of `castResize` /// Const version of `castResize`
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
castResize() const; castResize() const;
/// Returns true if we can castResize() this tensor to the new type /// Returns true if we can castResize() this tensor to the new type
...@@ -161,13 +166,13 @@ class Tensor { ...@@ -161,13 +166,13 @@ class Tensor {
/// Fails if size or stride entries are not representable in the new /// Fails if size or stride entries are not representable in the new
/// IndexT. /// IndexT.
template <typename NewIndexT> template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits> __host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
castIndexType() const; castIndexType() const;
/// Returns true if we can castIndexType() this tensor to the new /// Returns true if we can use this indexing type to access all elements
/// index type /// of this tensor
template <typename NewIndexT> template <typename NewIndexT>
__host__ bool canCastIndexType() const; __host__ bool canUseIndexType() const;
/// Returns a raw pointer to the start of our data. /// Returns a raw pointer to the start of our data.
__host__ __device__ inline DataPtrType data() { __host__ __device__ inline DataPtrType data() {
...@@ -230,12 +235,12 @@ class Tensor { ...@@ -230,12 +235,12 @@ class Tensor {
/// Returns the total number of elements contained within our data /// Returns the total number of elements contained within our data
/// (product of `getSize(i)`) /// (product of `getSize(i)`)
__host__ __device__ IndexT numElements() const; __host__ __device__ size_t numElements() const;
/// If we are contiguous, returns the total size in bytes of our /// If we are contiguous, returns the total size in bytes of our
/// data /// data
__host__ __device__ size_t getSizeInBytes() const { __host__ __device__ size_t getSizeInBytes() const {
return (size_t) numElements() * sizeof(T); return numElements() * sizeof(T);
} }
/// Returns the size array. /// Returns the size array.
...@@ -273,21 +278,21 @@ class Tensor { ...@@ -273,21 +278,21 @@ class Tensor {
/// dimensions given. Does not actually move elements; transposition /// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays. /// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts. /// If the dimensions are not valid, asserts.
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
transpose(int dim1, int dim2) const; transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension /// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1 /// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastOuter(); upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension /// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1 /// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]` /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastInner(); upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension /// Downcast a tensor of dimension `D` to some tensor of dimension
...@@ -295,46 +300,45 @@ class Tensor { ...@@ -295,46 +300,45 @@ class Tensor {
/// padding on the leading dimensions. /// padding on the leading dimensions.
template <int NewDim> template <int NewDim>
__host__ __device__ __host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastOuter(); Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension /// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is /// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions. /// padding on the leading dimensions.
template <int NewDim> template <int NewDim>
__host__ __device__ __host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastInner(); Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice /// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`. /// of this tensor, starting at `at`.
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view(DataPtrType at); view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice /// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins /// of this tensor, starting where our data begins
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view(); view();
/// Returns a tensor of the same dimension that is a view of the /// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the /// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size) /// elements in the range [start, start + size)
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
narrowOutermost(IndexT start, IndexT size); narrowOutermost(IndexT start, IndexT size);
/// Returns a tensor of the same dimension that is a view of the /// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the /// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size). /// elements in the range [start, start + size).
/// Can occur in an arbitrary dimension, and is possibly /// Can occur in an arbitrary dimension
/// non-contiguous __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits>
narrow(int dim, IndexT start, IndexT size); narrow(int dim, IndexT start, IndexT size);
/// Returns a view of the given tensor expressed as a tensor of a /// Returns a view of the given tensor expressed as a tensor of a
/// different number of dimensions. /// different number of dimensions.
/// Only works if we are contiguous. /// Only works if we are contiguous.
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
view(std::initializer_list<IndexT> sizes); view(std::initializer_list<IndexT> sizes);
protected: protected:
...@@ -352,21 +356,21 @@ class Tensor { ...@@ -352,21 +356,21 @@ class Tensor {
namespace detail { namespace detail {
template <typename IndexType> template <typename IndexType>
bool canCastIndexType() { bool canUseIndexType() {
return true; return true;
} }
template <typename IndexType, typename T, typename... U> template <typename IndexType, typename T, typename... U>
bool canCastIndexType(const T& arg, const U&... args) { bool canUseIndexType(const T& arg, const U&... args) {
return arg.canCastIndexType<IndexType>() && return arg.canUseIndexType<IndexType>() &&
canCastIndexType(args...); canUseIndexType(args...);
} }
} // namespace detail } // namespace detail
template <typename IndexType, typename... T> template <typename IndexType, typename... T>
bool canCastIndexType(const T&... args) { bool canUseIndexType(const T&... args) {
return detail::canCastIndexType(args...); return detail::canUseIndexType(args...);
} }
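A sketch of the dispatch pattern the renamed helper supports (copyKernel, input, output, grid, block, and stream are hypothetical; the pattern mirrors runTransposeAny below):

if (canUseIndexType<int>(input, output)) {
  // 32-bit indexing: cheaper div/mod on device
  copyKernel<float, int><<<grid, block, 0, stream>>>(input, output);
} else {
  // 64-bit indexing: required once offsets exceed int range
  copyKernel<float, long><<<grid, block, 0, stream>>>(input, output);
}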
namespace detail { namespace detail {
...@@ -464,7 +468,7 @@ class SubTensor<TensorType, 0, PtrTraits> { ...@@ -464,7 +468,7 @@ class SubTensor<TensorType, 0, PtrTraits> {
/// Our parent tensor can create us /// Our parent tensor can create us
friend class Tensor<typename TensorType::DataType, friend class Tensor<typename TensorType::DataType,
1, 1,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits>; PtrTraits>;
...@@ -493,7 +497,7 @@ class SubTensor { ...@@ -493,7 +497,7 @@ class SubTensor {
__host__ __device__ inline __host__ __device__ inline
SubTensor<TensorType, SubDim - 1, PtrTraits> SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) { operator[](typename TensorType::IndexType index) {
if (TensorType::IsContig && SubDim == 1) { if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays // Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>( return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index); tensor_, data_ + index);
...@@ -509,7 +513,7 @@ class SubTensor { ...@@ -509,7 +513,7 @@ class SubTensor {
__host__ __device__ inline __host__ __device__ inline
const SubTensor<TensorType, SubDim - 1, PtrTraits> const SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const { operator[](typename TensorType::IndexType index) const {
if (TensorType::IsContig && SubDim == 1) { if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays // Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>( return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index); tensor_, data_ + index);
...@@ -590,7 +594,7 @@ class SubTensor { ...@@ -590,7 +594,7 @@ class SubTensor {
/// of this tensor, starting where our data begins /// of this tensor, starting where our data begins
Tensor<typename TensorType::DataType, Tensor<typename TensorType::DataType,
SubDim, SubDim,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits> view() { PtrTraits> view() {
return tensor_.template view<SubDim>(data_); return tensor_.template view<SubDim>(data_);
...@@ -604,7 +608,7 @@ class SubTensor { ...@@ -604,7 +608,7 @@ class SubTensor {
friend class friend class
Tensor<typename TensorType::DataType, Tensor<typename TensorType::DataType,
TensorType::NumDim, TensorType::NumDim,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits>; PtrTraits>;
...@@ -624,23 +628,23 @@ class SubTensor { ...@@ -624,23 +628,23 @@ class SubTensor {
} // namespace detail } // namespace detail
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline __host__ __device__ inline
detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>, detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits> Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>( return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>( detail::SubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]); *this, data_)[index]);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline __host__ __device__ inline
const detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>, const detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits> Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>( return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>( detail::SubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]); const_cast<TensorType&>(*this), data_)[index]);
......
...@@ -19,26 +19,26 @@ ...@@ -19,26 +19,26 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T> template <typename T, typename IndexT>
struct TensorInfo { struct TensorInfo {
static constexpr int kMaxDims = 8; static constexpr int kMaxDims = 8;
T* data; T* data;
int sizes[kMaxDims]; IndexT sizes[kMaxDims];
int strides[kMaxDims]; IndexT strides[kMaxDims];
int dims; int dims;
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset { struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
unsigned int offset = 0; IndexT offset = 0;
#pragma unroll #pragma unroll
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i]; IndexT curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i]; IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset; offset += curDimOffset;
...@@ -51,21 +51,21 @@ struct TensorInfoOffset { ...@@ -51,21 +51,21 @@ struct TensorInfoOffset {
} }
}; };
template <typename T> template <typename T, typename IndexT>
struct TensorInfoOffset<T, -1> { struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
return linearId; return linearId;
} }
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info; TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i); info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = t.getStride(i); info.strides[i] = (IndexT) t.getStride(i);
} }
info.data = t.data(); info.data = t.data();
...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { ...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info; return info;
} }
template <typename T, int DimInput, int DimOutput> template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input, __global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T> output, TensorInfo<T, IndexT> output,
unsigned int totalSize) { IndexT totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
if (linearThreadId >= totalSize) { i += gridDim.x * blockDim.x) {
return; auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
} auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]); output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else #else
output.data[outputOffset] = input.data[inputOffset]; output.data[outputOffset] = input.data[inputOffset];
#endif #endif
}
} }
/// Performs an out-of-place transposition between any two dimensions. /// Performs an out-of-place transposition between any two dimensions.
...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2, int dim1, int dim2,
Tensor<T, Dim, true>& out, Tensor<T, Dim, true>& out,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions"); static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2); FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim); FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]); FAISS_ASSERT(out.getSize(i) == outSize[i]);
} }
auto inInfo = getTensorInfo<T, Dim>(in); size_t totalSize = in.numElements();
auto outInfo = getTensorInfo<T, Dim>(out); size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int totalSize = in.numElements(); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
auto grid = dim3(utils::divUp(totalSize, numThreads));
auto block = dim3(numThreads);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>( transposeAny<T, unsigned long, Dim, -1>
inInfo, outInfo, totalSize); <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR(); CUDA_TEST_ERROR();
} }
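A hedged call-site sketch (tensors and stream assumed allocated elsewhere; the output must already carry the transposed shape):

// Swap the two outer dimensions of a contiguous 3-d tensor, out of place:
// if in is {4, 5, 6}, out must be {5, 4, 6}.
runTransposeAny(in, 0, 1, out, stream);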
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
*/ */
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "../BlockSelectKernel.cuh" #include "../BlockSelectKernel.cuh"
#include "../Limits.cuh" #include "../Limits.cuh"
...@@ -17,6 +19,15 @@ ...@@ -17,6 +19,15 @@
Tensor<int, 2, true>& outV, \ Tensor<int, 2, true>& outV, \
bool dir, \ bool dir, \
int k, \ int k, \
cudaStream_t stream); \
\
extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream) cudaStream_t stream)
#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \
...@@ -27,6 +38,11 @@ ...@@ -27,6 +38,11 @@
bool dir, \ bool dir, \
int k, \ int k, \
cudaStream_t stream) { \ cudaStream_t stream) { \
FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \
FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \
FAISS_ASSERT(outK.getSize(1) == k); \
FAISS_ASSERT(outV.getSize(1) == k); \
\
auto grid = dim3(in.getSize(0)); \ auto grid = dim3(in.getSize(0)); \
\ \
constexpr int kBlockSelectNumThreads = 128; \ constexpr int kBlockSelectNumThreads = 128; \
...@@ -41,8 +57,40 @@ ...@@ -41,8 +57,40 @@
blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \ blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \ <<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \ CUDA_TEST_ERROR(); \
} \
\
void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream) { \
FAISS_ASSERT(inK.isSameSize(inV)); \
FAISS_ASSERT(outK.isSameSize(outV)); \
\
auto grid = dim3(inK.getSize(0)); \
\
constexpr int kBlockSelectNumThreads = 128; \
auto block = dim3(kBlockSelectNumThreads); \
\
FAISS_ASSERT(k <= WARP_Q); \
FAISS_ASSERT(dir == DIR); \
\
auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
auto vInit = -1; \
\
blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \
} }
#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \
runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
in, outK, outV, dir, k, stream) in, outK, outV, dir, k, stream)
#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \
runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
inK, inV, outK, outV, dir, k, stream)
...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { ...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
} }
static void write_ivf_header (const IndexIVF * ivf, FILE *f, static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) { bool include_ids = true) {
write_index_header (ivf, f); write_index_header (ivf, f);
...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) { ...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained); READVECTOR (ivsc->trained);
} }
ProductQuantizer * read_ProductQuantizer (const char*fname) { ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r"); FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname); FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) { ...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
} }
idx = idxmap; idx = idxmap;
} else { } else {
fprintf (stderr, "Index type 0x%08x not supported\n", h); FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
abort (); idx = nullptr;
} }
return idx; return idx;
} }
......
...@@ -80,6 +80,7 @@ class FloatVector(_object): ...@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVector_clear(self)
def data(self): return _swigfaiss.FloatVector_data(self) def data(self): return _swigfaiss.FloatVector_data(self)
def size(self): return _swigfaiss.FloatVector_size(self) def size(self): return _swigfaiss.FloatVector_size(self)
def at(self, *args): return _swigfaiss.FloatVector_at(self, *args) def at(self, *args): return _swigfaiss.FloatVector_at(self, *args)
...@@ -89,6 +90,27 @@ class FloatVector(_object): ...@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss.FloatVector_swigregister FloatVector_swigregister = _swigfaiss.FloatVector_swigregister
FloatVector_swigregister(FloatVector) FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss.DoubleVector_clear(self)
def data(self): return _swigfaiss.DoubleVector_data(self)
def size(self): return _swigfaiss.DoubleVector_size(self)
def at(self, *args): return _swigfaiss.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object): class ByteVector(_object):
__swig_setmethods__ = {} __swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value) __setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
...@@ -100,6 +122,7 @@ class ByteVector(_object): ...@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVector_clear(self)
def data(self): return _swigfaiss.ByteVector_data(self) def data(self): return _swigfaiss.ByteVector_data(self)
def size(self): return _swigfaiss.ByteVector_size(self) def size(self): return _swigfaiss.ByteVector_size(self)
def at(self, *args): return _swigfaiss.ByteVector_at(self, *args) def at(self, *args): return _swigfaiss.ByteVector_at(self, *args)
...@@ -120,6 +143,7 @@ class Uint64Vector(_object): ...@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.Uint64Vector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss.Uint64Vector_clear(self)
def data(self): return _swigfaiss.Uint64Vector_data(self) def data(self): return _swigfaiss.Uint64Vector_data(self)
def size(self): return _swigfaiss.Uint64Vector_size(self) def size(self): return _swigfaiss.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss.Uint64Vector_at(self, *args) def at(self, *args): return _swigfaiss.Uint64Vector_at(self, *args)
...@@ -140,6 +164,7 @@ class LongVector(_object): ...@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVector_clear(self)
def data(self): return _swigfaiss.LongVector_data(self) def data(self): return _swigfaiss.LongVector_data(self)
def size(self): return _swigfaiss.LongVector_size(self) def size(self): return _swigfaiss.LongVector_size(self)
def at(self, *args): return _swigfaiss.LongVector_at(self, *args) def at(self, *args): return _swigfaiss.LongVector_at(self, *args)
...@@ -160,6 +185,7 @@ class IntVector(_object): ...@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.IntVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss.IntVector_clear(self)
def data(self): return _swigfaiss.IntVector_data(self) def data(self): return _swigfaiss.IntVector_data(self)
def size(self): return _swigfaiss.IntVector_size(self) def size(self): return _swigfaiss.IntVector_size(self)
def at(self, *args): return _swigfaiss.IntVector_at(self, *args) def at(self, *args): return _swigfaiss.IntVector_at(self, *args)
...@@ -180,6 +206,7 @@ class VectorTransformVector(_object): ...@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.VectorTransformVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss.VectorTransformVector_clear(self)
def data(self): return _swigfaiss.VectorTransformVector_data(self) def data(self): return _swigfaiss.VectorTransformVector_data(self)
def size(self): return _swigfaiss.VectorTransformVector_size(self) def size(self): return _swigfaiss.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss.VectorTransformVector_at(self, *args) def at(self, *args): return _swigfaiss.VectorTransformVector_at(self, *args)
...@@ -200,6 +227,7 @@ class OperatingPointVector(_object): ...@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.OperatingPointVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss.OperatingPointVector_clear(self)
def data(self): return _swigfaiss.OperatingPointVector_data(self) def data(self): return _swigfaiss.OperatingPointVector_data(self)
def size(self): return _swigfaiss.OperatingPointVector_size(self) def size(self): return _swigfaiss.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss.OperatingPointVector_at(self, *args) def at(self, *args): return _swigfaiss.OperatingPointVector_at(self, *args)
...@@ -220,6 +248,7 @@ class FloatVectorVector(_object): ...@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVectorVector_clear(self)
def data(self): return _swigfaiss.FloatVectorVector_data(self) def data(self): return _swigfaiss.FloatVectorVector_data(self)
def size(self): return _swigfaiss.FloatVectorVector_size(self) def size(self): return _swigfaiss.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss.FloatVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.FloatVectorVector_at(self, *args)
...@@ -240,6 +269,7 @@ class ByteVectorVector(_object): ...@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVectorVector_clear(self)
def data(self): return _swigfaiss.ByteVectorVector_data(self) def data(self): return _swigfaiss.ByteVectorVector_data(self)
def size(self): return _swigfaiss.ByteVectorVector_size(self) def size(self): return _swigfaiss.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss.ByteVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.ByteVectorVector_at(self, *args)
...@@ -260,6 +290,7 @@ class LongVectorVector(_object): ...@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVectorVector_clear(self)
def data(self): return _swigfaiss.LongVectorVector_data(self) def data(self): return _swigfaiss.LongVectorVector_data(self)
def size(self): return _swigfaiss.LongVectorVector_size(self) def size(self): return _swigfaiss.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss.LongVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.LongVectorVector_at(self, *args)
...@@ -876,6 +907,9 @@ class ClusteringParameters(_object): ...@@ -876,6 +907,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_set __swig_setmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_get __swig_getmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss.ClusteringParameters_update_index_get, _swigfaiss.ClusteringParameters_update_index_set) if _newclass:update_index = _swig_property(_swigfaiss.ClusteringParameters_update_index_get, _swigfaiss.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss.ClusteringParameters_frozen_centroids_get, _swigfaiss.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_set __swig_setmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_get __swig_getmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss.ClusteringParameters_min_points_per_centroid_get, _swigfaiss.ClusteringParameters_min_points_per_centroid_set) if _newclass:min_points_per_centroid = _swig_property(_swigfaiss.ClusteringParameters_min_points_per_centroid_get, _swigfaiss.ClusteringParameters_min_points_per_centroid_set)
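The new frozen_centroids field exposes the "kmean with some frozen centroids" feature from the commit message: centroids pre-loaded into a Clustering object are kept fixed while the remaining ones are trained. A minimal Python sketch of the intended usage, assuming the copy_array_to_vector / vector_to_array helpers of the faiss module (the helpers are not part of this diff):

import numpy as np
import faiss

d, k, k_frozen = 32, 10, 4
rs = np.random.RandomState(0)
x = rs.rand(1000, d).astype('float32')

clus = faiss.Clustering(d, k)
clus.niter = 10
clus.frozen_centroids = True   # keep the pre-seeded centroids fixed

# pre-seed k_frozen centroids; only the remaining k - k_frozen get updated
init = x[:k_frozen].copy()
faiss.copy_array_to_vector(init.ravel(), clus.centroids)

clus.train(x, faiss.IndexFlatL2(d))
centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
# the frozen rows should come back unchanged
assert np.allclose(centroids[:k_frozen], init)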
...@@ -1720,6 +1754,9 @@ class IndexIVF(Index): ...@@ -1720,6 +1754,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss.IndexIVF_cp_set __swig_setmethods__["cp"] = _swigfaiss.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss.IndexIVF_cp_get __swig_getmethods__["cp"] = _swigfaiss.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss.IndexIVF_cp_get, _swigfaiss.IndexIVF_cp_set) if _newclass:cp = _swig_property(_swigfaiss.IndexIVF_cp_get, _swigfaiss.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss.IndexIVF_clustering_index_get, _swigfaiss.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss.IndexIVF_ids_set __swig_setmethods__["ids"] = _swigfaiss.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss.IndexIVF_ids_get __swig_getmethods__["ids"] = _swigfaiss.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss.IndexIVF_ids_get, _swigfaiss.IndexIVF_ids_set) if _newclass:ids = _swig_property(_swigfaiss.IndexIVF_ids_get, _swigfaiss.IndexIVF_ids_set)
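The clustering_index field lets the caller substitute the index used for the k-means assignment step when an IndexIVF is trained, for instance to run that step on a GPU while the final index stays on CPU. A hedged sketch (StandardGpuResources and index_cpu_to_gpu as exposed by the faiss GPU module; xt stands for an (n, d) float32 training matrix with enough points for 1024 lists):

import faiss

d = 64
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 1024, faiss.METRIC_L2)

# do the clustering assignments on GPU 0; the trained centroids end up
# in the CPU coarse quantizer as usual
res = faiss.StandardGpuResources()
gpu_flat = faiss.index_cpu_to_gpu(res, 0, faiss.IndexFlatL2(d))
index.clustering_index = gpu_flat  # keep a Python reference alive

index.train(xt)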
...@@ -1949,6 +1986,7 @@ class IndexIVFPQ(IndexIVF): ...@@ -1949,6 +1986,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_encode_multiple(self, *args) def encode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_decode_multiple(self, *args) def decode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss.IndexIVFPQ_search_preassigned(self, *args) def search_preassigned(self, *args): return _swigfaiss.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss.IndexIVFPQ_precompute_table(self) def precompute_table(self): return _swigfaiss.IndexIVFPQ_precompute_table(self)
def __init__(self, *args): def __init__(self, *args):
this = _swigfaiss.new_IndexIVFPQ(*args) this = _swigfaiss.new_IndexIVFPQ(*args)
...@@ -2107,6 +2145,7 @@ class IndexIDMap(Index): ...@@ -2107,6 +2145,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss.IndexIDMap_train(self, *args) def train(self, *args): return _swigfaiss.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss.IndexIDMap_reset(self) def reset(self): return _swigfaiss.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss.IndexIDMap_remove_ids(self, *args) def remove_ids(self, *args): return _swigfaiss.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss.delete_IndexIDMap __swig_destroy__ = _swigfaiss.delete_IndexIDMap
__del__ = lambda self : None; __del__ = lambda self : None;
def __init__(self, *args): def __init__(self, *args):
...@@ -2775,6 +2814,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult) ...@@ -2775,6 +2814,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN(): def ignore_SIGTTIN():
return _swigfaiss.ignore_SIGTTIN() return _swigfaiss.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss.ignore_SIGTTIN ignore_SIGTTIN = _swigfaiss.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss.MapLong2Long_map_get, _swigfaiss.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes. # This file is compatible with both classic and new-style classes.
...@@ -80,6 +80,7 @@ class FloatVector(_object): ...@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVector_data(self) def data(self): return _swigfaiss_gpu.FloatVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVector_size(self) def size(self): return _swigfaiss_gpu.FloatVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.FloatVector_at(self, *args)
...@@ -89,6 +90,27 @@ class FloatVector(_object): ...@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss_gpu.FloatVector_swigregister FloatVector_swigregister = _swigfaiss_gpu.FloatVector_swigregister
FloatVector_swigregister(FloatVector) FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss_gpu.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.DoubleVector_clear(self)
def data(self): return _swigfaiss_gpu.DoubleVector_data(self)
def size(self): return _swigfaiss_gpu.DoubleVector_size(self)
def at(self, *args): return _swigfaiss_gpu.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss_gpu.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss_gpu.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object): class ByteVector(_object):
__swig_setmethods__ = {} __swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value) __setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
...@@ -100,6 +122,7 @@ class ByteVector(_object): ...@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVector_data(self) def data(self): return _swigfaiss_gpu.ByteVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVector_size(self) def size(self): return _swigfaiss_gpu.ByteVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.ByteVector_at(self, *args)
...@@ -120,6 +143,7 @@ class Uint64Vector(_object): ...@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.Uint64Vector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.Uint64Vector_clear(self)
def data(self): return _swigfaiss_gpu.Uint64Vector_data(self) def data(self): return _swigfaiss_gpu.Uint64Vector_data(self)
def size(self): return _swigfaiss_gpu.Uint64Vector_size(self) def size(self): return _swigfaiss_gpu.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss_gpu.Uint64Vector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.Uint64Vector_at(self, *args)
...@@ -140,6 +164,7 @@ class LongVector(_object): ...@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVector_data(self) def data(self): return _swigfaiss_gpu.LongVector_data(self)
def size(self): return _swigfaiss_gpu.LongVector_size(self) def size(self): return _swigfaiss_gpu.LongVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.LongVector_at(self, *args)
...@@ -160,6 +185,7 @@ class IntVector(_object): ...@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.IntVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.IntVector_clear(self)
def data(self): return _swigfaiss_gpu.IntVector_data(self) def data(self): return _swigfaiss_gpu.IntVector_data(self)
def size(self): return _swigfaiss_gpu.IntVector_size(self) def size(self): return _swigfaiss_gpu.IntVector_size(self)
def at(self, *args): return _swigfaiss_gpu.IntVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.IntVector_at(self, *args)
...@@ -180,6 +206,7 @@ class VectorTransformVector(_object): ...@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.VectorTransformVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.VectorTransformVector_clear(self)
def data(self): return _swigfaiss_gpu.VectorTransformVector_data(self) def data(self): return _swigfaiss_gpu.VectorTransformVector_data(self)
def size(self): return _swigfaiss_gpu.VectorTransformVector_size(self) def size(self): return _swigfaiss_gpu.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss_gpu.VectorTransformVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.VectorTransformVector_at(self, *args)
...@@ -200,6 +227,7 @@ class OperatingPointVector(_object): ...@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.OperatingPointVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.OperatingPointVector_clear(self)
def data(self): return _swigfaiss_gpu.OperatingPointVector_data(self) def data(self): return _swigfaiss_gpu.OperatingPointVector_data(self)
def size(self): return _swigfaiss_gpu.OperatingPointVector_size(self) def size(self): return _swigfaiss_gpu.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss_gpu.OperatingPointVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.OperatingPointVector_at(self, *args)
...@@ -220,6 +248,7 @@ class FloatVectorVector(_object): ...@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVectorVector_data(self) def data(self): return _swigfaiss_gpu.FloatVectorVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVectorVector_size(self) def size(self): return _swigfaiss_gpu.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.FloatVectorVector_at(self, *args)
...@@ -240,6 +269,7 @@ class ByteVectorVector(_object): ...@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVectorVector_data(self) def data(self): return _swigfaiss_gpu.ByteVectorVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVectorVector_size(self) def size(self): return _swigfaiss_gpu.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.ByteVectorVector_at(self, *args)
...@@ -260,6 +290,7 @@ class LongVectorVector(_object): ...@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVectorVector_data(self) def data(self): return _swigfaiss_gpu.LongVectorVector_data(self)
def size(self): return _swigfaiss_gpu.LongVectorVector_size(self) def size(self): return _swigfaiss_gpu.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.LongVectorVector_at(self, *args)
...@@ -280,6 +311,7 @@ class GpuResourcesVector(_object): ...@@ -280,6 +311,7 @@ class GpuResourcesVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.GpuResourcesVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.GpuResourcesVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.GpuResourcesVector_clear(self)
def data(self): return _swigfaiss_gpu.GpuResourcesVector_data(self) def data(self): return _swigfaiss_gpu.GpuResourcesVector_data(self)
def size(self): return _swigfaiss_gpu.GpuResourcesVector_size(self) def size(self): return _swigfaiss_gpu.GpuResourcesVector_size(self)
def at(self, *args): return _swigfaiss_gpu.GpuResourcesVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.GpuResourcesVector_at(self, *args)
...@@ -949,6 +981,9 @@ class ClusteringParameters(_object): ...@@ -949,6 +981,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_set __swig_setmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_get __swig_getmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss_gpu.ClusteringParameters_update_index_get, _swigfaiss_gpu.ClusteringParameters_update_index_set) if _newclass:update_index = _swig_property(_swigfaiss_gpu.ClusteringParameters_update_index_get, _swigfaiss_gpu.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss_gpu.ClusteringParameters_frozen_centroids_get, _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set __swig_setmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get __swig_getmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get, _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set) if _newclass:min_points_per_centroid = _swig_property(_swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get, _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set)
...@@ -1793,6 +1828,9 @@ class IndexIVF(Index): ...@@ -1793,6 +1828,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_set __swig_setmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_get __swig_getmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss_gpu.IndexIVF_cp_get, _swigfaiss_gpu.IndexIVF_cp_set) if _newclass:cp = _swig_property(_swigfaiss_gpu.IndexIVF_cp_get, _swigfaiss_gpu.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss_gpu.IndexIVF_clustering_index_get, _swigfaiss_gpu.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_set __swig_setmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_get __swig_getmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss_gpu.IndexIVF_ids_get, _swigfaiss_gpu.IndexIVF_ids_set) if _newclass:ids = _swig_property(_swigfaiss_gpu.IndexIVF_ids_get, _swigfaiss_gpu.IndexIVF_ids_set)
...@@ -2022,6 +2060,7 @@ class IndexIVFPQ(IndexIVF): ...@@ -2022,6 +2060,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_encode_multiple(self, *args) def encode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_decode_multiple(self, *args) def decode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_preassigned(self, *args) def search_preassigned(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss_gpu.IndexIVFPQ_precompute_table(self) def precompute_table(self): return _swigfaiss_gpu.IndexIVFPQ_precompute_table(self)
def __init__(self, *args): def __init__(self, *args):
this = _swigfaiss_gpu.new_IndexIVFPQ(*args) this = _swigfaiss_gpu.new_IndexIVFPQ(*args)
...@@ -2180,6 +2219,7 @@ class IndexIDMap(Index): ...@@ -2180,6 +2219,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss_gpu.IndexIDMap_train(self, *args) def train(self, *args): return _swigfaiss_gpu.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss_gpu.IndexIDMap_reset(self) def reset(self): return _swigfaiss_gpu.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_remove_ids(self, *args) def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss_gpu.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap __swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap
__del__ = lambda self : None; __del__ = lambda self : None;
def __init__(self, *args): def __init__(self, *args):
...@@ -2340,6 +2380,9 @@ class GpuMultipleClonerOptions(GpuClonerOptions): ...@@ -2340,6 +2380,9 @@ class GpuMultipleClonerOptions(GpuClonerOptions):
__swig_setmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set __swig_setmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set
__swig_getmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_get __swig_getmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_get
if _newclass:shard = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set) if _newclass:shard = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set)
__swig_setmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set
__swig_getmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get
if _newclass:shard_type = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set)
__swig_destroy__ = _swigfaiss_gpu.delete_GpuMultipleClonerOptions __swig_destroy__ = _swigfaiss_gpu.delete_GpuMultipleClonerOptions
__del__ = lambda self : None; __del__ = lambda self : None;
GpuMultipleClonerOptions_swigregister = _swigfaiss_gpu.GpuMultipleClonerOptions_swigregister GpuMultipleClonerOptions_swigregister = _swigfaiss_gpu.GpuMultipleClonerOptions_swigregister
...@@ -3256,6 +3299,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult) ...@@ -3256,6 +3299,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN(): def ignore_SIGTTIN():
return _swigfaiss_gpu.ignore_SIGTTIN() return _swigfaiss_gpu.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss_gpu.ignore_SIGTTIN ignore_SIGTTIN = _swigfaiss_gpu.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss_gpu.MapLong2Long_map_get, _swigfaiss_gpu.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss_gpu.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss_gpu.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss_gpu.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss_gpu.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss_gpu.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss_gpu.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes. # This file is compatible with both classic and new-style classes.
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#pragma SWIG nowarn=321 #pragma SWIG nowarn=321
#pragma SWIG nowarn=403 #pragma SWIG nowarn=403
#pragma SWIG nowarn=325
typedef unsigned long uint64_t; typedef unsigned long uint64_t;
typedef uint64_t size_t; typedef uint64_t size_t;
...@@ -108,6 +109,7 @@ namespace std { ...@@ -108,6 +109,7 @@ namespace std {
public: public:
vector(); vector();
void push_back(T); void push_back(T);
void clear();
T * data(); T * data();
size_t size(); size_t size();
T at (size_t n) const; T at (size_t n) const;
...@@ -117,6 +119,7 @@ namespace std { ...@@ -117,6 +119,7 @@ namespace std {
%template(FloatVector) std::vector<float>; %template(FloatVector) std::vector<float>;
%template(DoubleVector) std::vector<double>;
%template(ByteVector) std::vector<uint8_t>; %template(ByteVector) std::vector<uint8_t>;
%template(Uint64Vector) std::vector<uint64_t>; %template(Uint64Vector) std::vector<uint64_t>;
%template(LongVector) std::vector<long>; %template(LongVector) std::vector<long>;
...@@ -709,4 +712,36 @@ void ignore_SIGTTIN() { ...@@ -709,4 +712,36 @@ void ignore_SIGTTIN() {
void ignore_SIGTTIN(); void ignore_SIGTTIN();
%inline %{
// numpy lacks a hash table implementation, hence this class. It
// represents not-found values as -1, as in the Index implementation
struct MapLong2Long {
std::unordered_map<long, long> map;
void add(size_t n, const long *keys, const long *vals) {
map.reserve(map.size() + n);
for (size_t i = 0; i < n; i++) {
map[keys[i]] = vals[i];
}
}
long search(long key) {
if (map.count(key) == 0) {
return -1;
} else {
return map[key];
}
}
void search_multiple(size_t n, const long *keys, long * vals) {
for (size_t i = 0; i < n; i++) {
vals[i] = search(keys[i]);
}
}
};
%}
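Intended numpy-side usage (the SWIG typemaps supply the size/pointer arguments; the new TestMapLong2Long unit test below exercises the same pattern):

import numpy as np
import faiss

m = faiss.MapLong2Long()
keys = np.array([13, 45, 67], dtype=np.int64)
vals = np.array([3, 8, 2], dtype=np.int64)
m.add(keys, vals)              # bulk insert
assert m.search(45) == 8
assert m.search(12343) == -1   # not-found marker
assert np.all(m.search_multiple(keys) == vals)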
// End of file... // End of file...
...@@ -17,8 +17,8 @@ class TestClustering(unittest.TestCase): ...@@ -17,8 +17,8 @@ class TestClustering(unittest.TestCase):
def test_clustering(self): def test_clustering(self):
d = 64 d = 64
n = 1000 n = 1000
np.random.seed(123) rs = np.random.RandomState(123)
x = np.random.random(size=(n, d)).astype('float32') x = rs.uniform(size=(n, d)).astype('float32')
km = faiss.Kmeans(d, 32, niter=10) km = faiss.Kmeans(d, 32, niter=10)
err32 = km.train(x) err32 = km.train(x)
...@@ -37,15 +37,35 @@ class TestClustering(unittest.TestCase): ...@@ -37,15 +37,35 @@ class TestClustering(unittest.TestCase):
def test_nasty_clustering(self): def test_nasty_clustering(self):
d = 2 d = 2
np.random.seed(123) rs = np.random.RandomState(123)
x = np.zeros((100, d), dtype='float32') x = np.zeros((100, d), dtype='float32')
for i in range(5): for i in range(5):
x[i * 20:i * 20 + 20] = np.random.random(size=d) x[i * 20:i * 20 + 20] = rs.uniform(size=d)
# we have 5 distinct points but ask for 10 centroids... # we have 5 distinct points but ask for 10 centroids...
km = faiss.Kmeans(d, 10, niter=10, verbose=True) km = faiss.Kmeans(d, 10, niter=10, verbose=True)
km.train(x) km.train(x)
def test_redo(self):
d = 64
n = 1000
rs = np.random.RandomState(123)
x = rs.uniform(size=(n, d)).astype('float32')
clus = faiss.Clustering(d, 20)
clus.nredo = 1
clus.train(x, faiss.IndexFlatL2(d))
obj1 = faiss.vector_to_array(clus.obj)
clus = faiss.Clustering(d, 20)
clus.nredo = 10
clus.train(x, faiss.IndexFlatL2(d))
obj10 = faiss.vector_to_array(clus.obj)
self.assertGreater(obj1[-1], obj10[-1])
class TestPCA(unittest.TestCase): class TestPCA(unittest.TestCase):
...@@ -87,7 +107,6 @@ class TestProductQuantizer(unittest.TestCase): ...@@ -87,7 +107,6 @@ class TestProductQuantizer(unittest.TestCase):
self.assertGreater(2500, diff) self.assertGreater(2500, diff)
class TestRevSwigPtr(unittest.TestCase): class TestRevSwigPtr(unittest.TestCase):
def test_rev_swig_ptr(self): def test_rev_swig_ptr(self):
...@@ -127,6 +146,19 @@ class TestException(unittest.TestCase): ...@@ -127,6 +146,19 @@ class TestException(unittest.TestCase):
else: else:
assert False, 'exception did not fire???' assert False, 'exception did not fire???'
class TestMapLong2Long(unittest.TestCase):
def test_do_it(self):
keys = np.array([13, 45, 67])
vals = np.array([3, 8, 2])
m = faiss.MapLong2Long()
m.add(keys, vals)
assert np.all(m.search_multiple(keys) == vals)
assert m.search(12343) == -1
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,15 +22,32 @@ def get_dataset(d, nb, nt, nq): ...@@ -22,15 +22,32 @@ def get_dataset(d, nb, nt, nq):
return (xt, xb, xq) return (xt, xb, xq)
def get_dataset_2(d, nb, nt, nq):
"""A dataset that is not completely random but still challenging to
index
"""
d1 = 10 # intrinsic dimension (more or less)
n = nb + nt + nq
rs = np.random.RandomState(1234)
x = rs.normal(size=(n, d1))
x = np.dot(x, rs.rand(d1, d))
# now we have a d1-dim ellipsoid in d-dimensional space
# higher factor (>4) -> higher frequency -> less linear
x = x * (rs.rand(d) * 4 + 0.1)
x = np.sin(x)
x = x.astype('float32')
return x[:nt], x[nt:-nq], x[-nq:]
class EvalIVFPQAccuracy(unittest.TestCase): class EvalIVFPQAccuracy(unittest.TestCase):
def test_IndexIVFPQ(self): def test_IndexIVFPQ(self):
d = 64 d = 32
nb = 1000 nb = 1000
nt = 1500 nt = 1500
nq = 200 nq = 200
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
d = xt.shape[1] d = xt.shape[1]
gt_index = faiss.IndexFlatL2(d) gt_index = faiss.IndexFlatL2(d)
...@@ -38,15 +55,15 @@ class EvalIVFPQAccuracy(unittest.TestCase): ...@@ -38,15 +55,15 @@ class EvalIVFPQAccuracy(unittest.TestCase):
D, gt_nns = gt_index.search(xq, 1) D, gt_nns = gt_index.search(xq, 1)
coarse_quantizer = faiss.IndexFlatL2(d) coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8) index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
index.train(xt) index.train(xt)
index.add(xb) index.add(xb)
index.nprobe = 5 index.nprobe = 4
D, nns = index.search(xq, 10) D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum() n_ok = (nns == gt_nns).sum()
nq = xq.shape[0] nq = xq.shape[0]
self.assertGreater(n_ok, nq * 0.4) self.assertGreater(n_ok, nq * 0.66)
class TestMultiIndexQuantizer(unittest.TestCase): class TestMultiIndexQuantizer(unittest.TestCase):
...@@ -78,16 +95,16 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -78,16 +95,16 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants_ivf(self): def test_4variants_ivf(self):
d = 32 d = 32
nt = 1500 nt = 2500
nq = 200 nq = 400
nb = 10000 nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
# common quantizer # common quantizer
quantizer = faiss.IndexFlatL2(d) quantizer = faiss.IndexFlatL2(d)
ncent = 128 ncent = 64
index_gt = faiss.IndexFlatL2(d) index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb) index_gt.add(xb)
...@@ -114,9 +131,12 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -114,9 +131,12 @@ class TestScalarQuantizer(unittest.TestCase):
D, I = index.search(xq, 10) D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok, nq)
print(nok) self.assertGreaterEqual(nok['flat'], nq * 0.6)
# The tests below are a bit fragile: the ordering between the
# uniform and non-uniform variants is occasionally reversed,
# probably because the small dataset introduces jitter
self.assertGreaterEqual(nok['flat'], nok['QT_8bit']) self.assertGreaterEqual(nok['flat'], nok['QT_8bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
...@@ -124,15 +144,15 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -124,15 +144,15 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants(self): def test_4variants(self):
d = 32 d = 32
nt = 1500 nt = 2500
nq = 200 nq = 400
nb = 10000 nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset(d, nb, nt, nq)
index_gt = faiss.IndexFlatL2(d) index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb) index_gt.add(xb)
D, I_ref = index_gt.search(xq, 10) D_ref, I_ref = index_gt.search(xq, 10)
nok = {} nok = {}
...@@ -141,12 +161,12 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -141,12 +161,12 @@ class TestScalarQuantizer(unittest.TestCase):
index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)
index.train(xt) index.train(xt)
index.add(xb) index.add(xb)
D, I = index.search(xq, 10) D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok) self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9)
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
......
...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase): ...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else: else:
assert False, 'should have raised an exception' assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
class TestUpdate(unittest.TestCase): class TestUpdate(unittest.TestCase):
......
...@@ -67,6 +67,10 @@ int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a, ...@@ -67,6 +67,10 @@ int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
namespace faiss { namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
double getmillisecs () { double getmillisecs () {
struct timeval tv; struct timeval tv;
gettimeofday (&tv, nullptr); gettimeofday (&tv, nullptr);
...@@ -455,7 +459,7 @@ float fvec_norm_L2sqr_ref (const float * __restrict x, ...@@ -455,7 +459,7 @@ float fvec_norm_L2sqr_ref (const float * __restrict x,
/********************************************************* /*********************************************************
* SSE implementations * SSE and AVX implementations
*/ */
// reads 0 <= d < 4 floats as __m128 // reads 0 <= d < 4 floats as __m128
...@@ -475,7 +479,96 @@ static inline __m128 masked_read (int d, const float *x) ...@@ -475,7 +479,96 @@ static inline __m128 masked_read (int d, const float *x)
// cannot use AVX2 _mm_mask_set1_epi32 // cannot use AVX2 _mm_mask_set1_epi32
} }
#ifdef USE_AVX
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_inner_product (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
d -= 8;
}
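    // reduce the 256-bit accumulator msum1 to 128 bits by summing its two lanes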
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
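    // reduce the 256-bit accumulator msum1 to 128 bits by summing its two lanes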
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
#else
/* SSE-implementation of L2 distance */ /* SSE-implementation of L2 distance */
float fvec_L2sqr (const float * x, float fvec_L2sqr (const float * x,
...@@ -534,6 +627,7 @@ float fvec_inner_product (const float * x, ...@@ -534,6 +627,7 @@ float fvec_inner_product (const float * x,
#endif
float fvec_norm_L2sqr (const float * x, float fvec_norm_L2sqr (const float * x,
size_t d) size_t d)
...@@ -557,69 +651,6 @@ float fvec_norm_L2sqr (const float * x, ...@@ -557,69 +651,6 @@ float fvec_norm_L2sqr (const float * x,
/*********************************************************
* AVX implementations
*
* Disabled for now, it is not faster than SSE on current machines
* see P57425519
*/
#if 0
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res;
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
if (d > 0) {
// add the last 1, 2 or 3 values
__m256 mx = masked_read_8 (d, x);
__m256 my = masked_read_8 (d, y);
__m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
#endif
/*************************************************************************** /***************************************************************************
* Matrix/vector ops * Matrix/vector ops
...@@ -1365,16 +1396,18 @@ void pairwise_L2sqr (long d, ...@@ -1365,16 +1396,18 @@ void pairwise_L2sqr (long d,
#define EPS (1 / 1024.) #define EPS (1 / 1024.)
/* For k-means, compute centroids given assignment of vectors to centroids */ /* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
int km_update_centroids (const float * x, int km_update_centroids (const float * x,
float * centroids, float * centroids,
long * assign, long * assign,
size_t d, size_t k, size_t n) size_t d, size_t k, size_t n,
size_t k_frozen)
{ {
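    // the first k_frozen centroids are frozen: skip them during the update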
k -= k_frozen;
centroids += k_frozen * d;
std::vector<size_t> hassign(k); std::vector<size_t> hassign(k);
memset (centroids, 0, sizeof(*centroids) * d * k); memset (centroids, 0, sizeof(*centroids) * d * k);
#pragma omp parallel #pragma omp parallel
{ {
int nt = omp_get_num_threads(); int nt = omp_get_num_threads();
...@@ -1383,12 +1416,12 @@ int km_update_centroids (const float * x, ...@@ -1383,12 +1416,12 @@ int km_update_centroids (const float * x,
size_t c0 = (k * rank) / nt; size_t c0 = (k * rank) / nt;
size_t c1 = (k * (rank + 1)) / nt; size_t c1 = (k * (rank + 1)) / nt;
const float *xi = x; const float *xi = x;
// printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
size_t nacc = 0; size_t nacc = 0;
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
long ci = assign[i]; long ci = assign[i];
assert (ci >= 0 && ci < k); assert (ci >= 0 && ci < k + k_frozen);
ci -= k_frozen;
if (ci >= c0 && ci < c1) { if (ci >= c0 && ci < c1) {
float * c = centroids + ci * d; float * c = centroids + ci * d;
hassign[ci]++; hassign[ci]++;
...@@ -1398,7 +1431,6 @@ int km_update_centroids (const float * x, ...@@ -1398,7 +1431,6 @@ int km_update_centroids (const float * x,
} }
xi += d; xi += d;
} }
// printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
} }
......
...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a, ...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a,
void reflection (const float * u, float * x, size_t n, size_t d, size_t nu); void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
/** For k-means: update stage. Returns nb of split clusters. */ /** For k-means: update stage.
*
* @param x training vectors, size n * d
* @param centroids centroid vectors, size k * d
* @param assign nearest centroid for each training vector, size n
* @param k_frozen do not update the first k_frozen centroids
* @return number of splitting operations performed to fight empty clusters
*/
int km_update_centroids ( int km_update_centroids (
const float * x, const float * x,
float * centroids, float * centroids,
long * assign, long * assign,
size_t d, size_t k, size_t n); size_t d, size_t k, size_t n,
size_t k_frozen);
/** compute the Q of the QR decomposition for m > n /** compute the Q of the QR decomposition for m > n
* @param a size n * m: input matrix and output Q * @param a size n * m: input matrix and output Q
......
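To make the k_frozen semantics concrete, a reference sketch of the update stage in numpy (illustration only: the empty-cluster splitting whose count the C++ function returns is omitted):

import numpy as np

def km_update_centroids_ref(x, centroids, assign, k_frozen=0):
    # centroids[:k_frozen] stay untouched; every other centroid becomes
    # the mean of the training vectors assigned to it
    k, d = centroids.shape
    out = centroids.copy()
    for ci in range(k_frozen, k):
        mask = assign == ci
        if mask.any():
            out[ci] = x[mask].mean(axis=0)
    return out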