Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bug fixes from GitHub issues
k-means with support for frozen (user-provided) centroids
GPU: better tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
...@@ -77,10 +77,10 @@ IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R): ...@@ -77,10 +77,10 @@ IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R):
double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I) double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I)
const { const {
FAISS_THROW_IF_NOT_MSG( FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R), (gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R),
"ground truth not initialized"); "ground truth not initialized");
long n_ok = 0; long n_ok = 0;
#pragma omp parallel for reduction(+: n_ok) #pragma omp parallel for reduction(+: n_ok)
for (idx_t q = 0; q < nq; q++) { for (idx_t q = 0; q < nq; q++) {
n_ok += ranklist_intersection_size ( n_ok += ranklist_intersection_size (
...@@ -345,11 +345,13 @@ void ParameterSpace::initialize (const Index * index) ...@@ -345,11 +345,13 @@ void ParameterSpace::initialize (const Index * index)
} }
if (DC (IndexIVF)) { if (DC (IndexIVF)) {
ParameterRange & pr = add_range("nprobe"); {
for (int i = 0; i < 13; i++) { ParameterRange & pr = add_range("nprobe");
size_t nprobe = 1 << i; for (int i = 0; i < 13; i++) {
if (nprobe >= ix->nlist) break; size_t nprobe = 1 << i;
pr.values.push_back (nprobe); if (nprobe >= ix->nlist) break;
pr.values.push_back (nprobe);
}
} }
} }
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
...@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index) ...@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
} }
} }
if (DC (IndexIVFPQR)) { if (DC (IndexIVFPQR)) {
assert (ix);
ParameterRange & pr = add_range("k_factor"); ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) { for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i); pr.values.push_back (1 << i);
...@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter ( ...@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
// and fall through to also enable it on sub-indexes
} }
if (DC (IndexPreTransform)) { if (DC (IndexPreTransform)) {
index = ix->index; index = ix->index;
} }
if (DC (IndexShards)) {
// call on all sub-indexes
for (auto & shard_index : ix->shard_indexes) {
set_index_parameter (shard_index, name, val);
}
return;
}
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
// in case it was an IndexPreTransform
} }
if (DC (IndexRefineFlat)) { if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") { if (name == "k_factor_rf") {
...@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter ( ...@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
return; // last verbose that we could find return; // last verbose that we could find
} }
if (name == "nprobe") { if (name == "nprobe") {
DC(IndexIVF); if ( DC(IndexIVF)) {
ix->nprobe = int(val); ix->nprobe = int(val);
} else if (name == "ht") { return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ; ix->search_type = IndexPQ::ST_PQ;
...@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter ( ...@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
ix->search_type = IndexPQ::ST_polysemous; ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val); ix->polysemous_ht = int(val);
} }
return;
} else if (DC (IndexIVFPQ)) { } else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0; ix->polysemous_ht = 0;
} else { } else {
ix->polysemous_ht = int(val); ix->polysemous_ht = int(val);
} }
return;
} }
} else if (name == "k_factor") {
DC (IndexIVFPQR);
ix->k_factor = val;
} else if (name == "max_codes") {
DC (IndexIVFPQ);
ix->max_codes = finite(val) ? size_t(val) : 0;
} else {
FAISS_THROW_FMT (
"ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
} }
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVFPQ)) {
ix->max_codes = finite(val) ? size_t(val) : 0;
return;
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
} }
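A minimal Python sketch of what the new IndexShards branch in set_index_parameter enables: a parameter such as nprobe can now be pushed down into every shard in one call. The shard construction, sizes and data below are illustrative, and the standard SWIG bindings built from this revision are assumed.

    import numpy as np
    import faiss

    d, nlist = 32, 64
    xt = np.random.rand(5000, d).astype('float32')

    shards = faiss.IndexShards(d)
    keep_alive = []                       # keep Python references to the sub-objects
    for _ in range(2):
        quantizer = faiss.IndexFlatL2(d)
        sub = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
        sub.train(xt)
        keep_alive.append((quantizer, sub))
        shards.add_shard(sub)

    ps = faiss.ParameterSpace()
    ps.set_index_parameter(shards, 'nprobe', 8)   # now recurses into every shard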
void ParameterSpace::display () const void ParameterSpace::display () const
...@@ -634,6 +654,15 @@ struct VTChain { ...@@ -634,6 +654,15 @@ struct VTChain {
} }
}; };
/// what kind of training does this coarse quantizer require?
char get_trains_alone(const Index *coarse_quantizer) {
return
dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
0;
}
} }
Index *index_factory (int d, const char *description_in, MetricType metric) Index *index_factory (int d, const char *description_in, MetricType metric)
...@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok; tok;
tok = strtok_r (nullptr, " ,", &ptr)) { tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2; int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok); std::string stok(tok);
// to avoid mem leaks with exceptions: // to avoid mem leaks with exceptions:
...@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
} else if (stok == "L2norm") { } else if (stok == "L2norm") {
vt_1 = new NormalizationTransform (d, 2.0); vt_1 = new NormalizationTransform (d, 2.0);
// coarse quantizers
} else if (!coarse_quantizer && } else if (!coarse_quantizer &&
sscanf (tok, "IVF%d", &ncentroids) == 1) { sscanf (tok, "IVF%d", &ncentroids) == 1) {
if (metric == METRIC_L2) { if (metric == METRIC_L2) {
...@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVF *index_ivf = new IndexIVFFlat ( IndexIVF *index_ivf = new IndexIVFFlat (
coarse_quantizer, d, ncentroids, metric); coarse_quantizer, d, ncentroids, metric);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
...@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
new IndexIVFScalarQuantizer ( new IndexIVFScalarQuantizer (
coarse_quantizer, d, ncentroids, qt, metric); coarse_quantizer, d, ncentroids, qt, metric);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
...@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVFPQR *index_ivf = new IndexIVFPQR ( IndexIVFPQR *index_ivf = new IndexIVFPQR (
coarse_quantizer, d, ncentroids, M, 8, M2, 8); coarse_quantizer, d, ncentroids, M, 8, M2, 8);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d", &M) == 1) { } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
std::string soption = option;
// np to disable polysemous training
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) { if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ ( IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8); coarse_quantizer, d, ncentroids, M, 8);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
index_ivf->metric_type = metric; index_ivf->metric_type = metric;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_ivf->do_polysemous_training = true; index_ivf->do_polysemous_training = soption != "np";
index_1 = index_ivf; index_1 = index_ivf;
} else { } else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric); IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = true; index_pq->do_polysemous_training = soption != "np";
index_1 = index_pq; index_1 = index_pq;
} }
} else if (stok == "RFlat") { } else if (stok == "RFlat") {
make_IndexRefineFlat = true; make_IndexRefineFlat = true;
} else { } else {
......
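Sketch of the new "np" suffix on the PQ token in index_factory, which disables polysemous training for the product quantizer (per the do_polysemous_training assignments above). The dimensions and training data are illustrative; the generic train wrapper on the returned Index is assumed to accept numpy arrays as in faiss.py.

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(20000, d).astype('float32')

    # "np" after the number of sub-quantizers disables polysemous training
    index = faiss.index_factory(d, "IVF256,PQ8np")
    index.train(xt)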
...@@ -25,7 +25,7 @@ namespace faiss { ...@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while /** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method * minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in * do_allocation can be overloaded to allocate the result tables in
* the matrix type of a srcipting language like Lua or Python. */ * the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult { struct RangeSearchResult {
size_t nq; ///< nb of queries size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1) size_t *lims; ///< size (nq + 1)
......
...@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters (): ...@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
nredo(1), nredo(1),
verbose(false), spherical(false), verbose(false), spherical(false),
update_index(false), update_index(false),
frozen_centroids(false),
min_points_per_centroid(39), min_points_per_centroid(39),
max_points_per_centroid(256), max_points_per_centroid(256),
seed(1234) seed(1234)
...@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
float * dis = new float[nx]; float * dis = new float[nx];
ScopeDeleter<float> del2(dis); ScopeDeleter<float> del2(dis);
// for redo
float best_err = 1e50; float best_err = 1e50;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0; double t_search_tot = 0;
if (verbose) { if (verbose) {
printf(" Preprocessing in %.2f s\n", printf(" Preprocessing in %.2f s\n",
...@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int redo = 0; redo < nredo; redo++) { for (int redo = 0; redo < nredo; redo++) {
std::vector<float> buf_centroids;
std::vector<float> &cur_centroids =
nredo == 1 ? centroids : buf_centroids;
if (verbose && nredo > 1) { if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo); printf("Outer iteration %d / %d\n", redo, nredo);
} }
if (cur_centroids.size() == 0) {
// initialize centroids with random points from the dataset // initialize remaining centroids with random points from the dataset
cur_centroids.resize (d * k); centroids.resize (d * k);
std::vector<int> perm (nx); std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L); rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
#pragma omp parallel for for (int i = n_input_centroids; i < k ; i++)
for (int i = 0; i < k ; i++) memcpy (&centroids[i * d], x + perm[i] * d,
memcpy (&cur_centroids[i * d], x + perm[i] * d, d * sizeof (float));
d * sizeof (float));
} else { // assume user provides some meaningful initialization
FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
FAISS_THROW_IF_NOT_MSG (nredo == 1,
"will redo with same initialization");
}
if (spherical) if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data()); fvec_renorm_L2 (d, k, centroids.data());
if (!index.is_trained) if (!index.is_trained)
index.train (k, cur_centroids.data()); index.train (k, centroids.data());
FAISS_THROW_IF_NOT (index.ntotal == 0); FAISS_THROW_IF_NOT (index.ntotal == 0);
index.add (k, cur_centroids.data()); index.add (k, centroids.data());
float err = 0; float err = 0;
for (int i = 0; i < niter; i++) { for (int i = 0; i < niter; i++) {
double t0s = getmillisecs(); double t0s = getmillisecs();
...@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
err += dis[j]; err += dis[j];
obj.push_back (err); obj.push_back (err);
int nsplit = km_update_centroids (x, cur_centroids.data(), int nsplit = km_update_centroids (
assign, d, k, nx); x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) { if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): " printf (" Iteration %d (%.2f s, search %.2f s): "
...@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
} }
if (spherical) if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data()); fvec_renorm_L2 (d, k, centroids.data());
index.reset (); index.reset ();
if (update_index) if (update_index)
index.train (k, cur_centroids.data()); index.train (k, centroids.data());
assert (index.ntotal == 0); assert (index.ntotal == 0);
index.add (k, cur_centroids.data()); index.add (k, centroids.data());
} }
if (verbose) printf("\n"); if (verbose) printf("\n");
if (nredo > 1) { if (nredo > 1) {
if (err < best_err) { if (err < best_err) {
if (verbose) if (verbose)
printf ("Objective improved: keep new clusters\n"); printf ("Objective improved: keep new clusters\n");
centroids = buf_centroids; best_centroids = centroids;
best_obj = obj;
best_err = err; best_err = err;
} }
index.reset (); index.reset ();
} }
} }
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
}
} }
......
...@@ -28,6 +28,7 @@ struct ClusteringParameters { ...@@ -28,6 +28,7 @@ struct ClusteringParameters {
bool verbose; bool verbose;
bool spherical; ///< do we want normalized centroids? bool spherical; ///< do we want normalized centroids?
bool update_index; ///< update index after each iteration? bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset int max_points_per_centroid; ///< to limit size of dataset
......
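A hedged Python sketch of the new frozen_centroids option: the caller pre-loads some centroids into Clustering::centroids, and with frozen_centroids set they are kept fixed while the remaining ones are initialized randomly and updated. The Clustering train wrapper accepting numpy arrays and the copy_array_to_vector / vector_to_array helpers added elsewhere in this commit are assumed; data and sizes are illustrative.

    import numpy as np
    import faiss

    d, k, n_frozen = 16, 20, 4
    xt = np.random.rand(10000, d).astype('float32')

    clus = faiss.Clustering(d, k)
    clus.frozen_centroids = True          # keep the provided centroids fixed
    clus.verbose = True                   # prints "Using 4 centroids provided as input (frozen)"

    # the first n_frozen centroids are supplied by the caller (illustrative values)
    init = xt[:n_frozen].copy()
    faiss.copy_array_to_vector(init.ravel(), clus.centroids)

    index = faiss.IndexFlatL2(d)
    clus.train(xt, index)
    centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)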
...@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) { ...@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const { void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("Can not compute reconstruct without " FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
"knowing how to do so");
} }
......
...@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, ...@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
nlist (nlist), nlist (nlist),
nprobe (1), nprobe (1),
quantizer (quantizer), quantizer (quantizer),
quantizer_trains_alone (false), quantizer_trains_alone (0),
own_fields (false), own_fields (false),
clustering_index (nullptr),
ids (nlist), ids (nlist),
maintain_direct_map (false) maintain_direct_map (false)
{ {
...@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, ...@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF (): IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr), nlist (0), nprobe (1), quantizer (nullptr),
quantizer_trains_alone (false), own_fields (false), quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false) maintain_direct_map (false)
{} {}
...@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x) ...@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
if (quantizer->is_trained && (quantizer->ntotal == nlist)) { if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose) if (verbose)
printf ("IVF quantizer does not need training.\n"); printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone) { } else if (quantizer_trains_alone == 1) {
if (verbose) if (verbose)
printf ("IVF quantizer trains alone...\n"); printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x); quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist, FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size"); "nlist not consistent with quantizer size");
} else { } else if (quantizer_trains_alone == 0) {
if (verbose) if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n", printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d); n, d);
Clustering clus (d, nlist, cp); Clustering clus (d, nlist, cp);
quantizer->reset(); quantizer->reset();
clus.train (n, x, *quantizer); if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true; quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
} }
if (verbose) if (verbose)
printf ("Training IVF residual\n"); printf ("Training IVF residual\n");
...@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{ {
FAISS_THROW_IF_NOT (nlist == other.nlist); FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (!other.maintain_direct_map); FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2, FAISS_THROW_IF_NOT_FMT (
"this subset type is not implemented"); subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0; size_t accu_n = 0;
size_t accu_a1 = 0; size_t accu_a1 = 0;
...@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
other.ntotal++; other.ntotal++;
} }
} }
} else if (subset_type == 1) {
for (long i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
ids_out.push_back (id);
codes_out.insert (codes_out.end(),
codes_in.begin() + i * code_size,
codes_in.begin() + (i + 1) * code_size);
other.ntotal++;
}
}
} else if (subset_type == 2) { } else if (subset_type == 2) {
// see what is allocated to a1 and to a2 // see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n; size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal; size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1; size_t i1 = next_accu_a1 - accu_a1;
accu_a1 = next_accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal; size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2; size_t i2 = next_accu_a2 - accu_a2;
accu_a2 = next_accu_a2;
ids_out.insert(ids_out.end(), ids_out.insert(ids_out.end(),
ids_in.begin() + i1, ids_in.begin() + i1,
ids_in.begin() + i2); ids_in.begin() + i2);
...@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
codes_in.begin() + i1 * code_size, codes_in.begin() + i1 * code_size,
codes_in.begin() + i2 * code_size); codes_in.begin() + i2 * code_size);
other.ntotal += i2 - i1; other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
} }
accu_n += n; accu_n += n;
} }
......
...@@ -47,10 +47,17 @@ struct IndexIVF: Index { ...@@ -47,10 +47,17 @@ struct IndexIVF: Index {
size_t nprobe; ///< number of probes at query time size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists Index * quantizer; ///< quantizer that maps vectors to inverted lists
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
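A short Python sketch of the new clustering_index field: the coarse k-means runs on the supplied index and the resulting centroids are then added to the quantizer. In practice the override would typically be a GPU flat index; a CPU IndexFlatL2 stands in for it here, and the sizes are illustrative.

    import numpy as np
    import faiss

    d, nlist = 32, 128
    xt = np.random.rand(10000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

    # run the coarse k-means on a separate index; the centroids found
    # by the clustering are then added to `quantizer`
    clustering_index = faiss.IndexFlatL2(d)   # keep a reference alive on the Python side
    index.clustering_index = clustering_index
    index.train(xt)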
...@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const ...@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
for (int j = 0; j < d; j++) { for (int j = 0; j < d; j++) {
r[j] += centroid[j]; r[j] += centroid[j];
} }
} } else {
else {
pq.decode (code_line + ofs * pq.code_size, r); pq.decode (code_line + ofs * pq.code_size, r);
} }
} }
...@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const ...@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{ {
FAISS_THROW_IF_NOT (direct_map.size() == ntotal); FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
int list_no = direct_map[key] >> 32; int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff; int ofs = direct_map[key] & 0xffffffff;
...@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k, ...@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
} }
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ () IndexIVFPQ::IndexIVFPQ ()
......
...@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF { ...@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels, float *distances, idx_t *labels,
bool store_pairs) const override; bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table /// build precomputed table
void precompute_table (); void precompute_table ();
......
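Usage sketch for the new search_and_reconstruct, going through the Python wrapper added further down in this commit (replacement_search_and_reconstruct). The reconstructed array has shape (n, k, d) and holds the PQ-decoded approximation of each result vector; dimensions and data here are illustrative.

    import numpy as np
    import faiss

    d, nlist, M, k = 32, 64, 8, 5
    xt = np.random.rand(10000, d).astype('float32')
    xb = np.random.rand(5000, d).astype('float32')
    xq = np.random.rand(10, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, M, 8)
    index.train(xt)
    index.add(xb)
    index.nprobe = 8

    D, I, R = index.search_and_reconstruct(xq, k)
    print(R.shape)   # (10, 5, 32): decoded approximations of the result vectors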
...@@ -124,8 +124,8 @@ struct Codec4bit { ...@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 { struct SimilarityL2 {
const float *y, *yi; const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/ /******* scalar accumulator *******/
...@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x, ...@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
size_t n) const size_t n) const
{ {
Quantizer *squant = select_quantizer (*this); Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
squant->encode_vector (x + i * d, codes + i * code_size); squant->encode_vector (x + i * d, codes + i * code_size);
delete squant;
} }
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{ {
Quantizer *squant = select_quantizer (*this); Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
squant->decode_vector (codes + i * code_size, x + i * d); squant->decode_vector (codes + i * code_size, x + i * d);
delete squant;
} }
/******************************************************************* /*******************************************************************
...@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search( ...@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
} }
ci += code_size; ci += code_size;
} }
minheap_reorder (k, simi, idxi);
} }
} else { } else {
#pragma omp parallel for #pragma omp parallel for
...@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search( ...@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
} }
ci += code_size; ci += code_size;
} }
maxheap_reorder (k, simi, idxi);
} }
} }
...@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids ...@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
int nt = omp_get_num_threads(); int nt = omp_get_num_threads();
int rank = omp_get_thread_num(); int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
long list_no = idx [i]; long list_no = idx [i];
...@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids ...@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
ntotal += nadd; ntotal += nadd;
} }
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index, void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
const float *x, const float *x,
...@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index, ...@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
maxheap_reorder (k, simi, idxi); maxheap_reorder (k, simi, idxi);
} }
} // anonymous namespace
void IndexIVFScalarQuantizer::search_preassigned ( void IndexIVFScalarQuantizer::search_preassigned (
idx_t n, const float *x, idx_t k, idx_t n, const float *x, idx_t k,
const idx_t *idx, const idx_t *idx,
......
...@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so ...@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
cp python/_swigfaiss.so python/swigfaiss.py . cp python/_swigfaiss.so python/swigfaiss.py .
############################# #############################
# Dependencies # Dependencies.
# make dep > x
# then copy/paste from x by hand below
# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done dep:
for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
cpp -MM -std=gnu++0x $$i ; \
done
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
FaissException.o: FaissException.cpp FaissException.h
hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
Heap.o: Heap.cpp Heap.h utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \ IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h AuxIndexStructures.h FaissAssert.h FaissException.h AuxIndexStructures.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \ IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
hamming.h FaissAssert.h FaissException.h IndexFlat.h \ hamming.h FaissAssert.h FaissException.h IndexFlat.h \
AuxIndexStructures.h AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \ IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
Heap.h hamming.h FaissAssert.h FaissException.h Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \ IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \ IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \ Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
FaissException.h IndexFlat.h hamming.h FaissAssert.h FaissException.h \
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \ AuxIndexStructures.h
FaissException.h Heap.h AuxIndexStructures.h Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \ PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \ ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
FaissAssert.h FaissException.h FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \ ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \ Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
IndexFlat.h utils.h IndexFlat.h utils.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \ AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissAssert.h FaissException.h FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \ IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \ IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
Clustering.h PolysemousTraining.h AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
FaissException.o: FaissException.cpp FaissException.h
clean: clean:
......
...@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k, ...@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
} }
} }
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace { namespace {
struct IDTranslatedSelector: IDSelector { struct IDTranslatedSelector: IDSelector {
...@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel) ...@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
} }
FAISS_ASSERT (j == index->ntotal); FAISS_ASSERT (j == index->ntotal);
ntotal = j; ntotal = j;
id_map.resize(ntotal);
return nremove; return nremove;
} }
......
...@@ -51,6 +51,9 @@ struct IndexIDMap : Index { ...@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
/// remove ids adapted to IndexFlat /// remove ids adapted to IndexFlat
long remove_ids(const IDSelector& sel) override; long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
~IndexIDMap() override; ~IndexIDMap() override;
IndexIDMap () {own_fields=false; index=nullptr; } IndexIDMap () {own_fields=false; index=nullptr; }
}; };
......
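Sketch of the new IndexIDMap::range_search, which forwards to the wrapped index and then translates the result labels back to the user-provided ids. The Python range_search wrapper (returning lims, D, I) is assumed; data, ids and the radius are illustrative.

    import numpy as np
    import faiss

    d = 16
    xb = np.random.rand(200, d).astype('float32')
    ids = np.arange(1000, 1200).astype('int64')

    sub_index = faiss.IndexFlatL2(d)      # keep a reference so it is not collected
    index = faiss.IndexIDMap(sub_index)
    index.add_with_ids(xb, ids)

    lims, D, I = index.range_search(xb[:3], 0.5)
    # I contains the user-provided ids (1000..1199), remapped by the new method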
...@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x) ...@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
const float *prev_x = x; const float *prev_x = x;
ScopeDeleter<float> del; ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) { for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) { if (i < chain.size()) {
VectorTransform *ltrans = chain [i]; VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained) if (!ltrans->is_trained) {
ltrans->train(n, prev_x); if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else { } else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x); index->train (n, prev_x);
} }
if (i == last_untrained) break; if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x); float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete prev_x; if (prev_x != x) delete [] prev_x;
prev_x = xt; prev_x = xt;
del.set(xt); del.set(xt);
} }
......
...@@ -521,7 +521,7 @@ def compute_populated_index(preproc): ...@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
co.verbose = True co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0] co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev() vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple( gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co) vres, vdev, indexall, co)
...@@ -630,7 +630,7 @@ def get_populated_index(preproc): ...@@ -630,7 +630,7 @@ def get_populated_index(preproc):
co.usePrecomputed = use_precomputed_tables co.usePrecomputed = use_precomputed_tables
co.indicesOptions = 0 co.indicesOptions = 0
co.verbose = True co.verbose = True
co.shard = True # the replicas will be made "manually" co.shard = True # the replicas will be made "manually"
t0 = time.time() t0 = time.time()
print "CPU index contains %d vectors, move to GPU" % indexall.ntotal print "CPU index contains %d vectors, move to GPU" % indexall.ntotal
if replicas == 1: if replicas == 1:
......
...@@ -121,6 +121,18 @@ def handle_Index(the_class): ...@@ -121,6 +121,18 @@ def handle_Index(the_class):
swig_ptr(labels)) swig_ptr(labels))
return distances, labels return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
def replacement_remove_ids(self, x): def replacement_remove_ids(self, x):
if isinstance(x, IDSelector): if isinstance(x, IDSelector):
sel = x sel = x
...@@ -167,6 +179,8 @@ def handle_Index(the_class): ...@@ -167,6 +179,8 @@ def handle_Index(the_class):
replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'update_vectors', replacement_update_vectors, replace_method(the_class, 'update_vectors', replacement_update_vectors,
ignore_missing=True) ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class): def handle_VectorTransform(the_class):
...@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None): ...@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
return index_cpu_to_gpu_multiple(vres, vdev, index, co) return index_cpu_to_gpu_multiple(vres, vdev, index, co)
def vector_float_to_array(v): def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
a = np.empty(v.size(), dtype='float32') if ngpu == -1:
memcpy(swig_ptr(a), v.data(), 4 * v.size()) ngpu = get_num_gpus()
res = [StandardGpuResources() for i in range(ngpu)]
index2 = index_cpu_to_gpu_multiple_py(res, index, co)
index2.dont_dealloc = res
return index2
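Usage sketch for the new index_cpu_to_all_gpus helper (GPU build assumed). By default it replicates the index over all visible GPUs; with GpuMultipleClonerOptions it can shard instead, using the shard_type field added further down in this commit. The cpu_ivf_index name in the commented part is hypothetical.

    import numpy as np
    import faiss   # GPU build assumed

    d = 64
    xb = np.random.rand(100000, d).astype('float32')

    cpu_index = faiss.IndexFlatL2(d)
    cpu_index.add(xb)

    # replicate over all visible GPUs (ngpu=-1 means "use them all")
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)

    # or shard an IVF index instead of replicating it:
    # co = faiss.GpuMultipleClonerOptions()
    # co.shard = True
    # co.shard_type = 1        # split by id modulo rather than by sequential ranges
    # gpu_index = faiss.index_cpu_to_all_gpus(cpu_ivf_index, co)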
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
return a return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
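A short round-trip example for the generalized vector_to_array / copy_array_to_vector helpers, here applied to the float storage of an IndexFlat; the index and data are illustrative.

    import numpy as np
    import faiss

    index = faiss.IndexFlatL2(8)
    index.add(np.random.rand(100, 8).astype('float32'))

    # pull the flat index storage out as a numpy array
    xb = faiss.vector_to_array(index.xb).reshape(index.ntotal, index.d)

    # round-trip a numpy array through a C++ std::vector<float>
    v = faiss.FloatVector()
    faiss.copy_array_to_vector(xb.ravel(), v)
    assert np.array_equal(faiss.vector_to_array(v), xb.ravel())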
class Kmeans: class Kmeans:
def __init__(self, d, k, niter=25, verbose=False, spherical = False): def __init__(self, d, k, niter=25, verbose=False, spherical = False):
...@@ -364,3 +418,18 @@ def eval_intersection(I1, I2): ...@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):
def normalize_L2(x): def normalize_L2(x):
fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x)) fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
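A small sketch of the MapLong2Long wrappers added above, assuming the MapLong2Long type is exposed by the SWIG module in this build; keys and values are illustrative. search_multiple returns a uint64 array per the wrapper.

    import numpy as np
    import faiss

    m = faiss.MapLong2Long()
    keys = np.arange(10, dtype='int64')
    vals = (keys * 2 + 1).astype('int64')

    m.add(keys, vals)                      # wrapped by replacement_map_add
    found = m.search_multiple(keys)        # uint64 array, one entry per key
    assert np.all(found.astype('int64') == vals)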
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuAutoTune.h" #include "GpuAutoTune.h"
#include <typeinfo>
#include "GpuIndex.h" #include "GpuIndex.h"
#include "../FaissAssert.h" #include "../FaissAssert.h"
...@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index) ...@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)
GpuClonerOptions::GpuClonerOptions():
indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(true),
reserveVecs(0),
storeTransposed(false),
verbose(0)
{}
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions { struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
GpuResources *resources; GpuResources *resources;
int device; int device;
...@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu( ...@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
return cl.clone_Index(index); return cl.clone_Index(index);
} }
GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
{}
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners; std::vector<ToGpuCloner> sub_cloners;
...@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
{} {}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
Index *clone_Index(const Index *index) override { Index *clone_Index(const Index *index) override {
long n = sub_cloners.size(); long n = sub_cloners.size();
if (n == 1) if (n == 1)
...@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
dynamic_cast<const faiss::IndexIVFPQ *>(index); dynamic_cast<const faiss::IndexIVFPQ *>(index);
auto index_ivfflat = auto index_ivfflat =
dynamic_cast<const faiss::IndexIVFFlat *>(index); dynamic_cast<const faiss::IndexIVFFlat *>(index);
FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat, FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
"IndexShards implemented only for " "IndexShards implemented only for "
"IndexIVFFlat or IndexIVFPQ"); "IndexIVFFlat or IndexIVFPQ");
std::vector<faiss::Index*> shards(n); std::vector<faiss::Index*> shards(n);
for(long i = 0; i < n; i++) { for(long i = 0; i < n; i++) {
// make a shallow copy // make a shallow copy
long i0 = i * index->ntotal / n;
long i1 = (i + 1) * index->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
if(reserveVecs) if(reserveVecs)
sub_cloners[i].reserveVecs = sub_cloners[i].reserveVecs =
(reserveVecs + n - 1) / n; (reserveVecs + n - 1) / n;
...@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
idx2.nprobe = index_ivfpq->nprobe; idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0; idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained; idx2.is_trained = index->is_trained;
index_ivfpq->copy_subset_to(idx2, 2, i0, i1); copy_ivf_shard (index_ivfpq, &idx2, n, i);
FAISS_ASSERT(idx2.ntotal == i1 - i0);
shards[i] = sub_cloners[i].clone_Index(&idx2); shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) { } else if (index_ivfflat) {
faiss::IndexIVFFlat idx2( faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer, index->d, index_ivfflat->quantizer, index->d,
index_ivfflat->nlist, index_ivfflat->metric_type); index_ivfflat->nlist, index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe; idx2.nprobe = index_ivfflat->nprobe;
index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
idx2.nprobe = index_ivfflat->nprobe; idx2.nprobe = index_ivfflat->nprobe;
copy_ivf_shard (index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2); shards[i] = sub_cloners[i].clone_Index(&idx2);
} }
} }
faiss::IndexShards *res = faiss::IndexShards *res =
new faiss::IndexShards(index->d, true, false); new faiss::IndexShards(index->d, true, false);
...@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index) ...@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter ( void GpuParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const Index * index, const std::string & name, double val) const
{ {
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexProxy)) { if (DC (IndexProxy)) {
for (int i = 0; i < ix->count(); i++) for (int i = 0; i < ix->count(); i++)
set_index_parameter (ix->at(i), name, val); set_index_parameter (ix->at(i), name, val);
return; return;
} }
if (DC (faiss::IndexShards)) { if (DC (GpuIndexIVF)) {
for (auto sub_index : ix->shard_indexes) if (name == "nprobe") {
set_index_parameter (sub_index, name, val); ix->setNumProbes (int (val));
return; return;
} }
if (name == "nprobe") {
DC (GpuIndexIVF);
FAISS_ASSERT(ix);
ix->setNumProbes (int (val));
return;
} }
if (name == "use_precomputed_table") { if(DC (GpuIndexIVFPQ)) {
DC (GpuIndexIVFPQ); if (name == "use_precomputed_table") {
FAISS_ASSERT(ix); ix->setPrecomputedCodes(bool (val));
ix->setPrecomputedCodes(bool (val)); return;
return; }
} }
FAISS_ASSERT_MSG (false, "unknown parameter"); // maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val);
} }
......
...@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions() ...@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
} }
GpuMultipleClonerOptions::GpuMultipleClonerOptions() GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false) { : shard(false),
shard_type(1)
{
} }
} } // namespace } } // namespace
...@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions { ...@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
/// Whether to shard the index across GPUs, versus replication /// Whether to shard the index across GPUs, versus replication
/// across GPUs /// across GPUs
bool shard; bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
}; };
} } // namespace } } // namespace
...@@ -26,7 +26,7 @@ struct GpuIndexConfig { ...@@ -26,7 +26,7 @@ struct GpuIndexConfig {
/// GPU device on which the index is resident /// GPU device on which the index is resident
int device; int device;
/// What memory space to use for primary storae. /// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use /// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU. /// more memory than is available on the GPU.
MemorySpace memorySpace; MemorySpace memorySpace;
......
...@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { ...@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
} }
index->quantizer = q; index->quantizer = q;
index->quantizer_trains_alone = false; index->quantizer_trains_alone = 0;
index->own_fields = true; index->own_fields = true;
index->cp = this->cp; index->cp = this->cp;
index->ids.clear(); index->ids.clear();
......
...@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { ...@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
FAISS_ASSERT(index->pq.byte_per_idx == 1); FAISS_ASSERT(index->pq.byte_per_idx == 1);
FAISS_ASSERT(index->by_residual); FAISS_ASSERT(index->by_residual);
FAISS_ASSERT(index->polysemous_ht == 0); FAISS_ASSERT(index->polysemous_ht == 0);
ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;
verifySettings_(); verifySettings_();
......
This diff is collapsed.
This diff is collapsed.
...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources, ...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can // Do we care about `outDistances`? If not, we can
// take shortcuts. // take shortcuts.
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
/// Calculates brute-force inner product distance between `vectors` /// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen /// and `queries`, returning the k closest results seen
...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources, ...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources, void runIPDistance(GpuResources* resources,
...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources, ...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm);
int tileSizeOverride = -1);
void runL2Distance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors, Tensor<half, 2, true>& vectors,
...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources, ...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
int tileSizeOverride = -1);
#endif #endif
} } // namespace } } // namespace
...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
auto stream = resources_->getDefaultStreamCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice(); auto& mem = resources_->getMemoryManagerCurrentDevice();
...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf( DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize); query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) { if (exactDistance) {
// Convert outDistances back // Convert outDistances back
...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances, outDistances,
outIndices, outIndices,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectors_, vectors_,
...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input, input,
k, k,
outDistances, outDistances,
outIndices, outIndices);
tileSize);
} }
} }
} }
...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
FAISS_ASSERT(useFloat16_); FAISS_ASSERT(useFloat16_);
if (l2Distance_) { if (l2Distance_) {
...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectorsHalf_, vectorsHalf_,
...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_);
tileSize);
} }
} }
#endif #endif
...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { ...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(), rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(), devDataHalf.getSizeInBytes(),
stream); stream,
true /* reserve exactly */);
#endif #endif
} else { } else {
rawData_.append((char*) data, rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float), (size_t) dim_ * numVecs * sizeof(float),
stream); stream,
true /* reserve exactly */);
} }
num_ += numVecs; num_ += numVecs;
......
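Aside: the extra `true /* reserve exactly */` argument to rawData_.append presumably asks the buffer to allocate exactly the bytes this append needs instead of growing geometrically, which keeps large adds from overshooting device memory. A standalone C++ sketch of that trade-off (illustrative only, not the DeviceVector API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Sketch only: contrast geometric growth (amortized appends, with capacity
    // overshoot) against reserving exactly the bytes each append needs.
    int main() {
      std::vector<char> geometric, exact;
      const std::size_t appendBytes = 1 << 20;   // 1 MiB per append
      for (int i = 0; i < 4; ++i) {
        geometric.resize(geometric.size() + appendBytes);    // grows on its own
        exact.reserve(exact.size() + appendBytes);           // reserve exactly
        exact.resize(exact.size() + appendBytes);
        std::printf("append %d: geometric capacity %zu, exact capacity %zu\n",
                    i, geometric.capacity(), exact.capacity());
      }
      return 0;
    }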
...@@ -61,16 +61,14 @@ class FlatIndex { ...@@ -61,16 +61,14 @@ class FlatIndex {
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs, void query(Tensor<half, 2, true>& vecs,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#endif #endif
/// Add vectors to ourselves; the pointer passed can be on the host /// Add vectors to ourselves; the pointer passed can be on the host
......
...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs, ...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView, closestSubQDistanceView,
closestSubQIndexView, closestSubQIndexView,
// We don't care about distances // We don't care about distances
true, true);
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
} }
// Now, we have the nearest sub-q centroid for each slice of the // Now, we have the nearest sub-q centroid for each slice of the
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir> template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void __global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
...@@ -38,9 +35,9 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, ...@@ -38,9 +35,9 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
auto queryId = blockIdx.y; auto queryId = blockIdx.y;
......
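Aside on the kMin/kMax removal above: std::numeric_limits<float>::min() is the smallest positive normalized float, not the most negative value, so it is a poor "smallest" sentinel for a descending selection; the kFloatMin/kFloatMax constants now taken from Limits.cuh presumably carry the intended +/- largest-magnitude values. Standalone illustration:

    #include <cfloat>
    #include <cstdio>
    #include <limits>

    int main() {
      // min() is the smallest positive normalized value, not the most negative one.
      std::printf("numeric_limits<float>::min()    = %g\n",
                  std::numeric_limits<float>::min());      // ~1.17549e-38
      std::printf("numeric_limits<float>::lowest() = %g\n",
                  std::numeric_limits<float>::lowest());   // ~-3.40282e+38
      std::printf("-FLT_MAX                        = %g\n", -FLT_MAX);
      return 0;
    }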
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step // This is warp divergence central, but this is really a final step
// and happening a small number of times // and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets, inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances, ...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu { ...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half) // T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g., // TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2) // float4, half2)
template <typename T, typename TVec, template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared> int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input, __global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true> output) { Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte; T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize); TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId(); TIndex laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize; TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x; TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize]; T rowNorm[RowTileSize];
if (lastRowTile) { if (lastRowTile) {
// We are handling the very end of the input matrix rows // We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) { for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) { if (NormLoop) {
rowNorm[0] = Math<T>::zero(); rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col]; TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val); val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val)); rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero(); rowNorm[row] = Math<T>::zero();
} }
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll #pragma unroll
for (int row = 0; row < RowTileSize; ++row) { for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col]; tmp[row] = input[rowStart + row][col];
...@@ -172,44 +174,44 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -172,44 +174,44 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
} }
} }
template <typename T, typename TVec> template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true>& input, void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true>& output, Tensor<T, 1, true, TIndex>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice(); TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8; constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \ #define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \ do { \
if (normLoop) { \ if (normLoop) { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} else { \ } else { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} \ } \
} while (0) } while (0)
if (input.template canCastResize<TVec>()) { if (input.template canCastResize<TVec>()) {
// Can load using the vectorized type // Can load using the vectorized type
auto inputV = input.template castResize<TVec>(); auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1); auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else { } else {
// Can't load using the vectorized type // Can't load using the vectorized type
int dim = input.getSize(1); auto dim = input.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input, ...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output, Tensor<float, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input, ...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output, Tensor<half, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
} }
#endif #endif
......
...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs"); ...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions"); DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors"); DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32"); DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed"); DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed"); DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use"); DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu; using namespace faiss::gpu;
...@@ -72,7 +75,10 @@ int main(int argc, char** argv) { ...@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config; GpuIndexFlatConfig config;
config.device = dev; config.device = dev;
config.useFloat16 = FLAGS_use_float16; config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed; config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>( auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config)); new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
...@@ -90,9 +96,9 @@ int main(int argc, char** argv) { ...@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
float cpuTime = 0.0f; if (FLAGS_cpu) {
float cpuTime = 0.0f;
{
CpuTimer timer; CpuTimer timer;
index->search(numQueries, index->search(numQueries,
cpuQuery.data(), cpuQuery.data(),
...@@ -101,10 +107,9 @@ int main(int argc, char** argv) { ...@@ -101,10 +107,9 @@ int main(int argc, char** argv) {
cpuIndices.data()); cpuIndices.data());
cpuTime = timer.elapsedMilliseconds(); cpuTime = timer.elapsedMilliseconds();
printf("CPU time %.3f ms\n", cpuTime);
} }
printf("CPU time %.3f ms\n", cpuTime);
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
...@@ -131,14 +136,14 @@ int main(int argc, char** argv) { ...@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop()); CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime); printf("GPU time %.3f ms\n", gpuTime);
compareLists(cpuDistances.data(), cpuIndices.data(), if (FLAGS_cpu) {
gpuDistances.data(), gpuIndices.data(), compareLists(cpuDistances.data(), cpuIndices.data(),
numQueries, FLAGS_k, gpuDistances.data(), gpuIndices.data(),
"", true, FLAGS_diff, false); numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize()); CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0; return 0;
} }
...@@ -21,29 +21,47 @@ ...@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f; constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f; constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2, struct TestFlatOptions {
bool useFloat16, TestFlatOptions()
bool useTransposed, : useL2(true),
int kOverride = -1) { useFloat16(false),
int numVecs = faiss::gpu::randVal(1000, 20000); useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800); int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512); int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k, // Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a // the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16 // fairly small `k` for float16
int k = useFloat16 ? int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) : std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs); std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) { if (opt.kOverride > 0) {
k = kOverride; k = opt.kOverride;
} }
faiss::IndexFlatIP cpuIndexIP(dim); faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim); faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex = faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP; opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have // Construct on a random device to test multi-device, if we have
// multiple devices // multiple devices
...@@ -55,14 +73,14 @@ void testFlat(bool useL2, ...@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config; faiss::gpu::GpuIndexFlatConfig config;
config.device = device; config.device = device;
config.useFloat16 = useFloat16; config.useFloat16 = opt.useFloat16;
config.storeTransposed = useTransposed; config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex = faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP; (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim); std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
...@@ -70,37 +88,53 @@ void testFlat(bool useL2, ...@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data()); gpuIndex->add(numVecs, vecs.data());
std::stringstream str; std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim << " dim " << dim
<< " useFloat16 " << useFloat16 << " useFloat16 " << opt.useFloat16
<< " transposed " << useTransposed << " transposed " << opt.useTransposed
<< " numQuery " << numQuery << " numQuery " << numQuery
<< " k " << k; << " k " << k;
// To some extent, we depend upon the relative error for the test // To some extent, we depend upon the relative error for the test
// for float16 // for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are // FIXME: the fp16 bounds are
// useless when math (the accumulator) is // useless when math (the accumulator) is
// in fp16. Figure out another way to test // in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f, opt.useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f); opt.useFloat16 ? 0.65f : 0.015f);
} }
TEST(TestGpuIndexFlat, IP_Float32) { TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) { ...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) { ...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
......
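The new L2_Tiling test (one million vectors) targets the "better tiling for large flat datasets" item in the commit message: a brute-force query cannot always materialize the full query-by-database distance matrix, so it is produced tile by tile and the k best results are merged across tiles. Illustrative arithmetic only; the actual tile sizes are chosen internally:

    #include <cstdio>

    // Sketch only: why brute-force distance computation gets tiled. The full
    // query x database distance matrix may not fit in temporary GPU memory,
    // so it is produced in tiles and the k best results are merged across tiles.
    int main() {
      const long long numQueries = 10000;
      const long long numVecs = 1000000;
      const long long bytesFull = numQueries * numVecs * 4;   // float32
      const long long tileQueries = 256, tileVecs = 65536;
      const long long bytesTile = tileQueries * tileVecs * 4;
      std::printf("full distance matrix: %.1f GB\n", bytesFull / 1e9);
      std::printf("one %lld x %lld tile: %.1f MB\n",
                  tileQueries, tileVecs, bytesTile / 1e6);
      return 0;
    }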
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h" #include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h" #include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { ...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false); copyToTest(false, false);
} }
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
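The Float32_negative test keeps the database in the negative orthant and the queries in the positive one, so every inner product is non-positive and the top hit is the least negative score, exercising max-selection over negative values. A tiny CPU illustration:

    #include <cstdio>
    #include <vector>

    // Sketch only: with all-negative database vectors and an all-positive query,
    // every inner product is <= 0; the best match is the least negative score.
    static float ip(const std::vector<float>& a, const std::vector<float>& b) {
      float s = 0.0f;
      for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
      return s;
    }

    int main() {
      std::vector<float> query = {0.5f, 0.25f};
      std::vector<float> d0 = {-0.1f, -0.1f};   // least negative IP -> top hit
      std::vector<float> d1 = {-2.0f, -3.0f};
      std::printf("ip(q, d0) = %f, ip(q, d1) = %f\n", ip(query, d0), ip(query, d1));
      return 0;
    }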
// //
// NaN tests // NaN tests
// //
......
...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) { ...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v; return v;
} }
void compareIndices(faiss::Index& refIndex, void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
const std::string& configMsg, const std::string& configMsg,
float maxRelativeError, float maxRelativeError,
float pctMaxDiff1, float pctMaxDiff1,
float pctMaxDiffN) { float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare // Compare
std::vector<float> refDistance(numQuery * k, 0); std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(), refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data()); k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0); std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(), testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data()); k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(), faiss::gpu::compareLists(refDistance.data(),
...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex, ...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN); maxRelativeError, pctMaxDiff1, pctMaxDiffN);
} }
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T> template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) { inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j]; return p[i * dim2 + j];
......
...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) { ...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1] /// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim); std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity /// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex, void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
......
...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None): ...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor) assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k) assert I.size() == (n, k)
assert I.is_contiguous() assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr() xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr() Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr() Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr), index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr), k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr)) faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I return D, I
......
...@@ -37,9 +37,9 @@ BLOCK_SELECT_DECL(float, false, 512); ...@@ -37,9 +37,9 @@ BLOCK_SELECT_DECL(float, false, 512);
BLOCK_SELECT_DECL(float, false, 1024); BLOCK_SELECT_DECL(float, false, 1024);
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outK, Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV, Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) { bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024); FAISS_ASSERT(k <= 1024);
if (dir) { if (dir) {
...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in, ...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace } } // namespace
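For what the new runBlockSelectPair entry points compute, a standalone CPU reference may help: unlike the plain select, which reports the positions of the k best keys, the pair variant carries a caller-supplied value along with each key. Sketch only; the GPU kernel uses the warp/block heap shown below rather than a sort:

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // Sketch only: keep the k best keys per row and their associated values.
    void selectPairRef(const std::vector<float>& inK, const std::vector<int>& inV,
                       int k, bool largest,
                       std::vector<float>& outK, std::vector<int>& outV) {
      std::vector<int> order(inK.size());
      std::iota(order.begin(), order.end(), 0);
      std::partial_sort(order.begin(), order.begin() + k, order.end(),
                        [&](int a, int b) {
                          return largest ? inK[a] > inK[b] : inK[a] < inK[b];
                        });
      outK.assign(k, 0.0f);
      outV.assign(k, -1);
      for (int i = 0; i < k; ++i) {
        outK[i] = inK[order[i]];
        outV[i] = inV[order[i]];
      }
    }

    int main() {
      std::vector<float> keys = {0.3f, 0.9f, 0.1f, 0.5f};
      std::vector<int> vals = {7, 3, 11, 5};
      std::vector<float> bestK;
      std::vector<int> bestV;
      selectPairRef(keys, vals, 2, /*largest=*/false, bestK, bestV);
      std::printf("keys: %f %f  values: %d %d\n",
                  bestK[0], bestK[1], bestV[0], bestV[1]);
      return 0;
    }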
...@@ -39,9 +39,9 @@ BLOCK_SELECT_DECL(half, false, 512); ...@@ -39,9 +39,9 @@ BLOCK_SELECT_DECL(half, false, 512);
BLOCK_SELECT_DECL(half, false, 1024); BLOCK_SELECT_DECL(half, false, 1024);
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outK, Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV, Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) { bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024); FAISS_ASSERT(k <= 1024);
if (dir) { if (dir) {
...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in, ...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif #endif
} } // namespace } } // namespace
...@@ -32,7 +32,7 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -32,7 +32,7 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
__shared__ IndexType smemV[kNumWarps * NumWarpQ]; __shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>, BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k); heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available // Grid is exactly sized to rows available
...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
} }
} }
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle last remainder fraction of a warp of elements
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys, Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys, Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif #endif
} } // namespace } } // namespace
...@@ -12,37 +12,37 @@ ...@@ -12,37 +12,37 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
this->operator=(std::move(t)); this->operator=(std::move(t));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) { if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
} }
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t)); std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner; this->state_ = t.state_; t.state_ = AllocState::NotOwner;
...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { ...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation // destructor will return the reservation
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
const IndexT sizes[Dim], const IndexT sizes[Dim],
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory); reservation_ = std::move(memory);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim], const IndexT strides[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) { cudaStream_t stream) {
if (this->data_) { if (this->data_) {
// Region must be contiguous // Region must be contiguous
......
...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu { ...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor(); __host__ ~DeviceTensor();
/// Move constructor /// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); __host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment /// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it /// Constructs a tensor of the given size, allocating memory for it
/// locally /// locally
...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally /// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary /// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager. /// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m, __host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Call to zero out memory /// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream); zero(cudaStream_t stream);
private: private:
......
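The Contig template parameter becomes InnerContig throughout the tensor classes; the name presumably reflects that only the innermost dimension is required to have stride 1 (outer rows may be padded), as opposed to fully contiguous, where every stride is the product of the trailing sizes. A standalone illustration of the two layouts:

    #include <cstdio>

    // Sketch only: element (r, c) offsets for a rows x cols matrix.
    //   fully contiguous : strides {cols, 1}
    //   inner-contiguous : strides {rowPitch, 1} with rowPitch >= cols
    // An inner-contiguous tensor only guarantees stride 1 on the innermost
    // dimension, so padded rows like this remain legal.
    int main() {
      const int cols = 6, rowPitch = 8, r = 2, c = 3;
      std::printf("(%d,%d): fully contiguous offset %d, inner-contiguous offset %d\n",
                  r, c, r * cols + c, r * rowPitch + c);
      return 0;
    }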
...@@ -43,7 +43,7 @@ void synchronizeAllDevices() { ...@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
} }
} }
cudaDeviceProp& getDeviceProperties(int device) { const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex; static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties; static std::unordered_map<int, cudaDeviceProp> properties;
...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) { ...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second; return it->second;
} }
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) { int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock; return getDeviceProperties(device).maxThreadsPerBlock;
} }
......
...@@ -31,7 +31,10 @@ int getNumDevices(); ...@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices(); void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device /// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device); const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU /// Returns the maximum number of threads available for the given GPU
/// device /// device
......
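getCurrentDeviceProperties simply returns the cached entry for the active device; the properties are kept in a static map so repeated runtime queries are avoided. A standalone CUDA host sketch of the underlying query being cached:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Sketch only: the raw CUDA runtime query that the cached helpers wrap.
    int main() {
      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) {
        std::printf("no CUDA device available\n");
        return 0;
      }
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        std::printf("failed to query device %d\n", device);
        return 1;
      }
      std::printf("device %d: %s, maxThreadsPerBlock %d\n",
                  device, prop.name, prop.maxThreadsPerBlock);
      return 0;
    }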
...@@ -10,18 +10,18 @@ ...@@ -10,18 +10,18 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() : HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_; delete[] this->data_;
...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { ...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim]) : const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) : cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) { state_(AllocState::Owner) {
// Only contiguous arrays handled for now // Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous()); FAISS_ASSERT(t.isContiguous());
...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( ...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
} }
/// Call to zero out memory /// Call to zero out memory
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous // Region must be contiguous
FAISS_ASSERT(this->isContiguous()); FAISS_ASSERT(this->isContiguous());
...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { ...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ T __host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements(); auto size = this->numElements();
FAISS_ASSERT(size == t.numElements()); FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0); FAISS_ASSERT(size > 0);
if (Contig) { if (InnerContig) {
auto a = this->data(); auto a = this->data();
auto b = t.data(); auto b = t.data();
......
...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu { ...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it /// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to /// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream. /// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Call to zero out memory /// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero(); __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors /// Returns the maximum difference seen between two tensors
__host__ T __host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const; maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal? /// Are the two tensors exactly equal?
__host__ bool __host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0); return (maxDiff(t) == (T) 0);
} }
......
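Note: the HostTensor hunks above rename the Contig template parameter to InnerContig. A hedged usage sketch of the copy constructor declared in this header, which allocates host memory and copies a fully contiguous GPU tensor back on the given stream; the include path and the caller-supplied tensor t are assumptions, not shown in this commit:

// Sketch: pull a 2-D float tensor from GPU memory to the host and read it.
// Assumes t is a contiguous faiss::gpu::Tensor<float, 2, true> on the GPU
// and stream is the CUDA stream its contents were produced on.
#include <faiss/gpu/utils/HostTensor.cuh>   // assumed include path
#include <cuda_runtime.h>

void inspectFirstElement(faiss::gpu::Tensor<float, 2, true>& t,
                         cudaStream_t stream) {
  faiss::gpu::HostTensor<float, 2, true> h(t, stream);  // allocates + copies
  cudaStreamSynchronize(stream);                        // wait for the copy
  float first = h.data()[0];                            // host-side access
  (void) first;
}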
...@@ -24,11 +24,12 @@ struct Limits { ...@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half // constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max(); constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <> template <>
struct Limits<float> { struct Limits<float> {
static __device__ __host__ inline float getMin() { static __device__ __host__ inline float getMin() {
return -kFloatMax; return kFloatMin;
} }
static __device__ __host__ inline float getMax() { static __device__ __host__ inline float getMax() {
return kFloatMax; return kFloatMax;
...@@ -55,8 +56,8 @@ struct Limits<half> { ...@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max(); constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <> template <>
struct Limits<int> { struct Limits<int> {
......
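Note: the Limits hunks above spell the float lower bound as std::numeric_limits<float>::lowest() instead of negating max(). For floating-point types the two are equal, but lowest() states the intent directly and avoids the classic min() trap: for float, min() is the smallest positive normal value, not the most negative one. A small host-only illustration (standard C++, nothing Faiss-specific):

#include <cassert>
#include <limits>

int main() {
  // float: lowest() == -max(), while min() is the smallest positive normal.
  static_assert(std::numeric_limits<float>::lowest() ==
                -std::numeric_limits<float>::max(), "float lowest == -max");
  assert(std::numeric_limits<float>::min() > 0.0f);

  // int: lowest() and min() are the same value, so either spelling works.
  static_assert(std::numeric_limits<int>::lowest() ==
                std::numeric_limits<int>::min(), "int lowest == min");
  return 0;
}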
...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK); FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN); FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in // Now, we have to represent the matrix multiplication in
// column-major layout // column-major layout
T* pA = transC ? a.data() : b.data(); T* pA = transC ? a.data() : b.data();
...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1); int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1); int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getSize(1) : a.getSize(1); int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getSize(1); int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2); int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2); int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getSize(2) : a.getSize(2); int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getSize(2); int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)}); HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)}); HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2); size_t aOffset = a.getStride(0);
size_t bOffset = b.getSize(1) * b.getSize(2); size_t bOffset = b.getStride(0);
size_t cOffset = c.getSize(1) * c.getSize(2); size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) { for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
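Note: the matrix-multiply hunks above stop assuming densely packed row-major tensors: the cuBLAS leading dimensions (lda/ldb/ldc) and the batch offsets now come from getStride() rather than getSize(), so views with padded rows multiply correctly. A reduced sketch of the same idea with plain cublasSgemm on a row-major A whose rows are padded; the sizes and helper name are illustrative, not Faiss API. Treating a row-major matrix X as the column-major matrix X^T lets one gemm call compute C^T = B^T * A^T, and the leading dimension passed for each operand is its row stride, not its logical row length:

// Sketch: C (2x2) = A (2x3, rows padded to stride 4) * B (3x2), row-major on GPU.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <vector>

void paddedGemm(cublasHandle_t handle) {
  const int m = 2, k = 3, n = 2;
  const int strideA = 4, strideB = 2, strideC = 2;   // row strides; A is padded

  std::vector<float> hA = {1, 2, 3, 0,               // row 0 + one padding slot
                           4, 5, 6, 0};              // row 1 + one padding slot
  std::vector<float> hB = {1, 0,
                           0, 1,
                           1, 1};
  float *dA, *dB, *dC;
  cudaMalloc(&dA, hA.size() * sizeof(float));
  cudaMalloc(&dB, hB.size() * sizeof(float));
  cudaMalloc(&dC, m * strideC * sizeof(float));
  cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);

  const float alpha = 1.0f, beta = 0.0f;
  // Column-major view: C^T (n x m) = B^T (n x k) * A^T (k x m).
  cublasStatus_t st = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                  n, m, k,
                                  &alpha,
                                  dB, strideB,       // ld = row stride of B
                                  dA, strideA,       // ld = row stride of A (padded)
                                  &beta,
                                  dC, strideC);      // ld = row stride of C
  assert(st == CUBLAS_STATUS_SUCCESS);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
}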
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int> template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor { class NoTypeTensor {
public: public:
NoTypeTensor() NoTypeTensor()
...@@ -25,7 +25,7 @@ class NoTypeTensor { ...@@ -25,7 +25,7 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t) NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()), : mem_(t.data()),
typeSize_(sizeof(T)) { typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
...@@ -87,13 +87,14 @@ class NoTypeTensor { ...@@ -87,13 +87,14 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() { Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_); FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_); return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
} }
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) { NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_; char* newPtr = (char*) mem_;
if (start > 0) { if (start > 0) {
...@@ -110,7 +111,7 @@ class NoTypeTensor { ...@@ -110,7 +111,7 @@ class NoTypeTensor {
} }
} }
return NoTypeTensor<Dim, Contig, IndexT>( return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_); newPtr, typeSize_, newSize, stride_);
} }
......
...@@ -19,26 +19,26 @@ ...@@ -19,26 +19,26 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T> template <typename T, typename IndexT>
struct TensorInfo { struct TensorInfo {
static constexpr int kMaxDims = 8; static constexpr int kMaxDims = 8;
T* data; T* data;
int sizes[kMaxDims]; IndexT sizes[kMaxDims];
int strides[kMaxDims]; IndexT strides[kMaxDims];
int dims; int dims;
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset { struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
unsigned int offset = 0; IndexT offset = 0;
#pragma unroll #pragma unroll
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i]; IndexT curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i]; IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset; offset += curDimOffset;
...@@ -51,21 +51,21 @@ struct TensorInfoOffset { ...@@ -51,21 +51,21 @@ struct TensorInfoOffset {
} }
}; };
template <typename T> template <typename T, typename IndexT>
struct TensorInfoOffset<T, -1> { struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
return linearId; return linearId;
} }
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info; TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i); info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = t.getStride(i); info.strides[i] = (IndexT) t.getStride(i);
} }
info.data = t.data(); info.data = t.data();
...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { ...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info; return info;
} }
template <typename T, int DimInput, int DimOutput> template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input, __global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T> output, TensorInfo<T, IndexT> output,
unsigned int totalSize) { IndexT totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
if (linearThreadId >= totalSize) { i += gridDim.x + blockDim.x) {
return; auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
} auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]); output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else #else
output.data[outputOffset] = input.data[inputOffset]; output.data[outputOffset] = input.data[inputOffset];
#endif #endif
}
} }
/// Performs an out-of-place transposition between any two dimensions. /// Performs an out-of-place transposition between any two dimensions.
...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2, int dim1, int dim2,
Tensor<T, Dim, true>& out, Tensor<T, Dim, true>& out,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions"); static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2); FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim); FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]); FAISS_ASSERT(out.getSize(i) == outSize[i]);
} }
auto inInfo = getTensorInfo<T, Dim>(in); size_t totalSize = in.numElements();
auto outInfo = getTensorInfo<T, Dim>(out); size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int totalSize = in.numElements(); transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize); std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
auto grid = dim3(utils::divUp(totalSize, numThreads)); std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
auto block = dim3(numThreads);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>( auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
inInfo, outInfo, totalSize);
transposeAny<T, unsigned long, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR(); CUDA_TEST_ERROR();
} }
......
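Note: transposeAny is rewritten above from one element per thread (with an early return) into a grid-stride loop, so runTransposeAny can cap the grid at 4096 blocks and still cover arbitrarily large tensors, dispatching on 32- vs 64-bit index math by element count. For reference, a minimal stand-alone grid-stride copy kernel written the canonical way; note the per-iteration stride is the total thread count, gridDim.x * blockDim.x:

// Sketch: canonical grid-stride loop. Each thread starts at its global id and
// advances by the total number of launched threads until the array is covered.
#include <cuda_runtime.h>
#include <cstddef>

__global__ void copyKernel(const float* in, float* out, size_t n) {
  size_t stride = (size_t) gridDim.x * blockDim.x;       // total launched threads
  for (size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
       i < n;
       i += stride) {
    out[i] = in[i];
  }
}

// Launch sketch: cap the grid so huge inputs do not require huge grids,
// mirroring the 4096-block cap used in runTransposeAny above.
void runCopy(const float* in, float* out, size_t n, cudaStream_t stream) {
  int block = 256;
  size_t grid = (n + block - 1) / block;
  if (grid > 4096) grid = 4096;
  copyKernel<<<(unsigned) grid, block, 0, stream>>>(in, out, n);
}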
...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { ...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
} }
static void write_ivf_header (const IndexIVF * ivf, FILE *f, static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) { bool include_ids = true) {
write_index_header (ivf, f); write_index_header (ivf, f);
...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) { ...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained); READVECTOR (ivsc->trained);
} }
ProductQuantizer * read_ProductQuantizer (const char*fname) { ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r"); FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname); FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) { ...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
} }
idx = idxmap; idx = idxmap;
} else { } else {
fprintf (stderr, "Index type 0x%08x not supported\n", h); FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
abort (); idx = nullptr;
} }
return idx; return idx;
} }
......
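Note: the last hunk above makes read_index throw (via FAISS_THROW_FMT) on an unsupported index header instead of printing to stderr and calling abort(), so callers can recover. A hedged sketch of catching it from C++; faiss::FaissException derives from std::exception, but the include paths and the read_index(const char*) overload used here are assumptions about the surrounding library layout:

// Sketch: load an index file and survive an unsupported or garbled header.
#include <faiss/Index.h>           // assumed include path
#include <faiss/index_io.h>        // assumed include path for read_index
#include <faiss/FaissException.h>  // assumed include path for FaissException
#include <cstdio>
#include <memory>

std::unique_ptr<faiss::Index> tryReadIndex(const char* fname) {
  try {
    return std::unique_ptr<faiss::Index>(faiss::read_index(fname));
  } catch (const faiss::FaissException& e) {
    std::fprintf(stderr, "could not load %s: %s\n", fname, e.what());
    return nullptr;                // caller decides how to proceed
  }
}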
...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase): ...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else: else:
assert False, 'should have raised an exception' assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
class TestUpdate(unittest.TestCase): class TestUpdate(unittest.TestCase):
......