Unverified commit afe0fdc1, authored by Lucas Hosseini, committed by GitHub

Facebook sync (Mar 2019) (#756)

Facebook sync (Mar 2019)

- MatrixStats object
- option to round coordinates during k-means optimization
- alternative option for search in HNSW
- moved stats and imbalance_factor of IndexIVF to InvertedLists object
- range search for IVFScalarQuantizer
- direct uint8 codec in ScalarQuantizer
- renamed IndexProxy to IndexReplicas and moved to main Faiss
- better support for PQ code assignment with external index
- support for IMI2x16 (4B virtual centroids!)
- support for k = 2048 search on GPU (instead of 1024)
- most CUDA mem alloc failures throw exceptions instead of terminating on an assertion
- support for renaming an on-disk InvertedLists
- interrupt computations with ctrl-C in python
parent a9959bf6
......@@ -15,6 +15,8 @@
#include "AutoTune.h"
#include <cmath>
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
#include "FaissAssert.h"
#include "utils.h"
......@@ -992,5 +994,235 @@ IndexBinary *index_binary_factory(int d, const char *description)
return index;
}
/*********************************************************************
* MatrixStats
*********************************************************************/
MatrixStats::PerDimStats::PerDimStats():
n(0), n_nan(0), n_inf(0), n0(0),
min(HUGE_VALF), max(-HUGE_VALF),
sum(0), sum2(0),
mean(NAN), stddev(NAN)
{}
void MatrixStats::PerDimStats::add (float x)
{
n++;
if (std::isnan(x)) {
n_nan++;
return;
}
if (!std::isfinite(x)) {
n_inf++;
return;
}
if (x == 0) n0++;
if (x < min) min = x;
if (x > max) max = x;
sum += x;
sum2 += (double)x * (double)x;
}
void MatrixStats::PerDimStats::compute_mean_std ()
{
n_valid = n - n_nan - n_inf;
mean = sum / n_valid;
double var = sum2 / n_valid - mean * mean;
if (var < 0) var = 0;
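// (var = E[x^2] - mean^2 can dip slightly below zero through floating-point
// rounding, hence the clamp at zero.)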
stddev = sqrt(var);
}
void MatrixStats::do_comment (const char *fmt, ...)
{
va_list ap;
/* Determine required size */
va_start(ap, fmt);
size_t size = vsnprintf(buf, nbuf, fmt, ap);
va_end(ap);
nbuf -= size;
buf += size;
}
MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
n(n), d(d),
n_collision(0), n_valid(0), n0(0),
min_norm2(HUGE_VAL), max_norm2(0)
{
std::vector<char> comment_buf (10000);
buf = comment_buf.data ();
nbuf = comment_buf.size();
do_comment ("analyzing %ld vectors of size %ld\n", n, d);
if (d > 1024) {
do_comment (
"indexing this many dimensions is hard, "
"please consider dimensionality reducution (with PCAMatrix)\n");
}
size_t nbytes = sizeof (x[0]) * d;
per_dim_stats.resize (d);
for (size_t i = 0; i < n; i++) {
const float *xi = x + d * i;
double sum2 = 0;
for (size_t j = 0; j < d; j++) {
per_dim_stats[j].add (xi[j]);
sum2 += xi[j] * (double)xi[j];
}
if (std::isfinite (sum2)) {
n_valid++;
if (sum2 == 0) {
n0 ++;
} else {
if (sum2 < min_norm2) min_norm2 = sum2;
if (sum2 > max_norm2) max_norm2 = sum2;
}
}
{ // check hash
uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
auto elt = occurrences.find (hash);
if (elt == occurrences.end()) {
Occurrence occ = {i, 1};
occurrences[hash] = occ;
} else {
if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
elt->second.count ++;
} else {
n_collision ++;
// ideally we would keep a list of the collisions, but that is overkill here
}
}
}
}
// invalid vector stats
if (n_valid == n) {
do_comment ("no NaN or Infs in data\n");
} else {
do_comment ("%ld vectors contain NaN or Inf "
"(or have too large components), "
"expect bad results with indexing!\n", n - n_valid);
}
// copies in dataset
if (occurrences.size() == n) {
do_comment ("all vectors are distinct\n");
} else {
do_comment ("%ld vectors are distinct (%.2f%%)\n",
occurrences.size(),
occurrences.size() * 100.0 / n);
if (n_collision > 0) {
do_comment ("%ld collisions in hash table, "
"counts may be invalid\n", n_collision);
}
Occurrence max = {0, 0};
for (auto it = occurrences.begin();
it != occurrences.end(); ++it) {
if (it->second.count > max.count) {
max = it->second;
}
}
do_comment ("vector %ld has %ld copies\n", max.first, max.count);
}
{ // norm stats
min_norm2 = sqrt (min_norm2);
max_norm2 = sqrt (max_norm2);
do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
min_norm2, max_norm2, n0);
if (max_norm2 < min_norm2 * 1.0001) {
do_comment ("vectors are normalized, inner product and "
"L2 search are equivalent\n");
}
if (max_norm2 > min_norm2 * 100) {
do_comment ("vectors have very large differences in norms, "
"is this normal?\n");
}
}
{ // per dimension stats
double max_std = 0, min_std = HUGE_VAL;
size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
for (size_t j = 0; j < d; j++) {
PerDimStats &st = per_dim_stats[j];
st.compute_mean_std ();
n0 += st.n0;
if (st.max == st.min) {
n_0_range ++;
} else if (st.max < 1.001 * st.min) {
n_dangerous_range ++;
}
if (st.stddev > max_std) max_std = st.stddev;
if (st.stddev < min_std) min_std = st.stddev;
}
if (n0 == 0) {
do_comment ("matrix contains no 0s\n");
} else {
do_comment ("matrix contains %.2f %% 0 entries\n",
n0 * 100.0 / (n * d));
}
if (n_0_range == 0) {
do_comment ("no constant dimensions\n");
} else {
do_comment ("%ld dimensions are constant: they can be removed\n",
n_0_range);
}
if (n_dangerous_range == 0) {
do_comment ("no dimension has a too large mean\n");
} else {
do_comment ("%ld dimensions are too large "
"wrt. their variance, may loose precision "
"in IndexFlatL2 (use CenteringTransform)\n",
n_dangerous_range);
}
do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
size_t n_small_var = 0;
for (size_t j = 0; j < d; j++) {
const PerDimStats &st = per_dim_stats[j];
if (st.stddev < max_std * 1e-4) {
n_small_var++;
}
}
if (n_small_var > 0) {
do_comment ("%ld dimensions have negligible stddev wrt. "
"the largest dimension, they could be ignored",
n_small_var);
}
}
comments = comment_buf.data ();
buf = nullptr;
nbuf = 0;
}
} // namespace faiss
......@@ -12,6 +12,7 @@
#define FAISS_AUTO_TUNE_H
#include <vector>
#include <unordered_map>
#include "Index.h"
#include "IndexBinary.h"
......@@ -209,6 +210,50 @@ Index *index_factory (int d, const char *description,
IndexBinary *index_binary_factory (int d, const char *description);
/** Reports some statistics on a dataset and comments on them.
*
* It is a class rather than a function so that all stats can also be
* accessed from code */
struct MatrixStats {
MatrixStats (size_t n, size_t d, const float *x);
std::string comments;
// raw statistics
size_t n, d;
size_t n_collision, n_valid, n0;
double min_norm2, max_norm2;
struct PerDimStats {
size_t n, n_nan, n_inf, n0;
float min, max;
double sum, sum2;
size_t n_valid;
double mean, stddev;
PerDimStats();
void add (float x);
void compute_mean_std ();
};
std::vector<PerDimStats> per_dim_stats;
struct Occurrence {
size_t first;
size_t count;
};
std::unordered_map<uint64_t, Occurrence> occurrences;
char *buf;
size_t nbuf;
void do_comment (const char *fmt, ...);
};
} // namespace faiss
......
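The MatrixStats object can be driven directly from user code. A minimal usage sketch (the report_stats helper and its arguments are hypothetical, not part of the diff):

#include <cstdio>
#include "AutoTune.h"  // MatrixStats is declared here

void report_stats (size_t n, size_t d, const float *x) {
    faiss::MatrixStats stats (n, d, x);       // analyze n vectors of dimension d
    printf ("%s", stats.comments.c_str ());   // human-readable report
    // the raw statistics remain accessible, e.g. the spread of dimension 0:
    float spread = stats.per_dim_stats[0].max - stats.per_dim_stats[0].min;
    (void) spread;
}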
......@@ -8,11 +8,12 @@
// -*- c++ -*-
#include <cstring>
#include "AuxIndexStructures.h"
#include "FaissAssert.h"
#include <cstring>
namespace faiss {
......@@ -72,13 +73,22 @@ BufferList::~BufferList ()
}
}
void BufferList::add (idx_t id, float dis) {
if (wp == buffer_size) { // need new buffer
append_buffer();
}
Buffer & buf = buffers.back();
buf.ids [wp] = id;
buf.dis [wp] = dis;
wp++;
}
void BufferList::append_buffer ()
{
Buffer buf = {new idx_t [buffer_size], new float [buffer_size]};
buffers.push_back (buf);
wp = 0;
Buffer buf = {new idx_t [buffer_size], new float [buffer_size]};
buffers.push_back (buf);
wp = 0;
}
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
......@@ -97,7 +107,7 @@ void BufferList::copy_range (size_t ofs, size_t n,
dest_dis += ncopy;
ofs = 0;
bno ++;
n -= ncopy;
n -= ncopy;
}
}
......@@ -106,6 +116,12 @@ void BufferList::copy_range (size_t ofs, size_t n,
* RangeSearchPartialResult
***********************************************************************/
void RangeQueryResult::add (float dis, idx_t id) {
nres++;
pres->add (id, dis);
}
RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in):
BufferList(res_in->buffer_size),
......@@ -114,10 +130,10 @@ RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in):
/// begin a new result
RangeSearchPartialResult::QueryResult &
RangeQueryResult &
RangeSearchPartialResult::new_result (idx_t qno)
{
QueryResult qres = {qno, 0, this};
RangeQueryResult qres = {qno, 0, this};
queries.push_back (qres);
return queries.back();
}
......@@ -140,7 +156,7 @@ void RangeSearchPartialResult::finalize ()
void RangeSearchPartialResult::set_lims ()
{
for (int i = 0; i < queries.size(); i++) {
QueryResult & qres = queries[i];
RangeQueryResult & qres = queries[i];
res->lims[qres.qno] = qres.nres;
}
}
......@@ -150,7 +166,7 @@ void RangeSearchPartialResult::set_result (bool incremental)
{
size_t ofs = 0;
for (int i = 0; i < queries.size(); i++) {
QueryResult & qres = queries[i];
RangeQueryResult & qres = queries[i];
copy_range (ofs, qres.nres,
res->labels + res->lims[qres.qno],
......@@ -246,6 +262,38 @@ size_t VectorIOReader::operator()(
}
/***********************************************************
* Interrupt callback
***********************************************************/
std::unique_ptr<InterruptCallback> InterruptCallback::instance;
void InterruptCallback::check () {
if (!instance.get()) {
return;
}
if (instance->want_interrupt ()) {
FAISS_THROW_MSG ("computation interrupted");
}
}
bool InterruptCallback::is_interrupted () {
if (!instance.get()) {
return false;
}
return instance->want_interrupt();
}
size_t InterruptCallback::get_period_hint (size_t flops) {
if (!instance.get()) {
return 1L << 30; // never check
}
// for 10M flops, it is reasonable to check once every 10 iterations
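// e.g. flops = 10M per iteration gives 1e8 / 1e7 = 10, i.e. check every 10
// iterations; cheaper iterations get a proportionally longer period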
return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1);
}
......
......@@ -18,6 +18,7 @@
#include <vector>
#include <unordered_set>
#include <memory>
#include "Index.h"
......@@ -117,16 +118,7 @@ struct BufferList {
// create a new buffer
void append_buffer ();
inline void add (idx_t id, float dis)
{
if (wp == buffer_size) { // need new buffer
append_buffer();
}
Buffer & buf = buffers.back();
buf.ids [wp] = id;
buf.dis [wp] = dis;
wp++;
}
void add (idx_t id, float dis);
/// copy elements ofs:ofs+n-1 seen as linear data in the buffers to
/// tables dest_ids, dest_dis
......@@ -135,7 +127,17 @@ struct BufferList {
};
struct RangeSearchPartialResult;
/// result structure for a single query
struct RangeQueryResult {
using idx_t = Index::idx_t;
idx_t qno;
size_t nres;
RangeSearchPartialResult * pres;
void add (float dis, idx_t id);
};
/// the entries in the buffers are split per query
struct RangeSearchPartialResult: BufferList {
......@@ -143,21 +145,10 @@ struct RangeSearchPartialResult: BufferList {
explicit RangeSearchPartialResult (RangeSearchResult * res_in);
/// result structure for a single query
struct QueryResult {
idx_t qno;
size_t nres;
RangeSearchPartialResult * pres;
inline void add (float dis, idx_t id) {
nres++;
pres->add (id, dis);
}
};
std::vector<QueryResult> queries;
std::vector<RangeQueryResult> queries;
/// begin a new result
QueryResult & new_result (idx_t qno);
RangeQueryResult & new_result (idx_t qno);
void finalize ();
......@@ -173,7 +164,6 @@ struct RangeSearchPartialResult: BufferList {
* Abstract I/O objects
***********************************************************/
struct IOReader {
// name that can be used in error messages
std::string name;
......@@ -214,6 +204,57 @@ struct VectorIOWriter:IOWriter {
size_t operator()(const void *ptr, size_t size, size_t nitems) override;
};
/***********************************************************
* The distance computer maintains a current query and computes
* distances to elements in an index that supports random access.
*
* The DistanceComputer is not intended to be thread-safe (e.g. because
* it maintains counters), so the distance functions are not const;
* instantiate one per thread if needed.
***********************************************************/
struct DistanceComputer {
using idx_t = Index::idx_t;
/// called before computing distances
virtual void set_query(const float *x) = 0;
/// compute distance of vector i to current query
virtual float operator () (idx_t i) = 0;
/// compute distance between two stored vectors
virtual float symmetric_dis (idx_t i, idx_t j) = 0;
virtual ~DistanceComputer() {}
};
/***********************************************************
* Interrupt callback
***********************************************************/
struct InterruptCallback {
virtual bool want_interrupt () = 0;
virtual ~InterruptCallback() {}
static std::unique_ptr<InterruptCallback> instance;
/** check if:
* - an interrupt callback is set
* - the callback returns true
* if this is the case, then throw an exception
*/
static void check ();
/// same as check(), but returns true if interrupted instead of
/// throwing
static bool is_interrupted ();
/** assuming each iteration takes a certain number of flops, what
* is a reasonable interval to check for interrupts?
*/
static size_t get_period_hint (size_t flops);
};
}; // namespace faiss
......
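A minimal sketch of a user-defined callback (TimeoutCallback is hypothetical; it assumes faiss::getmillisecs() from utils.h):

#include "AuxIndexStructures.h"
#include "utils.h"

// interrupt any long Faiss computation once a wall-clock budget is spent
struct TimeoutCallback: faiss::InterruptCallback {
    double deadline_ms;
    explicit TimeoutCallback (double budget_ms):
        deadline_ms (faiss::getmillisecs () + budget_ms) {}
    bool want_interrupt () override {
        return faiss::getmillisecs () > deadline_ms;
    }
};

// install it; InterruptCallback::check () then throws once the budget is spent:
// faiss::InterruptCallback::instance.reset (new TimeoutCallback (5000.0));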
......@@ -9,6 +9,7 @@
// -*- c++ -*-
#include "Clustering.h"
#include "AuxIndexStructures.h"
#include <cmath>
......@@ -24,7 +25,9 @@ namespace faiss {
ClusteringParameters::ClusteringParameters ():
niter(25),
nredo(1),
verbose(false), spherical(false),
verbose(false),
spherical(false),
int_centroids(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
......@@ -58,7 +61,18 @@ static double imbalance_factor (int n, int k, long *assign) {
return uf;
}
void Clustering::post_process_centroids ()
{
if (spherical) {
fvec_renorm_L2 (d, k, centroids.data());
}
if (int_centroids) {
for (size_t i = 0; i < centroids.size(); i++)
centroids[i] = roundf (centroids[i]);
}
}
void Clustering::train (idx_t nx, const float *x_in, Index & index) {
......@@ -117,9 +131,6 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
"redo %d times, %d iterations\n",
int(nx), d, k, nredo, niter);
idx_t * assign = new idx_t[nx];
ScopeDeleter<idx_t> del (assign);
float * dis = new float[nx];
......@@ -146,7 +157,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n",
(getmillisecs() - t0)/1000.);
(getmillisecs() - t0) / 1000.);
}
t0 = getmillisecs();
......@@ -156,7 +167,6 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
// initialize remaining centroids with random points from the dataset
centroids.resize (d * k);
std::vector<int> perm (nx);
......@@ -166,9 +176,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
memcpy (&centroids[i * d], x + perm[i] * d,
d * sizeof (float));
if (spherical) {
fvec_renorm_L2 (d, k, centroids.data());
}
post_process_centroids ();
if (index.ntotal != 0) {
index.reset();
......@@ -183,6 +191,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
index.search (nx, x, 1, dis, assign);
InterruptCallback::check();
t_search_tot += getmillisecs() - t0s;
err = 0;
......@@ -204,8 +213,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
fflush (stdout);
}
if (spherical)
fvec_renorm_L2 (d, k, centroids.data());
post_process_centroids ();
index.reset ();
if (update_index)
......@@ -213,6 +221,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
assert (index.ntotal == 0);
index.add (k, centroids.data());
InterruptCallback::check ();
}
if (verbose) printf("\n");
if (nredo > 1) {
......
......@@ -26,6 +26,7 @@ struct ClusteringParameters {
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool int_centroids; ///< round centroids coordinates to integer
bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
......@@ -72,6 +73,10 @@ struct Clustering: ClusteringParameters {
/// Index is used during the assignment stage
virtual void train (idx_t n, const float * x, faiss::Index & index);
/// Post-process the centroids after each centroid update.
/// includes optional L2 normalization and nearest integer rounding
void post_process_centroids ();
virtual ~Clustering() {}
};
......
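A minimal sketch of the new rounding option (the integer_kmeans helper is hypothetical; IndexFlatL2 serves as the assignment index):

#include "Clustering.h"
#include "IndexFlat.h"

// k-means whose centroid coordinates are rounded after every update
void integer_kmeans (size_t n, int d, int k, const float *x) {
    faiss::Clustering clus (d, k);
    clus.int_centroids = true;         // applied by post_process_centroids()
    faiss::IndexFlatL2 assigner (d);   // used for the assignment stage
    clus.train (n, x, assigner);
    // clus.centroids now holds k * d integer-valued floats
}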
......@@ -9,12 +9,11 @@
// -*- c++ -*-
#include "HNSW.h"
#include "AuxIndexStructures.h"
namespace faiss {
using idx_t = Index::idx_t;
using DistanceComputer = HNSW::DistanceComputer;
/**************************************************************
* HNSW structure implementation
......@@ -544,12 +543,24 @@ int HNSW::search_from_candidates(
vt.set(v1);
}
bool do_dis_check = check_relative_distance;
int nstep = 0;
while (candidates.size() > 0) {
float d0 = 0;
int v0 = candidates.pop_min(&d0);
if (do_dis_check) {
// tricky stopping condition: more than ef distances
// that were already processed are smaller than d0
int n_dis_below = candidates.count_below(d0);
if(n_dis_below >= efSearch) {
break;
}
}
size_t begin, end;
neighbor_range(v0, level, &begin, &end);
......@@ -572,7 +583,7 @@ int HNSW::search_from_candidates(
}
nstep++;
if (nstep > efSearch) {
if (!do_dis_check && nstep > efSearch) {
break;
}
}
......@@ -596,38 +607,31 @@ int HNSW::search_from_candidates(
* Searching
**************************************************************/
template<typename T>
using MaxHeap = std::priority_queue<T, std::vector<T>, std::less<T>>;
template<typename T>
using MinHeap = std::priority_queue<T, std::vector<T>, std::greater<T>>;
MaxHeap<HNSW::Node> HNSW::search_from(
std::priority_queue<HNSW::Node> HNSW::search_from_candidate_unbounded(
const Node& node,
DistanceComputer& qdis,
int ef,
VisitedTable *vt) const
{
MaxHeap<Node> top_candidates;
MinHeap<Node> candidate_set;
int ndis = 0;
std::priority_queue<Node> top_candidates;
std::priority_queue<Node, std::vector<Node>, std::greater<Node>> candidates;
top_candidates.push(node);
candidate_set.push(node);
candidates.push(node);
vt->set(node.second);
float lower_bound = node.first;
while (!candidate_set.empty()) {
while (!candidates.empty()) {
float d0;
storage_idx_t v0;
std::tie(d0, v0) = candidate_set.top();
std::tie(d0, v0) = candidates.top();
if (d0 > lower_bound) {
if (d0 > top_candidates.top().first) {
break;
}
candidate_set.pop();
candidates.pop();
size_t begin, end;
neighbor_range(v0, 0, &begin, &end);
......@@ -645,20 +649,28 @@ MaxHeap<HNSW::Node> HNSW::search_from(
vt->set(v1);
float d1 = qdis(v1);
++ndis;
if (top_candidates.top().first > d1 || top_candidates.size() < ef) {
candidate_set.emplace(d1, v1);
candidates.emplace(d1, v1);
top_candidates.emplace(d1, v1);
if (top_candidates.size() > ef) {
top_candidates.pop();
}
lower_bound = top_candidates.top().first;
}
}
}
#pragma omp critical
{
++hnsw_stats.n1;
if (candidates.size() == 0) {
++hnsw_stats.n2;
}
hnsw_stats.n3 += ndis;
}
return top_candidates;
}
......@@ -677,32 +689,34 @@ void HNSW::search(DistanceComputer& qdis, int k,
}
int ef = std::max(efSearch, k);
MaxHeap<Node> top_candidates = search_from(Node(d_nearest, nearest), qdis, ef, &vt);
while (top_candidates.size() > k) {
top_candidates.pop();
}
if (search_bounded_queue) {
MinimaxHeap candidates(ef);
int nres = 0;
while (!top_candidates.empty()) {
float d;
storage_idx_t label;
std::tie(d, label) = top_candidates.top();
faiss::maxheap_push(++nres, D, I, d, label);
top_candidates.pop();
}
candidates.push(nearest, d_nearest);
// MinimaxHeap candidates(candidates_size);
search_from_candidates(qdis, k, I, D, candidates, vt, 0);
} else {
std::priority_queue<Node> top_candidates =
search_from_candidate_unbounded(Node(d_nearest, nearest),
qdis, ef, &vt);
// top_candidates.emplace(d_nearest, nearest);
while (top_candidates.size() > k) {
top_candidates.pop();
}
// search_from_candidates(qdis, k, I, D, candidates, vt, 0);
int nres = 0;
while (!top_candidates.empty()) {
float d;
storage_idx_t label;
std::tie(d, label) = top_candidates.top();
faiss::maxheap_push(++nres, D, I, d, label);
top_candidates.pop();
}
}
// NOTE(hoss): Init at the beginning?
vt.advance();
} else {
assert(false);
int candidates_size = upper_beam;
MinimaxHeap candidates(candidates_size);
......@@ -742,44 +756,47 @@ void HNSW::MinimaxHeap::push(storage_idx_t i, float v) {
if (k == n) {
if (v >= dis[0]) return;
faiss::heap_pop<HC> (k--, dis.data(), ids.data());
--nvalid;
}
faiss::heap_push<HC> (++k, dis.data(), ids.data(), v, i);
++nvalid;
}
float HNSW::MinimaxHeap::max() const {
assert(k > 0);
return dis[0];
}
int HNSW::MinimaxHeap::size() const {
return k;
return nvalid;
}
void HNSW::MinimaxHeap::clear() {
k = 0;
nvalid = k = 0;
}
int HNSW::MinimaxHeap::pop_min(float *vmin_out) {
assert(k > 0);
// returns min. This is an O(n) operation
int i = k - 1;
while (i >= 0) {
if (ids[i] != -1) break;
i--;
}
if (i == -1) return -1;
int imin = i;
float vmin = dis[i];
i--;
while(i >= 0) {
if (dis[i] < vmin) {
if (ids[i] != -1 && dis[i] < vmin) {
vmin = dis[i];
imin = i;
}
i--;
}
assert(2 * i > k);
if (vmin_out) *vmin_out = vmin;
int ret = ids[imin];
--k;
faiss::heap_push<HC>(++imin, dis.data(), ids.data(), ids[k], dis[k]);
ids[imin] = -1;
--nvalid;
return ret;
}
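// (pop_min lazily marks removed slots with id -1 instead of compacting the
// heap; nvalid, not k, counts live entries, which is what size() reports.)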
......
......@@ -37,12 +37,12 @@ namespace faiss {
* (https://github.com/searchivarius/nmslib)
*
* The HNSW object stores only the neighbor link structure, see
* IndexHNSW below for the full index object.
* IndexHNSW.h for the full index object.
*/
struct VisitedTable;
struct DistanceComputer; // from AuxIndexStructures
struct HNSW {
/// internal storage of vectors (32 bits: this is expensive)
......@@ -53,37 +53,18 @@ struct HNSW {
typedef std::pair<float, storage_idx_t> Node;
/** The HNSW structure does not store vectors, it only accesses
* them through this class.
*
* Functions are guaranteed to be accessed only from 1 thread. */
struct DistanceComputer {
idx_t d;
/// called before computing distances
virtual void set_query(const float *x) = 0;
/// compute distance of vector i to current query
virtual float operator () (storage_idx_t i) = 0;
/// compute distance between two stored vectors
virtual float symmetric_dis(storage_idx_t i, storage_idx_t j) = 0;
virtual ~DistanceComputer() {}
};
/** Heap structure that allows fast
*/
struct MinimaxHeap {
int n;
int k;
int nvalid;
std::vector<storage_idx_t> ids;
std::vector<float> dis;
typedef faiss::CMax<float, storage_idx_t> HC;
explicit MinimaxHeap(int n): n(n), k(0), ids(n), dis(n) {}
explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
void push(storage_idx_t i, float v);
......@@ -147,9 +128,15 @@ struct HNSW {
/// expansion factor at search time
int efSearch;
/// during search: do we check whether the next best distance is good enough?
bool check_relative_distance = true;
/// number of entry points in levels > 0.
int upper_beam;
/// use bounded queue during exploration
bool search_bounded_queue = true;
// methods that initialize the tree sizes
/// initialize the assign_probas and cum_nneighbor_per_level to
......@@ -201,10 +188,12 @@ struct HNSW {
VisitedTable &vt,
int level, int nres_in = 0) const;
std::priority_queue<Node> search_from(const Node& node,
DistanceComputer& qdis,
int ef,
VisitedTable *vt) const;
std::priority_queue<Node> search_from_candidate_unbounded(
const Node& node,
DistanceComputer& qdis,
int ef,
VisitedTable *vt
) const;
/// search interface
void search(DistanceComputer& qdis, int k,
......
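A minimal sketch of driving the two new search options (configure_hnsw is hypothetical; it assumes the public hnsw member of IndexHNSWFlat):

#include "IndexHNSW.h"

void configure_hnsw (faiss::IndexHNSWFlat &index) {
    index.hnsw.efSearch = 64;
    index.hnsw.check_relative_distance = false;  // stop on step count, not distance
    index.hnsw.search_bounded_queue = false;     // use search_from_candidate_unbounded
}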
......@@ -234,7 +234,7 @@ void SlidingIndexWindow::step(const Index *sub_index, bool remove_oldest) {
for (int j = 0; j + 1 < n_slice; j++) {
sizes[i][j] = sizes[i][j + 1] - amount_to_remove;
}
sizes[i].resize(sizes[i].size() - 1);
sizes[i].pop_back ();
}
n_slice--;
} else {
......
......@@ -60,8 +60,9 @@ struct RangeSearchResult;
* database-to-database queries are not implemented.
*/
struct Index {
typedef long idx_t; ///< all indices are this type
using idx_t = long; ///< all indices are this type
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< total nb of indexed vectors
......
......@@ -35,7 +35,9 @@ struct RangeSearchResult;
* vectors.
*/
struct IndexBinary {
typedef long idx_t; ///< all indices are this type
using idx_t = Index::idx_t; ///< all indices are this type
using component_t = uint8_t;
using distance_t = int32_t;
int d; ///< vector dimension
int code_size; ///< number of bytes per vector ( = d / 8 )
......
......@@ -32,7 +32,7 @@
#include "FaissAssert.h"
#include "IndexBinaryFlat.h"
#include "hamming.h"
#include "AuxIndexStructures.h"
namespace faiss {
......@@ -121,7 +121,7 @@ void hnsw_add_vertices(IndexBinaryHNSW& index_hnsw,
{
VisitedTable vt (ntotal);
std::unique_ptr<HNSW::DistanceComputer> dis(
std::unique_ptr<DistanceComputer> dis(
index_hnsw.get_distance_computer()
);
int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
......@@ -202,7 +202,7 @@ void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k,
#pragma omp parallel
{
VisitedTable vt(ntotal);
std::unique_ptr<HNSW::DistanceComputer> dis(get_distance_computer());
std::unique_ptr<DistanceComputer> dis(get_distance_computer());
#pragma omp for
for(idx_t i = 0; i < n; i++) {
......@@ -252,18 +252,18 @@ namespace {
template<class HammingComputer>
struct FlatHammingDis : HNSW::DistanceComputer {
struct FlatHammingDis : DistanceComputer {
const int code_size;
const uint8_t *b;
size_t ndis;
HammingComputer hc;
float operator () (HNSW::storage_idx_t i) override {
float operator () (idx_t i) override {
ndis++;
return hc.hamming(b + i * code_size);
}
float symmetric_dis(HNSW::storage_idx_t i, HNSW::storage_idx_t j) override {
float symmetric_dis(idx_t i, idx_t j) override {
return HammingComputerDefault(b + j * code_size, code_size)
.hamming(b + i * code_size);
}
......@@ -281,7 +281,7 @@ struct FlatHammingDis : HNSW::DistanceComputer {
hc.set((uint8_t *)x, code_size);
}
virtual ~FlatHammingDis() {
~FlatHammingDis() override {
#pragma omp critical
{
hnsw_stats.ndis += ndis;
......@@ -293,7 +293,7 @@ struct FlatHammingDis : HNSW::DistanceComputer {
} // namespace
HNSW::DistanceComputer *IndexBinaryHNSW::get_distance_computer() const {
DistanceComputer *IndexBinaryHNSW::get_distance_computer() const {
IndexBinaryFlat *flat_storage = dynamic_cast<IndexBinaryFlat *>(storage);
FAISS_ASSERT(flat_storage != nullptr);
......
......@@ -37,7 +37,7 @@ struct IndexBinaryHNSW : IndexBinary {
~IndexBinaryHNSW() override;
HNSW::DistanceComputer *get_distance_computer() const;
DistanceComputer *get_distance_computer() const;
void add(idx_t n, const uint8_t *x) override;
......
......@@ -252,39 +252,42 @@ long IndexBinaryIVF::remove_ids(const IDSelector& sel) {
}
void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
if (verbose)
printf("Training level-1 quantizer\n");
train_q1(n, x, verbose);
if (verbose) {
printf("Training quantizer\n");
}
is_trained = true;
}
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose) {
printf("IVF quantizer does not need training.\n");
}
} else {
if (verbose) {
printf("Training quantizer on %ld vectors in %dD\n", n, d);
}
double IndexBinaryIVF::imbalance_factor () const {
std::vector<int> hist(nlist);
Clustering clus(d, nlist, cp);
quantizer->reset();
for (int i = 0; i < nlist; i++) {
hist[i] = invlists->list_size(i);
}
std::unique_ptr<float[]> x_f(new float[n * d]);
binary_to_real(n * d, x, x_f.get());
return faiss::imbalance_factor(nlist, hist.data());
}
IndexFlatL2 index_tmp(d);
void IndexBinaryIVF::print_stats() const {
std::vector<int> sizes(40);
for (int i = 0; i < nlist; i++) {
for (int j = 0; j < sizes.size(); j++) {
if ((invlists->list_size(i) >> j) == 0) {
sizes[j]++;
break;
}
}
}
for (int i = 0; i < sizes.size(); i++) {
if (sizes[i]) {
printf("list size in < %d: %d instances\n", 1 << i, sizes[i]);
if (clustering_index && verbose) {
printf("using clustering_index of dimension %d to do the clustering\n",
clustering_index->d);
}
clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
quantizer->add(clus.k, x_b.get());
quantizer->is_trained = true;
}
is_trained = true;
}
void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
......@@ -315,38 +318,6 @@ void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
}
void IndexBinaryIVF::train_q1(size_t n, const uint8_t *x, bool verbose) {
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf("IVF quantizer does not need training.\n");
} else {
if (verbose)
printf("Training level-1 quantizer on %ld vectors in %dD\n", n, d);
Clustering clus(d, nlist, cp);
quantizer->reset();
std::unique_ptr<float[]> x_f(new float[n * d]);
binary_to_real(n * d, x, x_f.get());
IndexFlatL2 index_tmp(d);
if (clustering_index && verbose) {
printf("using clustering_index of dimension %d to do the clustering\n",
clustering_index->d);
}
clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
quantizer->add(clus.k, x_b.get());
quantizer->is_trained = true;
}
}
namespace {
using idx_t = Index::idx_t;
......
......@@ -58,9 +58,6 @@ struct IndexBinaryIVF : IndexBinary {
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train_q1(size_t n, const uint8_t *x, bool verbose);
/** The Inverted file takes a quantizer (an IndexBinary) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
......@@ -74,10 +71,9 @@ struct IndexBinaryIVF : IndexBinary {
void reset() override;
/// Trains the quantizer and calls train_residual to train sub-quantizers
/// Trains the quantizer
void train(idx_t n, const uint8_t *x) override;
/// Quantizes x and calls add_with_key
void add(idx_t n, const uint8_t *x) override;
void add_with_ids(idx_t n, const uint8_t *x, const long *xids) override;
......@@ -174,12 +170,6 @@ struct IndexBinaryIVF : IndexBinary {
*/
void make_direct_map(bool new_maintain_direct_map=true);
/// 1= perfectly balanced, >1: imbalanced
double imbalance_factor() const;
/// display some stats about the inverted lists
void print_stats() const;
void replace_invlists(InvertedLists *il, bool own=false);
};
......
......@@ -86,7 +86,7 @@ struct IndexHNSW : Index {
~IndexHNSW() override;
// get a DistanceComputer object for this kind of storage
virtual HNSW::DistanceComputer *get_distance_computer() const = 0;
virtual DistanceComputer *get_distance_computer() const = 0;
void add(idx_t n, const float *x) override;
......@@ -138,7 +138,7 @@ struct IndexHNSW : Index {
struct IndexHNSWFlat : IndexHNSW {
IndexHNSWFlat();
IndexHNSWFlat(int d, int M);
HNSW::DistanceComputer *
DistanceComputer *
get_distance_computer() const override;
};
......@@ -149,7 +149,7 @@ struct IndexHNSWPQ : IndexHNSW {
IndexHNSWPQ();
IndexHNSWPQ(int d, int pq_m, int M);
void train(idx_t n, const float* x) override;
HNSW::DistanceComputer *
DistanceComputer *
get_distance_computer() const override;
};
......@@ -159,7 +159,7 @@ struct IndexHNSWPQ : IndexHNSW {
struct IndexHNSWSQ : IndexHNSW {
IndexHNSWSQ();
IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
HNSW::DistanceComputer *
DistanceComputer *
get_distance_computer() const override;
};
......@@ -168,7 +168,7 @@ struct IndexHNSWSQ : IndexHNSW {
struct IndexHNSW2Level : IndexHNSW {
IndexHNSW2Level();
IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M);
HNSW::DistanceComputer *
DistanceComputer *
get_distance_computer() const override;
void flip_to_ivf();
......
......@@ -160,14 +160,15 @@ struct IndexIVF: Index, Level1Quantizer {
) const;
/** assign the vectors, then call search_preassign */
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
void range_search (idx_t n, const float* x, float radius,
RangeSearchResult* result) const override;
/// get a scanner for this index (store_pairs means ignore labels)
virtual InvertedListScanner *get_InvertedListScanner (
bool store_pairs=false) const {
return nullptr;
}
bool store_pairs=false) const;
void reconstruct (idx_t key, float* recons) const override;
......@@ -242,18 +243,14 @@ struct IndexIVF: Index, Level1Quantizer {
*/
void make_direct_map (bool new_maintain_direct_map=true);
/// 1= perfectly balanced, >1: imbalanced
double imbalance_factor () const;
/// display some stats about the inverted lists
void print_stats () const;
/// replace the inverted lists, old one is deallocated if own_invlists
void replace_invlists (InvertedLists *il, bool own=false);
IndexIVF ();
};
class RangeQueryResult;
/** Object that handles a query. The inverted lists to scan are
* provided externally. The object has a lot of state, but
* distance_to_code and scan_codes can be called in multiple
......@@ -271,8 +268,8 @@ struct InvertedListScanner {
/// compute a single query-to-code distance
virtual float distance_to_code (const uint8_t *code) const = 0;
/** compute the distances to codes. (distances, labels) should be
* organized as a min- or max-heap
/** scan a set of codes, compute distances to current query and
* update heap of results if necessary.
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
......@@ -280,6 +277,7 @@ struct InvertedListScanner {
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
* @return number of heap updates performed
*/
virtual size_t scan_codes (size_t n,
const uint8_t *codes,
......@@ -287,6 +285,16 @@ struct InvertedListScanner {
float *distances, idx_t *labels,
size_t k) const = 0;
/** scan a set of codes, compute distances to current query and
* update results if distances are below radius
*
* (default implementation fails) */
virtual void scan_codes_range (size_t n,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult &result) const;
virtual ~InvertedListScanner () {}
};
......
......@@ -137,6 +137,25 @@ struct IVFFlatScanner: InvertedListScanner {
return nup;
}
void scan_codes_range (size_t list_size,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult & res) const override
{
const float *list_vecs = (const float*)codes;
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
float dis = metric == METRIC_INNER_PRODUCT ?
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
if (C::cmp (radius, dis)) {
long id = store_pairs ? (list_no << 32 | j) : ids[j];
res.add (dis, id);
}
}
}
};
......@@ -168,57 +187,6 @@ InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
}
void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
RangeSearchResult *result) const
{
idx_t * keys = new idx_t [nx * nprobe];
ScopeDeleter<idx_t> del (keys);
quantizer->assign (nx, x, keys, nprobe);
#pragma omp parallel
{
RangeSearchPartialResult pres(result);
for (size_t i = 0; i < nx; i++) {
const float * xi = x + i * d;
const long * keysi = keys + i * nprobe;
RangeSearchPartialResult::QueryResult & qres =
pres.new_result (i);
for (size_t ik = 0; ik < nprobe; ik++) {
long key = keysi[ik]; /* select the list */
if (key < 0 || key >= (long) nlist) {
fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, nlist);
throw;
}
const size_t list_size = invlists->list_size(key);
InvertedLists::ScopedCodes scodes (invlists, key);
const float * list_vecs = (const float*)scodes.get();
InvertedLists::ScopedIds ids (invlists, key);
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
if (metric_type == METRIC_L2) {
float disij = fvec_L2sqr (xi, yj, d);
if (disij < radius) {
qres.add (disij, ids[j]);
}
} else if (metric_type == METRIC_INNER_PRODUCT) {
float disij = fvec_inner_product(xi, yj, d);
if (disij > radius) {
qres.add (disij, ids[j]);
}
}
}
}
}
pres.finalize ();
}
}
void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
{
......@@ -272,18 +240,6 @@ IndexIVFFlatDedup::IndexIVFFlatDedup (
IndexIVFFlat (quantizer, d, nlist_, metric_type)
{}
// from Python's stringobject.c
static uint64_t hash_bytes (const uint8_t *bytes, long n) {
const uint8_t *p = bytes;
uint64_t x = (uint64_t)(*p) << 7;
long len = n;
while (--len >= 0) {
x = (1000003*x) ^ *p++;
}
x ^= n;
return x;
}
void IndexIVFFlatDedup::train(idx_t n, const float* x)
{
......
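A minimal sketch of the range-search path exercised by the scanner above, through the generic IndexIVF::range_search entry point (range_demo and its arguments are hypothetical; for L2 the radius is a squared distance):

#include "IndexFlat.h"
#include "IndexIVFFlat.h"
#include "AuxIndexStructures.h"

void range_demo (int d, size_t nlist, size_t n, const float *xb,
                 size_t nq, const float *xq, float radius) {
    faiss::IndexFlatL2 quantizer (d);
    faiss::IndexIVFFlat index (&quantizer, d, nlist);
    index.train (n, xb);
    index.add (n, xb);
    faiss::RangeSearchResult res (nq);
    index.range_search (nq, xq, radius, &res);
    // results for query i: res.labels[res.lims[i]] .. res.labels[res.lims[i + 1] - 1]
}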
......@@ -39,24 +39,10 @@ struct IndexIVFFlat: IndexIVF {
const idx_t *list_nos,
uint8_t * codes) const override;
/*
void search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *assign,
const float *centroid_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params=nullptr
) const override;
*/
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
/** Update a subset of vectors.
*
* The index must have a direct_map
......
......@@ -796,7 +796,7 @@ struct MinSumK {
// enqueue followers
long ii = ti;
for (int m = 0; m < M; m++) {
long n = ii & ((1 << nbit) - 1);
long n = ii & ((1L << nbit) - 1);
ii >>= nbit;
if (n + 1 >= N) continue;
......@@ -819,8 +819,8 @@ struct MinSumK {
}
long ti = 0;
for (int m = 0; m < M; m++) {
long n = ii & ((1 << nbit) - 1);
ti += ssx[m].get_ord(n) << (nbit * m);
long n = ii & ((1L << nbit) - 1);
ti += long(ssx[m].get_ord(n)) << (nbit * m);
ii >>= nbit;
}
terms[k] = ti;
......
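The 1L literals and the long cast above widen the mask and shift arithmetic to 64 bits: with plain 32-bit int operands, get_ord(n) << (nbit * m) overflows as soon as the result needs more than 31 bits, which the 16-bit sub-indices of the new IMI2x16 support would trigger.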
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "IndexReplicas.h"
#include "FaissAssert.h"
namespace faiss {
template<class IndexClass>
IndexReplicasTemplate<IndexClass>::IndexReplicasTemplate()
: own_fields(false) {
}
template<class IndexClass>
IndexReplicasTemplate<IndexClass>::~IndexReplicasTemplate() {
if (own_fields) {
for (auto& index : this->indices_)
delete index.first;
}
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::addIndex(IndexClass* index) {
// Make sure that the parameters are the same for all prior indices
if (!indices_.empty()) {
auto& existing = indices_.front().first;
FAISS_THROW_IF_NOT_FMT(index->d == existing->d,
"IndexReplicas::addIndex: dimension mismatch for "
"newly added index; prior index has dim %d, "
"new index has %d",
existing->d, index->d);
FAISS_THROW_IF_NOT_FMT(index->ntotal == existing->ntotal,
"IndexReplicas::addIndex: newly added index does "
"not have same number of vectors as prior index; "
"prior index has %ld vectors, new index has %ld",
existing->ntotal, index->ntotal);
FAISS_THROW_IF_NOT_MSG(index->metric_type == existing->metric_type,
"IndexReplicas::addIndex: newly added index is "
"of different metric type than old index");
} else {
// Set our parameters
// FIXME: this is a little bit weird
this->d = index->d;
this->ntotal = index->ntotal;
this->verbose = index->verbose;
this->is_trained = index->is_trained;
this->metric_type = index->metric_type;
}
this->indices_.emplace_back(
std::make_pair(index,
std::unique_ptr<WorkerThread>(new WorkerThread)));
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::removeIndex(IndexClass* index) {
for (auto it = this->indices_.begin(); it != indices_.end(); ++it) {
if (it->first == index) {
// This is our index; stop the worker thread before removing it,
// to ensure that it has finished before function exit
it->second->stop();
it->second->waitForThreadExit();
this->indices_.erase(it);
return;
}
}
// could not find our index
FAISS_THROW_MSG("IndexReplicas::removeIndex: index not found");
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::runOnIndex(std::function<void(IndexClass*)> f) {
FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
std::vector<std::future<bool>> v;
for (auto& index : this->indices_) {
auto indexPtr = index.first;
v.emplace_back(index.second->add([indexPtr, f](){ f(indexPtr); }));
}
// Blocking wait for completion
for (auto& func : v) {
func.get();
}
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::reset() {
runOnIndex([](IndexClass* index){ index->reset(); });
this->ntotal = 0;
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::train(idx_t n, const component_t* x) {
runOnIndex([n, x](IndexClass* index){ index->train(n, x); });
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::add(idx_t n, const component_t* x) {
runOnIndex([n, x](IndexClass* index){ index->add(n, x); });
this->ntotal += n;
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::reconstruct(idx_t n, component_t* x) const {
FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
indices_[0].first->reconstruct (n, x);
}
template<class IndexClass>
void IndexReplicasTemplate<IndexClass>::search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
if (n == 0) {
return;
}
auto dim = indices_.front().first->d;
std::vector<std::future<bool>> v;
// Partition the query by the number of indices we have
auto queriesPerIndex =
(faiss::Index::idx_t) (n + indices_.size() - 1) / indices_.size();
FAISS_ASSERT(n / queriesPerIndex <= indices_.size());
for (faiss::Index::idx_t i = 0; i < indices_.size(); ++i) {
auto base = i * queriesPerIndex;
if (base >= n) {
break;
}
auto numForIndex = std::min(queriesPerIndex, n - base);
size_t components_per_vec = sizeof(component_t) == 1 ? (dim + 7) / 8 : dim;
auto queryStart = x + base * components_per_vec;
auto distancesStart = distances + base * k;
auto labelsStart = labels + base * k;
auto indexPtr = indices_[i].first;
auto fn =
[indexPtr, numForIndex, queryStart, k, distancesStart, labelsStart]() {
indexPtr->search(numForIndex, queryStart,
k, distancesStart, labelsStart);
};
v.emplace_back(indices_[i].second->add(std::move(fn)));
}
// Blocking wait for completion
for (auto& f : v) {
f.get();
}
}
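// e.g. n = 10 queries over 3 replicas: queriesPerIndex = ceil(10 / 3) = 4,
// so the replicas receive slices of 4, 4 and 2 queries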
// explicit instantiations
template struct IndexReplicasTemplate<Index>;
template struct IndexReplicasTemplate<IndexBinary>;
} // namespace
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include "Index.h"
#include "IndexBinary.h"
#include "WorkerThread.h"
#include <memory>
#include <vector>
namespace faiss {
/// Takes individual faiss::Index instances, and splits queries for
/// sending to each Index instance, and joins the results together
/// when done.
/// Each index is managed by a separate CPU thread.
template<class IndexClass>
class IndexReplicasTemplate : public IndexClass {
public:
using idx_t = typename IndexClass::idx_t;
using component_t = typename IndexClass::component_t;
using distance_t = typename IndexClass::distance_t;
IndexReplicasTemplate();
~IndexReplicasTemplate() override;
/// Adds an index that is managed by ourselves.
/// WARNING: once an index is added to this proxy, it becomes unsafe
/// to touch it from any other thread than that on which is managing
/// it, until we are shut down. Use runOnIndex to perform work on it
/// instead.
void addIndex(IndexClass* index);
/// Remove an index that is managed by ourselves.
/// This will flush all pending work on that index, and then shut
/// down its managing thread, and will remove the index.
void removeIndex(IndexClass* index);
/// Run a function on all indices, in the thread that the index is
/// managed in.
void runOnIndex(std::function<void(IndexClass*)> f);
/// faiss::Index API
/// All indices receive the same call
void reset() override;
/// faiss::Index API
/// All indices receive the same call
virtual void train(idx_t n, const component_t* x) override;
/// faiss::Index API
/// All indices receive the same call
virtual void add(idx_t n, const component_t* x) override;
/// faiss::Index API
/// Query is partitioned into a slice for each sub-index
/// split by ceil(n / #indices) for our sub-indices
virtual void search(idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
/// reconstructs from the first index
virtual void reconstruct(idx_t, component_t *v) const override;
bool own_fields;
int count() const {return indices_.size(); }
IndexClass* at(int i) {return indices_[i].first; }
const IndexClass* at(int i) const {return indices_[i].first; }
private:
/// Collection of Index instances, with their managing worker thread
mutable std::vector<std::pair<IndexClass*,
std::unique_ptr<WorkerThread> > > indices_;
};
using IndexReplicas = IndexReplicasTemplate<Index>;
using IndexBinaryReplicas = IndexReplicasTemplate<IndexBinary>;
} // namespace
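A minimal usage sketch (replica_search is hypothetical; r1 and r2 are assumed to hold identical data):

#include "IndexReplicas.h"

void replica_search (faiss::Index *r1, faiss::Index *r2,
                     long nq, const float *xq, long k,
                     float *distances, long *labels) {
    faiss::IndexReplicas repl;
    repl.addIndex (r1);
    repl.addIndex (r2);
    // each replica serves a slice of the queries on its own worker thread
    repl.search (nq, xq, k, distances, labels);
}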
......@@ -28,6 +28,7 @@ namespace faiss {
* (default).
*/
struct SQDistanceComputer;
struct ScalarQuantizer {
......@@ -37,6 +38,7 @@ struct ScalarQuantizer {
QT_8bit_uniform, ///< same, shared range for all dimensions
QT_4bit_uniform,
QT_fp16,
QT_8bit_direct, ///< fast indexing of uint8s
};
QuantizerType qtype;
......@@ -79,25 +81,13 @@ struct ScalarQuantizer {
/// decode a vector from a given code (or n vectors if third argument)
void decode (const uint8_t *code, float *x, size_t n) const;
// fast, non thread-safe way of computing vector-to-code and
// code-to-code distances.
struct DistanceComputer {
/// vector-to-code distance computation
virtual float compute_distance (const float *x,
const uint8_t *code) const = 0;
/// code-to-code distance computation
virtual float compute_code_distance (const uint8_t *code1,
const uint8_t *code2) const = 0;
virtual ~DistanceComputer () {}
};
DistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
SQDistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
const;
};
struct DistanceComputer;
struct IndexScalarQuantizer: Index {
/// Used to encode the vectors
......@@ -137,6 +127,8 @@ struct IndexScalarQuantizer: Index {
void reconstruct(idx_t key, float* recons) const override;
DistanceComputer *get_distance_computer () const;
};
......@@ -148,6 +140,7 @@ struct IndexScalarQuantizer: Index {
struct IndexIVFScalarQuantizer: IndexIVF {
ScalarQuantizer sq;
bool by_residual;
IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
ScalarQuantizer::QuantizerType qtype,
......
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexShards.h"
#include <cstdio>
#include <functional>
#include "FaissAssert.h"
#include "Heap.h"
#include "WorkerThread.h"
namespace faiss {
// subroutines
namespace {
typedef Index::idx_t idx_t;
// add translation to all valid labels
void translate_labels (long n, idx_t *labels, long translation)
{
if (translation == 0) return;
for (long i = 0; i < n; i++) {
if(labels[i] < 0) continue;
labels[i] += translation;
}
}
/** merge result tables from several shards.
* @param all_distances size nshard * n * k
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class IndexClass, class C>
void merge_tables (long n, long k, long nshard,
typename IndexClass::distance_t *distances,
idx_t *labels,
const typename IndexClass::distance_t *all_distances,
idx_t *all_labels,
const long *translations)
{
if(k == 0) {
return;
}
using distance_t = typename IndexClass::distance_t;
long stride = n * k;
#pragma omp parallel
{
std::vector<int> buf (2 * nshard);
int * pointer = buf.data();
int * shard_ids = pointer + nshard;
std::vector<distance_t> buf2 (nshard);
distance_t * heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const distance_t *D_in = all_distances + i * k;
const idx_t *I_in = all_labels + i * k;
int heap_size = 0;
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s], s);
}
distance_t *D = distances + i * k;
idx_t *I = labels + i * k;
for (int j = 0; j < k; j++) {
if (heap_size == 0) {
I[j] = -1;
D[j] = C::neutral();
} else {
// pop best element
int s = shard_ids[0];
int & p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
heap_pop<C> (heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s + p], s);
}
}
}
}
}
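// The heap always holds the current head of each shard's result stream, so the
// loop above is a k-way merge: pop the best head, advance that shard's pointer
// p, and re-push its next candidate.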
template<class IndexClass>
void runOnIndexes(bool threaded,
std::function<void(int no, IndexClass*)> f,
std::vector<IndexClass *> indexes)
{
FAISS_THROW_IF_NOT_MSG(!indexes.empty(), "no shards in index");
if (!threaded) {
for (int no = 0; no < indexes.size(); no++) {
IndexClass *index = indexes[no];
f(no, index);
}
} else {
std::vector<std::unique_ptr<WorkerThread> > threads;
std::vector<std::future<bool>> v;
for (int no = 0; no < indexes.size(); no++) {
IndexClass *index = indexes[no];
threads.emplace_back(new WorkerThread());
WorkerThread *wt = threads.back().get();
v.emplace_back(wt->add([no, index, f](){ f(no, index); }));
}
// Blocking wait for completion
for (auto& func : v) {
func.get();
}
}
};
} // anonymous namespace
template<class IndexClass>
IndexShardsTemplate<IndexClass>::IndexShardsTemplate (idx_t d, bool threaded, bool successive_ids):
IndexClass (d), own_fields (false),
threaded (threaded), successive_ids (successive_ids)
{
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::add_shard (IndexClass *idx)
{
shard_indexes.push_back (idx);
sync_with_shard_indexes ();
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::sync_with_shard_indexes ()
{
if (shard_indexes.empty()) return;
IndexClass * index0 = shard_indexes[0];
this->d = index0->d;
this->metric_type = index0->metric_type;
this->is_trained = index0->is_trained;
this->ntotal = index0->ntotal;
for (int i = 1; i < shard_indexes.size(); i++) {
IndexClass * index = shard_indexes[i];
FAISS_THROW_IF_NOT (this->metric_type == index->metric_type);
FAISS_THROW_IF_NOT (this->d == index->d);
this->ntotal += index->ntotal;
}
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::train (idx_t n, const component_t *x)
{
auto train_func = [n, x](int no, IndexClass *index)
{
if (index->verbose)
printf ("begin train shard %d on %ld points\n", no, n);
index->train(n, x);
if (index->verbose)
printf ("end train shard %d\n", no);
};
runOnIndexes<IndexClass> (threaded, train_func, shard_indexes);
sync_with_shard_indexes ();
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::add (idx_t n, const component_t *x)
{
add_with_ids (n, x, nullptr);
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::add_with_ids (idx_t n, const component_t * x, const idx_t *xids)
{
FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
"It makes no sense to pass in ids and "
"request them to be shifted");
if (successive_ids) {
FAISS_THROW_IF_NOT_MSG(!xids,
"It makes no sense to pass in ids and "
"request them to be shifted");
FAISS_THROW_IF_NOT_MSG(this->ntotal == 0,
"when adding to IndexShards with sucessive_ids, "
"only add() in a single pass is supported");
}
long nshard = shard_indexes.size();
const idx_t *ids = xids;
ScopeDeleter<idx_t> del;
if (!ids && !successive_ids) {
idx_t *aids = new idx_t[n];
for (idx_t i = 0; i < n; i++)
aids[i] = this->ntotal + i;
ids = aids;
del.set (ids);
}
size_t components_per_vec =
sizeof(component_t) == 1 ? (this->d + 7) / 8 : this->d;
auto add_func = [n, ids, x, nshard, components_per_vec]
(int no, IndexClass *index) {
idx_t i0 = no * n / nshard;
idx_t i1 = (no + 1) * n / nshard;
auto x0 = x + i0 * components_per_vec;
if (index->verbose) {
printf ("begin add shard %d on %ld points\n", no, n);
}
if (ids) {
index->add_with_ids (i1 - i0, x0, ids + i0);
} else {
index->add (i1 - i0, x0);
}
if (index->verbose) {
printf ("end add shard %d on %ld points\n", no, i1 - i0);
}
};
runOnIndexes<IndexClass> (threaded, add_func, shard_indexes);
this->ntotal += n;
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::reset ()
{
for (int i = 0; i < shard_indexes.size(); i++) {
shard_indexes[i]->reset ();
}
sync_with_shard_indexes ();
}
template<class IndexClass>
void IndexShardsTemplate<IndexClass>::search (
idx_t n, const component_t *x, idx_t k,
distance_t *distances, idx_t *labels) const
{
long nshard = shard_indexes.size();
distance_t *all_distances = new distance_t [nshard * k * n];
idx_t *all_labels = new idx_t [nshard * k * n];
ScopeDeleter<distance_t> del (all_distances);
ScopeDeleter<idx_t> del2 (all_labels);
auto query_func = [n, k, x, all_distances, all_labels]
(int no, IndexClass *index) {
if (index->verbose) {
printf ("begin query shard %d on %ld points\n", no, n);
}
index->search (n, x, k,
all_distances + no * k * n,
all_labels + no * k * n);
if (index->verbose) {
printf ("end query shard %d\n", no);
}
};
runOnIndexes<IndexClass> (threaded, query_func, shard_indexes);
std::vector<long> translations (nshard, 0);
if (successive_ids) {
translations[0] = 0;
for (int s = 0; s + 1 < nshard; s++)
translations [s + 1] = translations [s] +
shard_indexes [s]->ntotal;
}
if (this->metric_type == METRIC_L2) {
merge_tables<IndexClass, CMin<distance_t, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
} else {
merge_tables<IndexClass, CMax<distance_t, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
}
}
template<class IndexClass>
IndexShardsTemplate<IndexClass>::~IndexShardsTemplate ()
{
if (own_fields) {
for (int s = 0; s < shard_indexes.size(); s++)
delete shard_indexes [s];
}
}
// explicit instantiations
template struct IndexShardsTemplate<Index>;
template struct IndexShardsTemplate<IndexBinary>;
} // namespace faiss
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include "Index.h"
#include "IndexBinary.h"
namespace faiss {
/** Index that concatenates the results from several sub-indexes
*
*/
template<class IndexClass>
struct IndexShardsTemplate : IndexClass {
using idx_t = typename IndexClass::idx_t;
using component_t = typename IndexClass::component_t;
using distance_t = typename IndexClass::distance_t;
std::vector<IndexClass*> shard_indexes;
bool own_fields; /// should the sub-indexes be deleted along with this?
bool threaded;
bool successive_ids;
/**
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShardsTemplate (idx_t d, bool threaded = false,
bool successive_ids = true);
void add_shard (IndexClass *);
// update metric_type and ntotal. Call this if you change something in
// the shard indexes.
void sync_with_shard_indexes ();
IndexClass *at(int i) {return shard_indexes[i]; }
/// supported only for sub-indices that implement add_with_ids
void add(idx_t n, const component_t* x) override;
/**
* Cases (successive_ids, xids):
* - true, non-NULL ERROR: it makes no sense to pass in ids and
* request them to be shifted
* - true, NULL OK, but should be called only once (calls add()
* on sub-indexes).
* - false, non-NULL OK: will call add_with_ids with passed in xids
* distributed evenly over shards
* - false, NULL OK: will call add_with_ids on each sub-index,
* starting at ntotal
*/
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
void search(
idx_t n, const component_t* x, idx_t k,
distance_t* distances, idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
void reset() override;
~IndexShardsTemplate() override;
};
using IndexShards = IndexShardsTemplate<Index>;
using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
} // namespace faiss
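A minimal usage sketch (shard_demo is hypothetical; it splits one dataset over two flat shards):

#include "IndexFlat.h"
#include "IndexShards.h"

void shard_demo (int d, size_t n, const float *xb,
                 size_t nq, const float *xq, long k, float *D, long *I) {
    faiss::IndexShards shards (d, true /* threaded */, true /* successive_ids */);
    shards.add_shard (new faiss::IndexFlatL2 (d));
    shards.add_shard (new faiss::IndexFlatL2 (d));
    shards.own_fields = true;  // sub-indexes are deleted with the wrapper
    shards.add (n, xb);        // vectors are distributed over the shards
    shards.search (nq, xq, k, D, I);
}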
......@@ -97,6 +97,34 @@ void InvertedLists::merge_from (InvertedLists *oivf, size_t add_id) {
}
}
double InvertedLists::imbalance_factor () const {
std::vector<int> hist(nlist);
for (size_t i = 0; i < nlist; i++) {
hist[i] = list_size(i);
}
return faiss::imbalance_factor(nlist, hist.data());
}
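// note: as of this commit, faiss::imbalance_factor (declared in utils.h)
// computes nlist * sum_i hist[i]^2 / (sum_i hist[i])^2, which is 1.0 when
// all lists have the same size and grows as the sizes get more skewed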
void InvertedLists::print_stats () const {
std::vector<int> sizes(40);
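    // log2 histogram: bucket j counts the lists with 2^(j-1) <= size < 2^j
    // (bucket 0 counts empty lists)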
for (size_t i = 0; i < nlist; i++) {
for (size_t j = 0; j < sizes.size(); j++) {
if ((list_size(i) >> j) == 0) {
sizes[j]++;
break;
}
}
}
for (size_t i = 0; i < sizes.size(); i++) {
if (sizes[i]) {
printf("list size in < %d: %d instances\n", 1 << i, sizes[i]);
}
}
}
/*****************************************
* ArrayInvertedLists implementation
******************************************/
......
......@@ -101,6 +101,16 @@ struct InvertedLists {
virtual ~InvertedLists ();
/*************************
* statistics */
/// 1 = perfectly balanced, > 1: imbalanced
double imbalance_factor () const;
/// display some stats about the inverted lists
void print_stats () const;
/**************************************
* Scoped inverted lists (for automatic deallocation)
*
......
......@@ -11,13 +11,10 @@
#ifndef META_INDEXES_H
#define META_INDEXES_H
#include <vector>
#include <unordered_map>
#include "Index.h"
#include "IndexShards.h"
namespace faiss {
......@@ -78,65 +75,6 @@ struct IndexIDMap2 : IndexIDMap {
IndexIDMap2 () {}
};
/** Index that concatenates the results from several sub-indexes
*
*/
struct IndexShards : Index {
std::vector<Index*> shard_indexes;
bool own_fields;      ///< should the sub-indexes be deleted along with this?
bool threaded;
bool successive_ids;
/**
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShards (idx_t d, bool threaded = false,
bool successive_ids = true);
void add_shard (Index *);
// update metric_type and ntotal. Call this if you change something in
// the shard indexes.
void sync_with_shard_indexes ();
Index *at(int i) {return shard_indexes[i]; }
/// supported only for sub-indices that implement add_with_ids
void add(idx_t n, const float* x) override;
/**
* Cases (successive_ids, xids):
* - true, non-NULL ERROR: it makes no sense to pass in ids and
* request them to be shifted
* - true, NULL OK, but should be called only once (calls add()
* on sub-indexes).
* - false, non-NULL OK: will call add_with_ids with passed in xids
* distributed evenly over shards
* - false, NULL OK: will call add_with_ids on each sub-index,
* starting at ntotal
*/
void add_with_ids(idx_t n, const float* x, const long* xids) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void train(idx_t n, const float* x) override;
void reset() override;
~IndexShards() override;
};
/** splits input vectors in segments and assigns each segment to a sub-index
* used to distribute a MultiIndexQuantizer
*/
......
......@@ -379,10 +379,74 @@ void ProductQuantizer::compute_code_from_distance_table (const float *tab,
}
}
void ProductQuantizer::compute_codes_with_assign_index (
const float * x,
uint8_t * codes,
size_t n)
{
FAISS_THROW_IF_NOT (assign_index && assign_index->d == dsub);
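    // encode one subquantizer at a time: the external assign_index is
    // re-filled with this subquantizer's ksub centroids, then queried in
    // blocks of bs subvectors of dimension dsub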
for (size_t m = 0; m < M; m++) {
assign_index->reset ();
assign_index->add (ksub, get_centroids (m, 0));
size_t bs = 65536;
float * xslice = new float[bs * dsub];
ScopeDeleter<float> del (xslice);
idx_t *assign = new idx_t[bs];
ScopeDeleter<idx_t> del2 (assign);
for (size_t i0 = 0; i0 < n; i0 += bs) {
size_t i1 = std::min(i0 + bs, n);
for (size_t i = i0; i < i1; i++) {
memcpy (xslice + (i - i0) * dsub,
x + i * d + m * dsub,
dsub * sizeof(float));
}
assign_index->assign (i1 - i0, xslice, assign);
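        // scatter the assigned centroid ids into the interleaved code layout
        // (stride M), as 1- or 2-byte entries depending on byte_per_idx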
switch (byte_per_idx) {
case 1:
{
uint8_t *c = codes + code_size * i0 + m;
for (size_t i = i0; i < i1; i++) {
*c = assign[i - i0];
c += M;
}
}
break;
case 2:
{
uint16_t *c = (uint16_t*)(codes + code_size * i0 + m * 2);
for (size_t i = i0; i < i1; i++) {
*c = assign[i - i0];
c += M;
}
}
break;
}
}
}
}
void ProductQuantizer::compute_codes (const float * x,
uint8_t * codes,
size_t n) const
{
// process by blocks to avoid using too much RAM
size_t bs = 256 * 1024;
if (n > bs) {
for (size_t i0 = 0; i0 < n; i0 += bs) {
size_t i1 = std::min(i0 + bs, n);
compute_codes (x + d * i0, codes + code_size * i0, i1 - i0);
}
return;
}
if (dsub < 16) { // simple direct computation
#pragma omp parallel for
......@@ -525,15 +589,6 @@ static void pq_knn_search_with_tables (
}
}
/*
static inline void pq_estimators_from_tables (const ProductQuantizer * pq,
const CT * codes,
size_t ncodes,
const float * dis_table,
size_t k,
float * heap_dis,
long * heap_ids)
*/
void ProductQuantizer::search (const float * __restrict x,
size_t nx,
const uint8_t * codes,
......
......@@ -23,6 +23,8 @@ namespace faiss {
/** Product Quantizer. Implemented only for METRIC_L2 */
struct ProductQuantizer {
using idx_t = Index::idx_t;
size_t d; ///< size of the input vectors
size_t M; ///< number of subquantizers
size_t nbits; ///< number of bits per quantization index
......@@ -86,6 +88,13 @@ struct ProductQuantizer {
uint8_t * codes,
size_t n) const ;
/// speed up code assignment using assign_index
/// (non-const because the index is changed)
void compute_codes_with_assign_index (
const float * x,
uint8_t * codes,
size_t n);
/// decode a vector from a given code (or n vectors if third argument)
void decode (const uint8_t *code, float *x) const;
void decode (const uint8_t *code, float *x, size_t n) const;
......
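A possible usage sketch for the new method (illustrative only; it assumes `assign_index` is the public member checked inside `compute_codes_with_assign_index`, and that `pq` has already been trained):

#include "IndexFlat.h"
#include "ProductQuantizer.h"

void encode_with_assign_index (faiss::ProductQuantizer &pq,
                               const float *x, uint8_t *codes, size_t n) {
    // any index over sub-vectors of dimension pq.dsub works; a flat L2
    // index gives exact assignments, while an approximate index trades
    // some accuracy for speed when ksub is large
    faiss::IndexFlatL2 sub_assign (pq.dsub);
    pq.assign_index = &sub_assign;
    pq.compute_codes_with_assign_index (x, codes, n);
    pq.assign_index = nullptr; // avoid keeping a dangling pointer
}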
......@@ -246,6 +246,25 @@ struct NormalizationTransform: VectorTransform {
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
/** Subtract the mean of each component from the vectors. */
struct CenteringTransform: VectorTransform {
/// Mean, size d_in = d_out
std::vector<float> mean;
explicit CenteringTransform (int d = 0);
/// train on n vectors.
void train(Index::idx_t n, const float* x) override;
/// subtract the mean
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// add the mean
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
};
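A minimal usage sketch for CenteringTransform (illustrative, not part of the diff; `d`, `n` and `x` are placeholders for the dimension, the number of vectors and the data):

#include <vector>
#include "VectorTransform.h"

void centering_example (int d, faiss::Index::idx_t n, const float *x) {
    faiss::CenteringTransform center (d);
    center.train (n, x);                       // estimates the per-component mean
    std::vector<float> xt (n * d);
    center.apply_noalloc (n, x, xt.data ());   // xt[i] = x[i] - mean
    std::vector<float> back (n * d);
    center.reverse_transform (n, xt.data (), back.data ()); // adds the mean back
}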
/** Index that applies a LinearTransform transform on vectors before
......@@ -285,6 +304,12 @@ struct IndexPreTransform: Index {
float* distances,
idx_t* labels) const override;
/* range search; no attempt is made to change the radius, which is
   therefore interpreted in the transformed space */
void range_search (idx_t n, const float* x, float radius,
RangeSearchResult* result) const override;
void reconstruct (idx_t key, float * recons) const override;
void reconstruct_n (idx_t i0, idx_t ni, float *recons)
......
......@@ -4,7 +4,7 @@
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
#! /usr/bin/python2
#! /usr/bin/env python2
import os
import numpy as np
......
......@@ -23,14 +23,14 @@ function run_on_1machine () {
# To be implemented
}
function run_on_1machine () {
function run_on_8gpu () {
# To be implemented
}
# prepare output directories
basedir=/mnt/vol/gfsai-east/ai-group/users/matthijs/bench_all_ivf
# set to some directory where all indexes can be written.
basedir=XXXXX
logdir=$basedir/logs
indexdir=$basedir/indexes
......
......@@ -654,7 +654,7 @@ def get_populated_index(preproc):
print "Copy CPU index to %d sharded GPU indexes" % replicas
index = faiss.IndexProxy()
index = faiss.IndexReplicas()
for i in range(replicas):
gpu0 = ngpu * i / replicas
......
......@@ -66,7 +66,7 @@ def train_kmeans(x, k, ngpu):
else:
indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
for i in range(ngpu)]
index = faiss.IndexProxy()
index = faiss.IndexReplicas()
for sub_index in indexes:
index.addIndex(sub_index)
......