Facebook sync (Mar 2019) (#756)

Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct uint8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python

Facebook sync (Mar 2019) (#756)
Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct uint8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python
afe0fdc1 · Lucas Hosseini · GitHub · a9959bf6 · afe0fdc1 · afe0fdc1
Unverified Commit afe0fdc1 authored Mar 29, 2019 by Lucas Hosseini Committed by GitHub Mar 29, 2019
120 changed files
--- a/AutoTune.cpp
+++ b/AutoTune.cpp
@@ -15,6 +15,8 @@
 #include "AutoTune.h"
 #include <cmath>
+#include <stdarg.h>     /* va_list, va_start, va_arg, va_end */
 #include "FaissAssert.h"
 #include "utils.h"
@@ -992,5 +994,235 @@ IndexBinary *index_binary_factory(int d, const char *description)
    return index;
 }
+/*********************************************************************
+ * MatrixStats
+ *********************************************************************/
+/// Start with empty counters. min/max are seeded with +/-HUGE_VALF so that
+/// the first finite value passed to add() replaces them; mean and stddev
+/// stay NaN until compute_mean_std() is called.
+MatrixStats::PerDimStats::PerDimStats():
+    n(0), n_nan(0), n_inf(0), n0(0),
+    min(HUGE_VALF), max(-HUGE_VALF),
+    sum(0), sum2(0),
+    n_valid(0), // previously left uninitialized until compute_mean_std()
+    mean(NAN), stddev(NAN)
+{}
+/// Account for one coordinate value. NaN and infinite values are only
+/// counted; finite values also update the zero count, the range and the
+/// running sums.
+void MatrixStats::PerDimStats::add (float x)
+{
+    ++n;
+    if (std::isnan(x)) {
+        ++n_nan;
+    } else if (!std::isfinite(x)) {
+        ++n_inf;
+    } else {
+        n0 += (x == 0) ? 1 : 0;
+        min = (x < min) ? x : min;
+        max = (x > max) ? x : max;
+        // accumulate in double precision
+        const double xd = x;
+        sum += xd;
+        sum2 += xd * xd;
+    }
+}
+/// Derive n_valid, mean and stddev from the counters accumulated by add().
+void MatrixStats::PerDimStats::compute_mean_std ()
+{
+    n_valid = n - n_nan - n_inf;
+    mean = sum / n_valid;
+    const double variance = sum2 / n_valid - mean * mean;
+    // a negative variance (presumably a rounding artifact) is clamped to 0
+    stddev = sqrt(variance < 0 ? 0.0 : variance);
+}
+/// Append a printf-style formatted message to the comment buffer
+/// [buf, buf + nbuf). A message that does not fit is truncated; the cursor
+/// is then left on the terminating NUL so that later calls add nothing.
+void MatrixStats::do_comment (const char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    int written = vsnprintf(buf, nbuf, fmt, ap);
+    va_end(ap);
+    if (written < 0 || nbuf == 0) {
+        return; // formatting error or no room at all: keep the cursor as is
+    }
+    // vsnprintf returns the length the complete message would have had,
+    // which can exceed what was actually stored. Never advance past the
+    // terminating NUL, otherwise the unsigned nbuf would wrap around and
+    // buf would point outside the buffer.
+    size_t size = (size_t)written;
+    if (size >= nbuf) {
+        size = nbuf - 1;
+    }
+    nbuf -= size;
+    buf += size;
+}
+/// Scan the n vectors of dimension d stored contiguously in x, fill in the
+/// raw statistics and write a human-readable report into `comments`.
+MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
+    n(n), d(d),
+    n_collision(0), n_valid(0), n0(0),
+    min_norm2(HUGE_VAL), max_norm2(0)
+{
+    // scratch buffer that do_comment() appends to through buf / nbuf
+    std::vector<char> comment_buf (10000);
+    buf = comment_buf.data ();
+    nbuf = comment_buf.size();
+    do_comment ("analyzing %zu vectors of size %zu\n", n, d);
+    if (d > 1024) {
+        do_comment (
+           "indexing this many dimensions is hard, "
+           "please consider dimensionality reduction (with PCAMatrix)\n");
+    }
+    size_t nbytes = sizeof (x[0]) * d;
+    per_dim_stats.resize (d);
+    for (size_t i = 0; i < n; i++) {
+        const float *xi = x + d * i;
+        double sum2 = 0;
+        for (size_t j = 0; j < d; j++) {
+            per_dim_stats[j].add (xi[j]);
+            sum2 += xi[j] * (double)xi[j];
+        }
+        // a NaN or Inf component makes the squared norm non-finite
+        if (std::isfinite (sum2)) {
+            n_valid++;
+            if (sum2 == 0) {
+                n0 ++;
+            } else {
+                if (sum2 < min_norm2) min_norm2 = sum2;
+                if (sum2 > max_norm2) max_norm2 = sum2;
+            }
+        }
+        { // check hash
+            uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
+            auto elt = occurrences.find (hash);
+            if (elt == occurrences.end()) {
+                Occurrence occ = {i, 1};
+                occurrences[hash] = occ;
+            } else {
+                // same hash: it is a copy only if the bytes match too
+                if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
+                    elt->second.count ++;
+                } else {
+                    n_collision ++;
+                    // we should use a list of collisions but overkill
+                }
+            }
+        }
+    }
+    // invalid vector stats
+    if (n_valid == n) {
+        do_comment ("no NaN or Infs in data\n");
+    } else {
+        do_comment ("%zu vectors contain NaN or Inf "
+                 "(or have too large components), "
+                 "expect bad results with indexing!\n", n - n_valid);
+    }
+    // copies in dataset
+    if (occurrences.size() == n) {
+        do_comment ("all vectors are distinct\n");
+    } else {
+        do_comment ("%zu vectors are distinct (%.2f%%)\n",
+                 occurrences.size(),
+                 occurrences.size() * 100.0 / n);
+        if (n_collision > 0) {
+            do_comment ("%zu collisions in hash table, "
+                     "counts may be invalid\n", n_collision);
+        }
+        // report the most duplicated vector
+        Occurrence max = {0, 0};
+        for (auto it = occurrences.begin();
+             it != occurrences.end(); ++it) {
+            if (it->second.count > max.count) {
+                max = it->second;
+            }
+        }
+        do_comment ("vector %zu has %zu copies\n", max.first, max.count);
+    }
+    { // norm stats
+        // from here on min_norm2 / max_norm2 hold norms, not squared norms
+        min_norm2 = sqrt (min_norm2);
+        max_norm2 = sqrt (max_norm2);
+        do_comment ("range of L2 norms=[%g, %g] (%zu null vectors)\n",
+                 min_norm2, max_norm2, n0);
+        if (max_norm2 < min_norm2 * 1.0001) {
+            do_comment ("vectors are normalized, inner product and "
+                     "L2  search are equivalent\n");
+        }
+        if (max_norm2 > min_norm2 * 100) {
+            do_comment ("vectors have very large differences in norms, "
+                     "is this normal?\n");
+        }
+    }
+    { // per dimension stats
+        double max_std = 0, min_std = HUGE_VAL;
+        // n_zero_entries counts zero coordinates; it is distinct from the
+        // member n0, which counts null vectors
+        size_t n_dangerous_range = 0, n_0_range = 0, n_zero_entries = 0;
+        for (size_t j = 0; j < d; j++) {
+            PerDimStats &st = per_dim_stats[j];
+            st.compute_mean_std ();
+            n_zero_entries += st.n0;
+            if (st.max == st.min) {
+                n_0_range ++;
+            } else if (st.max < 1.001 * st.min) {
+                n_dangerous_range ++;
+            }
+            if (st.stddev > max_std) max_std = st.stddev;
+            if (st.stddev < min_std) min_std = st.stddev;
+        }
+        if (n_zero_entries == 0) {
+            do_comment ("matrix contains no 0s\n");
+        } else {
+            do_comment ("matrix contains %.2f %% 0 entries\n",
+                     n_zero_entries * 100.0 / (n * d));
+        }
+        if (n_0_range == 0) {
+            do_comment ("no constant dimensions\n");
+        } else {
+            do_comment ("%zu dimensions are constant: they can be removed\n",
+                     n_0_range);
+        }
+        if (n_dangerous_range == 0) {
+            do_comment ("no dimension has a too large mean\n");
+        } else {
+            do_comment ("%zu dimensions are too large "
+                     "wrt. their variance, may lose precision "
+                     "in IndexFlatL2 (use CenteringTransform)\n",
+                     n_dangerous_range);
+        }
+        do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
+        size_t n_small_var = 0;
+        for (size_t j = 0; j < d; j++) {
+            const PerDimStats &st = per_dim_stats[j];
+            if (st.stddev < max_std * 1e-4) {
+                n_small_var++;
+            }
+        }
+        if (n_small_var > 0) {
+            do_comment ("%zu dimensions have negligible stddev wrt. "
+                     "the largest dimension, they could be ignored\n",
+                     n_small_var);
+        }
+    }
+    // copy the report out before the scratch buffer is destroyed, then
+    // detach the cursor from it
+    comments = comment_buf.data ();
+    buf = nullptr;
+    nbuf = 0;
+}
 } // namespace faiss
--- a/AutoTune.h
+++ b/AutoTune.h
@@ -12,6 +12,7 @@
 #define FAISS_AUTO_TUNE_H
 #include <vector>
+#include <unordered_map>
 #include "Index.h"
 #include "IndexBinary.h"
@@ -209,6 +210,50 @@ Index *index_factory (int d, const char *description,
 IndexBinary *index_binary_factory (int d, const char *description);
+/** Reports some statistics on a dataset and comments on them.
+ *
+ * It is a class rather than a function so that all stats can also be
+ * accessed from code */
+struct MatrixStats {
+    MatrixStats (size_t n, size_t d, const float *x); // analyzes n vectors of dimension d read from x
+    std::string comments; // textual report, filled in by the constructor
+    // raw statistics, filled in by the constructor
+    size_t n, d; // number of vectors and their dimension
+    size_t n_collision, n_valid, n0; // hash matches with different bytes / vectors with finite squared norm / vectors with zero norm
+    double min_norm2, max_norm2; // extreme squared norms during the scan; replaced by their sqrt at the end of the constructor
+    struct PerDimStats {
+        size_t n, n_nan, n_inf, n0; // values seen / NaN values / infinite values / exact zeros
+        float min, max; // range of the finite values
+        double sum, sum2; // sum and sum of squares of the finite values
+        size_t n_valid; // n - n_nan - n_inf, set by compute_mean_std()
+        double mean, stddev; // NaN until compute_mean_std() is called
+        PerDimStats();
+        void add (float x); // accumulate one coordinate value
+        void compute_mean_std (); // derive n_valid, mean and stddev from the counters
+    };
+    std::vector<PerDimStats> per_dim_stats; // one entry per dimension
+    struct Occurrence {
+        size_t first; // index of the first vector seen with this hash
+        size_t count; // number of vectors with the same bytes as that first one
+    };
+    std::unordered_map<uint64_t, Occurrence> occurrences; // keyed by the hash of a vector's bytes
+    char *buf; // write position in the comment buffer; nullptr once the constructor is done
+    size_t nbuf; // bytes left after buf; 0 once the constructor is done
+    void do_comment (const char *fmt, ...); // printf-style append at buf
+};
 } // namespace faiss

--- a/AuxIndexStructures.cpp
+++ b/AuxIndexStructures.cpp
@@ -8,11 +8,12 @@
 // -*- c++ -*-
+#include <cstring>
 #include "AuxIndexStructures.h"
 #include "FaissAssert.h"
-#include <cstring>
 namespace faiss {
@@ -72,6 +73,15 @@ BufferList::~BufferList ()
    }
 }
+/// Store one (id, distance) pair at the write position wp of the last
+/// buffer, calling append_buffer() first when wp has reached buffer_size.
+void BufferList::add (idx_t id, float dis) {
+    const bool buffer_full = (wp == buffer_size);
+    if (buffer_full) {
+        append_buffer();
+    }
+    Buffer & current = buffers.back();
+    current.ids [wp] = id;
+    current.dis [wp] = dis;
+    ++wp;
+}
 void BufferList::append_buffer ()
@@ -106,6 +116,12 @@ void BufferList::copy_range (size_t ofs, size_t n,
 * RangeSearchPartialResult
 ***********************************************************************/
+/// Count one more result for this query and forward the pair to the
+/// partial result pres points to (which takes the id first).
+void RangeQueryResult::add (float dis, idx_t id) {
+    ++nres;
+    pres->add (id, dis);
+}
 RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in):
    BufferList(res_in->buffer_size),
@@ -114,10 +130,10 @@ RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in):
 /// begin a new result
-RangeSearchPartialResult::QueryResult &
+RangeQueryResult &
    RangeSearchPartialResult::new_result (idx_t qno)
 {
-    QueryResult qres = {qno, 0, this};
+    RangeQueryResult qres = {qno, 0, this};
    queries.push_back (qres);
    return queries.back();
 }
@@ -140,7 +156,7 @@ void RangeSearchPartialResult::finalize ()
 void RangeSearchPartialResult::set_lims ()
 {
    for (int i = 0; i < queries.size(); i++) {
-        QueryResult & qres = queries[i];
+        RangeQueryResult & qres = queries[i];
        res->lims[qres.qno] = qres.nres;
    }
 }
@@ -150,7 +166,7 @@ void RangeSearchPartialResult::set_result (bool incremental)
 {
    size_t ofs = 0;
    for (int i = 0; i < queries.size(); i++) {
-        QueryResult & qres = queries[i];
+        RangeQueryResult & qres = queries[i];
        copy_range (ofs, qres.nres,
                    res->labels + res->lims[qres.qno],
@@ -246,6 +262,38 @@ size_t VectorIOReader::operator()(
 }
+/***********************************************************
+ * Interrupt callback
+ ***********************************************************/
+std::unique_ptr<InterruptCallback> InterruptCallback::instance;
+/// Raise "computation interrupted" when a callback is installed and it
+/// asks for an interrupt; do nothing otherwise.
+void InterruptCallback::check () {
+    if (instance.get() && instance->want_interrupt ()) {
+        FAISS_THROW_MSG ("computation interrupted");
+    }
+}
+/// Report whether the installed callback asks for an interrupt; false when
+/// no callback is installed.
+bool InterruptCallback::is_interrupted () {
+    return instance.get() != nullptr && instance->want_interrupt();
+}
+/// Suggest how many iterations of `flops` operations each to run between
+/// two interrupt checks.
+size_t InterruptCallback::get_period_hint (size_t flops) {
+    if (instance.get()) {
+        // for 10M flops, it is reasonable to check once every 10 iterations
+        const size_t flops_between_checks = (size_t)10 * 10 * 1000 * 1000;
+        return std::max(flops_between_checks / (flops + 1), (size_t)1);
+    }
+    return 1L << 30; // never check
+}

--- a/AuxIndexStructures.h
+++ b/AuxIndexStructures.h
@@ -18,6 +18,7 @@
 #include <vector>
 #include <unordered_set>
+#include <memory>
 #include "Index.h"
@@ -117,16 +118,7 @@ struct BufferList {
    // create a new buffer
    void append_buffer ();
-    inline void add (idx_t id, float dis)
+    void add (idx_t id, float dis);
-    {
-        if (wp == buffer_size) { // need new buffer
-            append_buffer();
-        }
-        Buffer & buf = buffers.back();
-        buf.ids [wp] = id;
-        buf.dis [wp] = dis;
-        wp++;
-    }
    /// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
    /// tables dest_ids, dest_dis
@@ -135,7 +127,17 @@ struct BufferList {
 };
+struct RangeSearchPartialResult;
+/// result structure for a single query
+struct RangeQueryResult {
+    using idx_t = Index::idx_t;
+    idx_t qno; // query number, set by RangeSearchPartialResult::new_result()
+    size_t nres; // number of results added so far; starts at 0
+    RangeSearchPartialResult * pres; // partial result that stores the (id, distance) pairs
+    void add (float dis, idx_t id); // increments nres and calls pres->add (id, dis)
+};
 /// the entries in the buffers are split per query
 struct RangeSearchPartialResult: BufferList {
@@ -143,21 +145,10 @@ struct RangeSearchPartialResult: BufferList {
    explicit RangeSearchPartialResult (RangeSearchResult * res_in);
-    /// result structure for a single query
+    std::vector<RangeQueryResult> queries;
-    struct QueryResult {
-        idx_t qno;
-        size_t nres;
-        RangeSearchPartialResult * pres;
-        inline void add (float dis, idx_t id) {
-            nres++;
-            pres->add (id, dis);
-        }
-    };
-    std::vector<QueryResult> queries;
    /// begin a new result
-    QueryResult & new_result (idx_t qno);
+    RangeQueryResult & new_result (idx_t qno);
    void finalize ();
@@ -173,7 +164,6 @@ struct RangeSearchPartialResult: BufferList {
 * Abstract I/O objects
 ***********************************************************/
 struct IOReader {
    // name that can be used in error messages
    std::string name;
@@ -214,6 +204,57 @@ struct VectorIOWriter:IOWriter {
    size_t operator()(const void *ptr, size_t size, size_t nitems) override;
 };
+/***********************************************************
+ * The distance computer maintains a current query and computes
+ * distances to elements in an index that supports random access.
+ *
+ * The DistanceComputer is not intended to be thread-safe (e.g. because
+ * it maintains counters), so the distance functions are not const;
+ * instantiate one from each thread if needed.
+ ***********************************************************/
+ struct DistanceComputer {
+     using idx_t = Index::idx_t;
+     /// called before computing distances
+     virtual void set_query(const float *x) = 0;
+     /// compute distance of vector i to current query
+     virtual float operator () (idx_t i) = 0;
+     /// compute distance between two stored vectors
+     virtual float symmetric_dis (idx_t i, idx_t j) = 0;
+     virtual ~DistanceComputer() {}
+ };
+/***********************************************************
+ * Interrupt callback
+ ***********************************************************/
+struct InterruptCallback {
+    virtual bool want_interrupt () = 0; // true when the computation should stop
+    virtual ~InterruptCallback() {}
+    static std::unique_ptr<InterruptCallback> instance; // installed callback; when empty, check() and is_interrupted() report no interrupt
+    /** check if:
+     * - an interrupt callback is set
+     * - the callback returns true
+     * if this is the case, then throw an exception
+     */
+    static void check ();
+    /// same as check() but return true if is interrupted instead of
+    /// throwing
+    static bool is_interrupted ();
+    /** assuming each iteration takes a certain number of flops, what
+     * is a reasonable interval to check for interrupts?
+     */
+    static size_t get_period_hint (size_t flops);
+};
 }; // namespace faiss

--- a/Clustering.cpp
+++ b/Clustering.cpp
@@ -9,6 +9,7 @@
 // -*- c++ -*-
 #include "Clustering.h"
+#include "AuxIndexStructures.h"
 #include <cmath>
@@ -24,7 +25,9 @@ namespace faiss {
 ClusteringParameters::ClusteringParameters ():
    niter(25),
    nredo(1),
-    verbose(false), spherical(false),
+    verbose(false),
+    spherical(false),
+    int_centroids(false),
    update_index(false),
    frozen_centroids(false),
    min_points_per_centroid(39),
@@ -58,7 +61,18 @@ static double imbalance_factor (int n, int k, long *assign) {
    return uf;
 }
+/// Apply the optional centroid post-processing: L2 renormalization when
+/// `spherical` is set, then rounding of every coordinate with roundf when
+/// `int_centroids` is set.
+void Clustering::post_process_centroids ()
+{
+    if (spherical) {
+        fvec_renorm_L2 (d, k, centroids.data());
+    }
+    if (!int_centroids) {
+        return;
+    }
+    for (auto & coordinate : centroids) {
+        coordinate = roundf (coordinate);
+    }
+}
 void Clustering::train (idx_t nx, const float *x_in, Index & index) {
@@ -117,9 +131,6 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
               "redo %d times, %d iterations\n",
               int(nx), d, k, nredo, niter);
    idx_t * assign = new idx_t[nx];
    ScopeDeleter<idx_t> del (assign);
    float * dis = new float[nx];
@@ -146,7 +157,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    double t_search_tot = 0;
    if (verbose) {
        printf("  Preprocessing in %.2f s\n",
-               (getmillisecs() - t0)/1000.);
+               (getmillisecs() - t0) / 1000.);
    }
    t0 = getmillisecs();
@@ -156,7 +167,6 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            printf("Outer iteration %d / %d\n", redo, nredo);
        }
        // initialize remaining centroids with random points from the dataset
        centroids.resize (d * k);
        std::vector<int> perm (nx);
@@ -166,9 +176,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            memcpy (&centroids[i * d], x + perm[i] * d,
                    d * sizeof (float));
-        if (spherical) {
+        post_process_centroids ();
-            fvec_renorm_L2 (d, k, centroids.data());
-        }
        if (index.ntotal != 0) {
            index.reset();
@@ -183,6 +191,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
        for (int i = 0; i < niter; i++) {
            double t0s = getmillisecs();
            index.search (nx, x, 1, dis, assign);
+            InterruptCallback::check();
            t_search_tot += getmillisecs() - t0s;
            err = 0;
@@ -204,8 +213,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                fflush (stdout);
            }
-            if (spherical)
+            post_process_centroids ();
-                fvec_renorm_L2 (d, k, centroids.data());
            index.reset ();
            if (update_index)
@@ -213,6 +221,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            assert (index.ntotal == 0);
            index.add (k, centroids.data());
+            InterruptCallback::check ();
        }
        if (verbose) printf("\n");
        if (nredo > 1) {

--- a/Clustering.h
+++ b/Clustering.h
@@ -26,6 +26,7 @@ struct ClusteringParameters {
    bool verbose;
    bool spherical;     ///< do we want normalized centroids?
+    bool int_centroids; ///< round centroids coordinates to integer
    bool update_index;  ///< update index after each iteration?
    bool frozen_centroids;  ///< use the centroids provided as input and do not change them during iterations
@@ -72,6 +73,10 @@ struct Clustering: ClusteringParameters {
    /// Index is used during the assignment stage
    virtual void train (idx_t n, const float * x, faiss::Index & index);
+    /// Post-process the centroids after each centroid update.
+    /// includes optional L2 normalization and nearest integer rounding
+    void post_process_centroids ();
    virtual ~Clustering() {}
 };

--- a/HNSW.cpp
+++ b/HNSW.cpp
@@ -9,12 +9,11 @@
 // -*- c++ -*-
 #include "HNSW.h"
+#include "AuxIndexStructures.h"
 namespace faiss {
 using idx_t = Index::idx_t;
-using DistanceComputer = HNSW::DistanceComputer;
 /**************************************************************
 * HNSW structure implementation
@@ -544,12 +543,24 @@ int HNSW::search_from_candidates(
    vt.set(v1);
  }
+  bool do_dis_check = check_relative_distance;
  int nstep = 0;
  while (candidates.size() > 0) {
    float d0 = 0;
    int v0 = candidates.pop_min(&d0);
+    if (do_dis_check) {
+      // tricky stopping condition: there are more that ef
+      // distances that are processed already that are smaller
+      // than d0
+      int n_dis_below = candidates.count_below(d0);
+      if(n_dis_below >= efSearch) {
+        break;
+      }
+    }
    size_t begin, end;
    neighbor_range(v0, level, &begin, &end);
@@ -572,7 +583,7 @@ int HNSW::search_from_candidates(
    }
    nstep++;
-    if (nstep > efSearch) {
+    if (!do_dis_check && nstep > efSearch) {
      break;
    }
  }
@@ -596,38 +607,31 @@ int HNSW::search_from_candidates(
 * Searching
 **************************************************************/
-template<typename T>
+std::priority_queue<HNSW::Node> HNSW::search_from_candidate_unbounded(
-using MaxHeap = std::priority_queue<T, std::vector<T>, std::less<T>>;
-template<typename T>
-using MinHeap = std::priority_queue<T, std::vector<T>, std::greater<T>>;
-MaxHeap<HNSW::Node> HNSW::search_from(
  const Node& node,
  DistanceComputer& qdis,
  int ef,
  VisitedTable *vt) const
 {
-  MaxHeap<Node> top_candidates;
+  int ndis = 0;
-  MinHeap<Node> candidate_set;
+  std::priority_queue<Node> top_candidates;
+  std::priority_queue<Node, std::vector<Node>, std::greater<Node>> candidates;
  top_candidates.push(node);
-  candidate_set.push(node);
+  candidates.push(node);
  vt->set(node.second);
-  float lower_bound = node.first;
+  while (!candidates.empty()) {
-  while (!candidate_set.empty()) {
    float d0;
    storage_idx_t v0;
-    std::tie(d0, v0) = candidate_set.top();
+    std::tie(d0, v0) = candidates.top();
-    if (d0 > lower_bound) {
+    if (d0 > top_candidates.top().first) {
      break;
    }
-    candidate_set.pop();
+    candidates.pop();
    size_t begin, end;
    neighbor_range(v0, 0, &begin, &end);
@@ -645,20 +649,28 @@ MaxHeap<HNSW::Node> HNSW::search_from(
      vt->set(v1);
      float d1 = qdis(v1);
+      ++ndis;
      if (top_candidates.top().first > d1 || top_candidates.size() < ef) {
-        candidate_set.emplace(d1, v1);
+        candidates.emplace(d1, v1);
        top_candidates.emplace(d1, v1);
        if (top_candidates.size() > ef) {
          top_candidates.pop();
        }
-        lower_bound = top_candidates.top().first;
      }
    }
  }
+#pragma omp critical
+  {
+    ++hnsw_stats.n1;
+    if (candidates.size() == 0) {
+      ++hnsw_stats.n2;
+    }
+    hnsw_stats.n3 += ndis;
+  }
  return top_candidates;
 }
@@ -677,7 +689,17 @@ void HNSW::search(DistanceComputer& qdis, int k,
    }
    int ef = std::max(efSearch, k);
-    MaxHeap<Node> top_candidates = search_from(Node(d_nearest, nearest), qdis, ef, &vt);
+    if (search_bounded_queue) {
+      MinimaxHeap candidates(ef);
+      candidates.push(nearest, d_nearest);
+      search_from_candidates(qdis, k, I, D, candidates, vt, 0);
+    } else {
+      std::priority_queue<Node> top_candidates =
+        search_from_candidate_unbounded(Node(d_nearest, nearest),
+                                        qdis, ef, &vt);
      while (top_candidates.size() > k) {
        top_candidates.pop();
      }
@@ -690,19 +712,11 @@ void HNSW::search(DistanceComputer& qdis, int k,
        faiss::maxheap_push(++nres, D, I, d, label);
        top_candidates.pop();
      }
+    }
-    // MinimaxHeap candidates(candidates_size);
-//    top_candidates.emplace(d_nearest, nearest);
-    // search_from_candidates(qdis, k, I, D, candidates, vt, 0);
-    // NOTE(hoss): Init at the beginning?
    vt.advance();
  } else {
-    assert(false);
    int candidates_size = upper_beam;
    MinimaxHeap candidates(candidates_size);
@@ -742,44 +756,47 @@ void HNSW::MinimaxHeap::push(storage_idx_t i, float v) {
  if (k == n) {
    if (v >= dis[0]) return;
    faiss::heap_pop<HC> (k--, dis.data(), ids.data());
+    --nvalid;
  }
  faiss::heap_push<HC> (++k, dis.data(), ids.data(), v, i);
+  ++nvalid;
 }
 float HNSW::MinimaxHeap::max() const {
-  assert(k > 0);
  return dis[0];
 }
 int HNSW::MinimaxHeap::size() const {
-  return k;
+  return nvalid;
 }
 void HNSW::MinimaxHeap::clear() {
-  k = 0;
+  nvalid = k = 0;
 }
 int HNSW::MinimaxHeap::pop_min(float *vmin_out) {
  assert(k > 0);
  // returns min. This is an O(n) operation
  int i = k - 1;
+  while (i >= 0) {
+    if (ids[i] != -1) break;
+    i--;
+  }
+  if (i == -1) return -1;
  int imin = i;
  float vmin = dis[i];
  i--;
  while(i >= 0) {
-    if (dis[i] < vmin) {
+    if (ids[i] != -1 && dis[i] < vmin) {
      vmin = dis[i];
      imin = i;
    }
    i--;
  }
-  assert(2 * i > k);
  if (vmin_out) *vmin_out = vmin;
  int ret = ids[imin];
+  ids[imin] = -1;
-  --k;
+  --nvalid;
-  faiss::heap_push<HC>(++imin, dis.data(), ids.data(), ids[k], dis[k]);
  return ret;
 }

--- a/HNSW.h
+++ b/HNSW.h
@@ -37,12 +37,12 @@ namespace faiss {
 * (https://github.com/searchivarius/nmslib)
 *
 * The HNSW object stores only the neighbor link structure, see
- * IndexHNSW below for the full index object.
+ * IndexHNSW.h for the full index object.
 */
 struct VisitedTable;
+struct DistanceComputer; // from AuxIndexStructures
 struct HNSW {
  /// internal storage of vectors (32 bits: this is expensive)
@@ -53,37 +53,18 @@ struct HNSW {
  typedef std::pair<float, storage_idx_t> Node;
-  /** The HNSW structure does not store vectors, it only accesses
-   * them through this class.
-   *
-   * Functions are guaranteed to be be accessed only from 1 thread. */
-  struct DistanceComputer {
-    idx_t d;
-    /// called before computing distances
-    virtual void set_query(const float *x) = 0;
-    /// compute distance of vector i to current query
-    virtual float operator () (storage_idx_t i) = 0;
-    /// compute distance between two stored vectors
-    virtual float symmetric_dis(storage_idx_t i, storage_idx_t j) = 0;
-    virtual ~DistanceComputer() {}
-  };
  /** Heap structure that allows fast
   */
  struct MinimaxHeap {
    int n;
    int k;
+    int nvalid;
    std::vector<storage_idx_t> ids;
    std::vector<float> dis;
    typedef faiss::CMax<float, storage_idx_t> HC;
-    explicit MinimaxHeap(int n): n(n), k(0), ids(n), dis(n) {}
+    explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
    void push(storage_idx_t i, float v);
@@ -147,9 +128,15 @@ struct HNSW {
  /// expansion factor at search time
  int efSearch;
+  /// during search: do we check whether the next best distance is good enough?
+  bool check_relative_distance = true;
  /// number of entry points in levels > 0.
  int upper_beam;
+  /// use bounded queue during exploration
+  bool search_bounded_queue = true;
  // methods that initialize the tree sizes
  /// initialize the assign_probas and cum_nneighbor_per_level to
@@ -201,10 +188,12 @@ struct HNSW {
                             VisitedTable &vt,
                             int level, int nres_in = 0) const;
-  std::priority_queue<Node> search_from(const Node& node,
+  std::priority_queue<Node> search_from_candidate_unbounded(
+    const Node& node,
    DistanceComputer& qdis,
    int ef,
-                                        VisitedTable *vt) const;
+    VisitedTable *vt
+  ) const;
  /// search interface
  void search(DistanceComputer& qdis, int k,

--- a/IVFlib.cpp
+++ b/IVFlib.cpp
@@ -234,7 +234,7 @@ void SlidingIndexWindow::step(const Index *sub_index, bool remove_oldest) {
            for (int j = 0; j + 1 < n_slice; j++) {
                sizes[i][j] = sizes[i][j + 1] - amount_to_remove;
            }
-            sizes[i].resize(sizes[i].size() - 1);
+            sizes[i].pop_back ();
        }
        n_slice--;
    } else {

--- a/Index.h
+++ b/Index.h
@@ -60,8 +60,9 @@ struct RangeSearchResult;
 * database-to-database queries are not implemented.
 */
 struct Index {
+    using idx_t = long;    ///< all indices are this type
-    typedef long idx_t;    ///< all indices are this type
+    using component_t = float;
+    using distance_t = float;
    int d;                 ///< vector dimension
    idx_t ntotal;          ///< total nb of indexed vectors

--- a/IndexBinary.h
+++ b/IndexBinary.h
@@ -35,7 +35,9 @@ struct RangeSearchResult;
 * vectors.
 */
 struct IndexBinary {
-  typedef long idx_t;    ///< all indices are this type
+  using idx_t = Index::idx_t;    ///< all indices are this type
+  using component_t = uint8_t;
+  using distance_t = int32_t;
  int d;                 ///< vector dimension
  int code_size;   ///< number of bytes per vector ( = d / 8 )

--- a/IndexBinaryHNSW.cpp
+++ b/IndexBinaryHNSW.cpp
@@ -32,7 +32,7 @@
 #include "FaissAssert.h"
 #include "IndexBinaryFlat.h"
 #include "hamming.h"
+#include "AuxIndexStructures.h"
 namespace faiss {
@@ -121,7 +121,7 @@ void hnsw_add_vertices(IndexBinaryHNSW& index_hnsw,
      {
        VisitedTable vt (ntotal);
-        std::unique_ptr<HNSW::DistanceComputer> dis(
+        std::unique_ptr<DistanceComputer> dis(
          index_hnsw.get_distance_computer()
        );
        int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
@@ -202,7 +202,7 @@ void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k,
 #pragma omp parallel
  {
    VisitedTable vt(ntotal);
-    std::unique_ptr<HNSW::DistanceComputer> dis(get_distance_computer());
+    std::unique_ptr<DistanceComputer> dis(get_distance_computer());
 #pragma omp for
    for(idx_t i = 0; i < n; i++) {
@@ -252,18 +252,18 @@ namespace {
 template<class HammingComputer>
-struct FlatHammingDis : HNSW::DistanceComputer {
+struct FlatHammingDis : DistanceComputer {
  const int code_size;
  const uint8_t *b;
  size_t ndis;
  HammingComputer hc;
-  float operator () (HNSW::storage_idx_t i) override {
+  float operator () (idx_t i) override {
    ndis++;
    return hc.hamming(b + i * code_size);
  }
-  float symmetric_dis(HNSW::storage_idx_t i, HNSW::storage_idx_t j) override {
+  float symmetric_dis(idx_t i, idx_t j) override {
    return HammingComputerDefault(b + j * code_size, code_size)
      .hamming(b + i * code_size);
  }
@@ -281,7 +281,7 @@ struct FlatHammingDis : HNSW::DistanceComputer {
    hc.set((uint8_t *)x, code_size);
  }
-  virtual ~FlatHammingDis() {
+  ~FlatHammingDis() override {
 #pragma omp critical
    {
      hnsw_stats.ndis += ndis;
@@ -293,7 +293,7 @@ struct FlatHammingDis : HNSW::DistanceComputer {
 }  // namespace
-HNSW::DistanceComputer *IndexBinaryHNSW::get_distance_computer() const {
+DistanceComputer *IndexBinaryHNSW::get_distance_computer() const {
  IndexBinaryFlat *flat_storage = dynamic_cast<IndexBinaryFlat *>(storage);
  FAISS_ASSERT(flat_storage != nullptr);

--- a/IndexBinaryHNSW.h
+++ b/IndexBinaryHNSW.h
@@ -37,7 +37,7 @@ struct IndexBinaryHNSW : IndexBinary {
  ~IndexBinaryHNSW() override;
-  HNSW::DistanceComputer *get_distance_computer() const;
+  DistanceComputer *get_distance_computer() const;
  void add(idx_t n, const uint8_t *x) override;

--- a/IndexBinaryIVF.cpp
+++ b/IndexBinaryIVF.cpp
@@ -252,39 +252,42 @@ long IndexBinaryIVF::remove_ids(const IDSelector& sel) {
 }
 void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
-  if (verbose)
+  if (verbose) {
-    printf("Training level-1 quantizer\n");
+    printf("Training quantizer\n");
+  }
-  train_q1(n, x, verbose);
+  if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
+    if (verbose) {
+      printf("IVF quantizer does not need training.\n");
+    }
+  } else {
+    if (verbose) {
+      printf("Training quantizer on %ld vectors in %dD\n", n, d);
+    }
-  is_trained = true;
+    Clustering clus(d, nlist, cp);
-}
+    quantizer->reset();
-double IndexBinaryIVF::imbalance_factor () const {
+    std::unique_ptr<float[]> x_f(new float[n * d]);
-  std::vector<int> hist(nlist);
+    binary_to_real(n * d, x, x_f.get());
-  for (int i = 0; i < nlist; i++) {
+    IndexFlatL2 index_tmp(d);
-    hist[i] = invlists->list_size(i);
+    if (clustering_index && verbose) {
+      printf("using clustering_index of dimension %d to do the clustering\n",
+             clustering_index->d);
    }
-  return faiss::imbalance_factor(nlist, hist.data());
+    clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
-}
-void IndexBinaryIVF::print_stats() const {
+    std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
-  std::vector<int> sizes(40);
+    real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
-  for (int i = 0; i < nlist; i++) {
-    for (int j = 0; j < sizes.size(); j++) {
+    quantizer->add(clus.k, x_b.get());
-      if ((invlists->list_size(i) >> j) == 0) {
+    quantizer->is_trained = true;
-        sizes[j]++;
-        break;
-      }
-    }
-  }
-  for (int i = 0; i < sizes.size(); i++) {
-    if (sizes[i]) {
-      printf("list size in < %d: %d instances\n", 1 << i, sizes[i]);
-    }
  }
+  is_trained = true;
 }
 void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
@@ -315,38 +318,6 @@ void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
 }
-void IndexBinaryIVF::train_q1(size_t n, const uint8_t *x, bool verbose) {
-  if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
-    if (verbose)
-      printf("IVF quantizer does not need training.\n");
-  } else {
-    if (verbose)
-      printf("Training level-1 quantizer on %ld vectors in %dD\n", n, d);
-    Clustering clus(d, nlist, cp);
-    quantizer->reset();
-    std::unique_ptr<float[]> x_f(new float[n * d]);
-    binary_to_real(n * d, x, x_f.get());
-    IndexFlatL2 index_tmp(d);
-    if (clustering_index && verbose) {
-        printf("using clustering_index of dimension %d to do the clustering\n",
-               clustering_index->d);
-    }
-    clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
-    std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
-    real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
-    quantizer->add(clus.k, x_b.get());
-    quantizer->is_trained = true;
-  }
-}
 namespace {
 using idx_t = Index::idx_t;

--- a/IndexBinaryIVF.h
+++ b/IndexBinaryIVF.h
@@ -58,9 +58,6 @@ struct IndexBinaryIVF : IndexBinary {
    ClusteringParameters cp; ///< to override default clustering params
    Index *clustering_index; ///< to override index used during clustering
-    /// Trains the quantizer and calls train_residual to train sub-quantizers
-    void train_q1(size_t n, const uint8_t *x, bool verbose);
    /** The Inverted file takes a quantizer (an IndexBinary) on input,
     * which implements the function mapping a vector to a list
     * identifier. The pointer is borrowed: the quantizer should not
@@ -74,10 +71,9 @@ struct IndexBinaryIVF : IndexBinary {
    void reset() override;
-    /// Trains the quantizer and calls train_residual to train sub-quantizers
+    /// Trains the quantizer
    void train(idx_t n, const uint8_t *x) override;
-    /// Quantizes x and calls add_with_key
    void add(idx_t n, const uint8_t *x) override;
    void add_with_ids(idx_t n, const uint8_t *x, const long *xids) override;
@@ -174,12 +170,6 @@ struct IndexBinaryIVF : IndexBinary {
     */
    void make_direct_map(bool new_maintain_direct_map=true);
-    /// 1= perfectly balanced, >1: imbalanced
-    double imbalance_factor() const;
-    /// display some stats about the inverted lists
-    void print_stats() const;
    void replace_invlists(InvertedLists *il, bool own=false);
 };

--- a/IndexHNSW.cpp
+++ b/IndexHNSW.cpp
@@ -35,6 +35,7 @@
 #include "FaissAssert.h"
 #include "IndexFlat.h"
 #include "IndexIVFPQ.h"
+#include "AuxIndexStructures.h"
 extern "C" {
@@ -55,7 +56,6 @@ using MinimaxHeap = HNSW::MinimaxHeap;
 using storage_idx_t = HNSW::storage_idx_t;
 using NodeDistCloser = HNSW::NodeDistCloser;
 using NodeDistFarther = HNSW::NodeDistFarther;
-using DistanceComputer = HNSW::DistanceComputer;
 HNSWStats hnsw_stats;
@@ -71,6 +71,7 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
                       size_t n, const float *x,
                       bool verbose,
                       bool preset_levels = false) {
+    size_t d = index_hnsw.d;
    HNSW & hnsw = index_hnsw.hnsw;
    size_t ntotal = n0 + n;
    double t0 = getmillisecs();
@@ -80,6 +81,10 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
               n, n0, int(preset_levels));
    }
+    if (n == 0) {
+        return;
+    }
    int max_level = hnsw.prepare_level_tab(n, preset_levels);
    if (verbose) {
@@ -119,6 +124,10 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
        }
    }
+    idx_t check_period = InterruptCallback::get_period_hint
+        (max_level * index_hnsw.d * hnsw.efConstruction);
    { // perform add
        RandomGenerator rng2(789);
@@ -136,18 +145,26 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
            for (int j = i0; j < i1; j++)
                std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
-#pragma omp parallel
+            bool interrupt = false;
+#pragma omp parallel if(i1 > i0 + 100)
            {
                VisitedTable vt (ntotal);
                DistanceComputer *dis = index_hnsw.get_distance_computer();
                ScopeDeleter1<DistanceComputer> del(dis);
                int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
+                size_t counter = 0;
 #pragma omp  for schedule(dynamic)
                for (int i = i0; i < i1; i++) {
                    storage_idx_t pt_id = order[i];
-                    dis->set_query (x + (pt_id - n0) * dis->d);
+                    dis->set_query (x + (pt_id - n0) * d);
+                    // cannot break
+                    if (interrupt) {
+                        continue;
+                    }
                    hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
@@ -156,7 +173,21 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
                        printf("  %d / %d\r", i - i0, i1 - i0);
                        fflush(stdout);
                    }
+                    if (counter % check_period == 0) {
+#pragma omp critical
+                        {
+                            if (InterruptCallback::is_interrupted ()) {
+                                interrupt = true;
+                            }
+                        }
+                    }
+                    counter++;
+                }
            }
+            if (interrupt) {
+                FAISS_THROW_MSG ("computation interrupted");
            }
            i1 = i0;
        }
@@ -214,16 +245,22 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k,
                        float *distances, idx_t *labels) const
 {
+    size_t nreorder = 0;
-#pragma omp parallel
+    idx_t check_period = InterruptCallback::get_period_hint (
+          hnsw.max_level * d * hnsw.efSearch);
+    for (idx_t i0 = 0; i0 < n; i0 += check_period) {
+        idx_t i1 = std::min(i0 + check_period, n);
+#pragma omp parallel reduction(+ : nreorder)
        {
            VisitedTable vt (ntotal);
            DistanceComputer *dis = get_distance_computer();
            ScopeDeleter1<DistanceComputer> del(dis);
-        size_t nreorder = 0;
 #pragma omp for
-        for(idx_t i = 0; i < n; i++) {
+            for(idx_t i = i0; i < i1; i++) {
                idx_t * idxi = labels + i * k;
                float * simi = distances + i * k;
                dis->set_query(x + i * d);
@@ -245,14 +282,13 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k,
                    maxheap_heapify (k_reorder, simi, idxi, simi, idxi, k_reorder);
                    maxheap_reorder (k_reorder, simi, idxi);
                }
            }
-#pragma omp critical
-        {
-            hnsw_stats.nreorder += nreorder;
        }
+        InterruptCallback::check ();
    }
+    hnsw_stats.nreorder += nreorder;
 }
@@ -552,7 +588,7 @@ namespace {
 // storage that explicitly reconstructs vectors before computing distances
 struct GenericDistanceComputer: DistanceComputer {
+    size_t d;
    const Index & storage;
    std::vector<float> buf;
    const float *q;
@@ -563,13 +599,13 @@ struct GenericDistanceComputer: DistanceComputer {
        buf.resize(d * 2);
    }
-    float operator () (storage_idx_t i) override
+    float operator () (idx_t i) override
    {
        storage.reconstruct(i, buf.data());
        return fvec_L2sqr(q, buf.data(), d);
    }
-    float symmetric_dis(storage_idx_t i, storage_idx_t j) override
+    float symmetric_dis(idx_t i, idx_t j) override
    {
        storage.reconstruct(i, buf.data());
        storage.reconstruct(j, buf.data() + d);
@@ -830,18 +866,19 @@ namespace {
 struct FlatL2Dis: DistanceComputer {
+    size_t d;
    Index::idx_t nb;
    const float *q;
    const float *b;
    size_t ndis;
-    float operator () (storage_idx_t i) override
+    float operator () (idx_t i) override
    {
        ndis++;
        return (fvec_L2sqr(q, b + i * d, d));
    }
-    float symmetric_dis(storage_idx_t i, storage_idx_t j) override
+    float symmetric_dis(idx_t i, idx_t j) override
    {
        return (fvec_L2sqr(b + j * d, b + i * d, d));
    }
@@ -860,7 +897,7 @@ struct FlatL2Dis: DistanceComputer {
        q = x;
    }
-    virtual ~FlatL2Dis () {
+    ~FlatL2Dis() override {
 #pragma omp critical
        {
            hnsw_stats.ndis += ndis;
@@ -903,6 +940,7 @@ namespace {
 struct PQDis: DistanceComputer {
+    size_t d;
    Index::idx_t nb;
    const uint8_t *codes;
    size_t code_size;
@@ -911,7 +949,7 @@ struct PQDis: DistanceComputer {
    std::vector<float> precomputed_table;
    size_t ndis;
-    float operator () (storage_idx_t i) override
+    float operator () (idx_t i) override
    {
        const uint8_t *code = codes + i * code_size;
        const float *dt = precomputed_table.data();
@@ -924,7 +962,7 @@ struct PQDis: DistanceComputer {
        return accu;
    }
-    float symmetric_dis(storage_idx_t i, storage_idx_t j) override
+    float symmetric_dis(idx_t i, idx_t j) override
    {
        const float * sdci = sdc;
        float accu = 0;
@@ -955,7 +993,7 @@ struct PQDis: DistanceComputer {
        pq.compute_distance_table(x, precomputed_table.data());
    }
-    virtual ~PQDis () {
+    ~PQDis() override {
 #pragma omp critical
        {
            hnsw_stats.ndis += ndis;
@@ -995,56 +1033,10 @@ DistanceComputer * IndexHNSWPQ::get_distance_computer () const
 **************************************************************/
-namespace {
-struct SQDis: DistanceComputer {
-    Index::idx_t nb;
-    const uint8_t *codes;
-    size_t code_size;
-    const ScalarQuantizer & sq;
-    const float *q;
-    ScalarQuantizer::DistanceComputer * dc;
-    float operator () (storage_idx_t i) override
-    {
-        const uint8_t *code = codes + i * code_size;
-        return dc->compute_distance (q, code);
-    }
-    float symmetric_dis(storage_idx_t i, storage_idx_t j) override
-    {
-        const uint8_t *codei = codes + i * code_size;
-        const uint8_t *codej = codes + j * code_size;
-        return dc->compute_code_distance (codei, codej);
-    }
-    SQDis(const IndexScalarQuantizer& storage, const float* /*q*/ = nullptr)
-        : sq(storage.sq) {
-      nb = storage.ntotal;
-      d = storage.d;
-      codes = storage.codes.data();
-      code_size = sq.code_size;
-      dc = sq.get_distance_computer();
-    }
-    void set_query(const float *x) override {
-        q = x;
-    }
-    virtual ~SQDis () {
-        delete dc;
-    }
-};
-}  // namespace
 IndexHNSWSQ::IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M):
    IndexHNSW (new IndexScalarQuantizer (d, qtype), M)
 {
+    is_trained = false;
    own_fields = true;
 }
@@ -1052,7 +1044,8 @@ IndexHNSWSQ::IndexHNSWSQ() {}
 DistanceComputer * IndexHNSWSQ::get_distance_computer () const
 {
-    return new SQDis (*dynamic_cast<IndexScalarQuantizer*> (storage));
+    return (dynamic_cast<const IndexScalarQuantizer*> (storage))->
+        get_distance_computer ();
 }
@@ -1078,7 +1071,7 @@ namespace {
 struct Distance2Level: DistanceComputer {
+    size_t d;
    const Index2Layer & storage;
    std::vector<float> buf;
    const float *q;
@@ -1093,7 +1086,7 @@ struct Distance2Level: DistanceComputer {
        buf.resize(2 * d);
    }
-    float symmetric_dis(storage_idx_t i, storage_idx_t j) override
+    float symmetric_dis(idx_t i, idx_t j) override
    {
        storage.reconstruct(i, buf.data());
        storage.reconstruct(j, buf.data() + d);
@@ -1122,7 +1115,7 @@ struct DistanceXPQ4: Distance2Level {
        pq_l1_tab = quantizer->xb.data();
    }
-    float operator () (storage_idx_t i) override
+    float operator () (idx_t i) override
    {
 #ifdef __SSE__
        const uint8_t *code = storage.codes.data() + i * storage.code_size;
@@ -1173,7 +1166,7 @@ struct Distance2xXPQ4: Distance2Level {
        pq_l1_tab = mi->pq.centroids.data();
    }
-    float operator () (storage_idx_t i) override
+    float operator () (idx_t i) override
    {
        const uint8_t *code = storage.codes.data() + i * storage.code_size;
        long key01 = 0;

--- a/IndexHNSW.h
+++ b/IndexHNSW.h
@@ -86,7 +86,7 @@ struct IndexHNSW : Index {
    ~IndexHNSW() override;
    // get a DistanceComputer object for this kind of storage
-    virtual HNSW::DistanceComputer *get_distance_computer() const = 0;
+    virtual DistanceComputer *get_distance_computer() const = 0;
    void add(idx_t n, const float *x) override;
@@ -138,7 +138,7 @@ struct IndexHNSW : Index {
 struct IndexHNSWFlat : IndexHNSW {
    IndexHNSWFlat();
    IndexHNSWFlat(int d, int M);
-    HNSW::DistanceComputer *
+    DistanceComputer *
      get_distance_computer() const override;
 };
@@ -149,7 +149,7 @@ struct IndexHNSWPQ : IndexHNSW {
    IndexHNSWPQ();
    IndexHNSWPQ(int d, int pq_m, int M);
    void train(idx_t n, const float* x) override;
-    HNSW::DistanceComputer *
+    DistanceComputer *
      get_distance_computer() const override;
 };
@@ -159,7 +159,7 @@ struct IndexHNSWPQ : IndexHNSW {
 struct IndexHNSWSQ : IndexHNSW {
    IndexHNSWSQ();
    IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
-    HNSW::DistanceComputer *
+    DistanceComputer *
      get_distance_computer() const override;
 };
@@ -168,7 +168,7 @@ struct IndexHNSWSQ : IndexHNSW {
 struct IndexHNSW2Level : IndexHNSW {
    IndexHNSW2Level();
    IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M);
-    HNSW::DistanceComputer *
+    DistanceComputer *
      get_distance_computer() const override;
    void flip_to_ivf();

--- a/IndexIVF.cpp
+++ b/IndexIVF.cpp
@@ -204,12 +204,18 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
    using HeapForIP = CMin<float, idx_t>;
    using HeapForL2 = CMax<float, idx_t>;
+    idx_t check_period = InterruptCallback::get_period_hint
+        (nprobe * ntotal * d / nlist);
+    for (idx_t i0 = 0; i0 < n; i0 += check_period) {
+        idx_t i1 = std::min(i0 + check_period, n);
 #pragma omp parallel reduction(+: nlistv, ndis, nheap)
        {
            InvertedListScanner *scanner = get_InvertedListScanner(store_pairs);
            ScopeDeleter1<InvertedListScanner> del(scanner);
 #pragma omp for
-        for (size_t i = 0; i < n; i++) {
+            for (size_t i = i0; i < i1; i++) {
                // loop over queries
                const float * xi = x + i * d;
                scanner->set_query (xi);
@@ -248,7 +254,6 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
                    nlistv++;
                    InvertedLists::ScopedCodes scodes (invlists, key);
                    const Index::idx_t * ids = store_pairs ? nullptr :
                        invlists->get_ids (key);
@@ -271,9 +276,10 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
                } else {
                    heap_reorder<HeapForL2> (k, simi, idxi);
                }
            } // parallel for
        } // parallel
+        InterruptCallback::check ();
+    } // loop over blocks
    indexIVF_stats.nq += n;
    indexIVF_stats.nlist += nlistv;
@@ -284,10 +290,83 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
+/** Range search: for each query, report the stored entries accepted by the
+ * scanner for the given radius, looking only at the nprobe inverted lists
+ * that the quantizer selects for that query.
+ *
+ * @param nx      number of queries
+ * @param x       query vectors, read as nx consecutive rows of d floats
+ * @param radius  threshold forwarded to the scanner's scan_codes_range
+ * @param result  output, filled through one RangeSearchPartialResult per
+ *                thread
+ *
+ * Throws (FAISS_THROW_IF_NOT_FMT) when the quantizer returns a key >= nlist.
+ * Also updates the global indexIVF_stats counters and timings.
+ */
+void IndexIVF::range_search (idx_t nx, const float *x, float radius,
+                             RangeSearchResult *result) const
+{
+    // coarse quantization: nprobe list ids and distances per query
+    long * keys = new long [nx * nprobe];
+    ScopeDeleter<long> del (keys);
+    float * coarse_dis = new float [nx * nprobe];
+    ScopeDeleter<float> del2 (coarse_dis);
+    double t0 = getmillisecs();
+    quantizer->search (nx, x, nprobe, coarse_dis, keys);
+    indexIVF_stats.quantization_time += getmillisecs() - t0;
+    t0 = getmillisecs();
+    invlists->prefetch_lists (keys, nx * nprobe);
+    size_t nlistv = 0, ndis = 0;
+    bool store_pairs = false;
+#pragma omp parallel reduction(+: nlistv, ndis)
+    {
+        RangeSearchPartialResult pres(result);
+        // NOTE(review): the scanner is dereferenced without a null check,
+        // while the base-class get_InvertedListScanner returns nullptr --
+        // confirm that every index using this function overrides it.
+        InvertedListScanner *scanner = get_InvertedListScanner(store_pairs);
+        ScopeDeleter1<InvertedListScanner> del3(scanner);
+#pragma omp for
+        for (size_t i = 0; i < nx; i++) {
+            const float * xi = x + i * d;
+            scanner->set_query (xi);
+            const long * keysi = keys + i * nprobe;
+            RangeQueryResult & qres = pres.new_result (i);
+            for (size_t ik = 0; ik < nprobe; ik++) {
+                long key = keysi[ik];  /* select the list  */
+                // negative keys are skipped silently
+                if (key < 0) continue;
+                // ik and nlist are cast so that every argument matches its
+                // %ld conversion.
+                // NOTE(review): this throws from inside an OpenMP parallel
+                // region -- confirm that this is handled as intended.
+                FAISS_THROW_IF_NOT_FMT (key < (long) nlist,
+                      "Invalid key=%ld  at ik=%ld nlist=%ld\n",
+                      key, (long) ik, (long) nlist);
+                const size_t list_size = invlists->list_size(key);
+                // nothing to scan, and not counted in the statistics
+                if (list_size == 0) continue;
+                InvertedLists::ScopedCodes scodes (invlists, key);
+                InvertedLists::ScopedIds ids (invlists, key);
+                scanner->set_list (key, coarse_dis[i * nprobe + ik]);
+                nlistv++;
+                ndis += list_size;
+                scanner->scan_codes_range (list_size, scodes.get(),
+                                           ids.get(), radius, qres);
+            }
+        }
+        pres.finalize ();
+    }
+    indexIVF_stats.search_time += getmillisecs() - t0;
+    indexIVF_stats.nq += nx;
+    indexIVF_stats.nlist += nlistv;
+    indexIVF_stats.ndis += ndis;
+}
+/// Default implementation: provides no scanner and returns nullptr.
+/// The store_pairs argument is ignored.
+InvertedListScanner *IndexIVF::get_InvertedListScanner (
+    bool /*store_pairs*/) const
+{
+    return nullptr;
+}
 void IndexIVF::reconstruct (idx_t key, float* recons) const
 {
    FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
                            "direct map is not initialized");
+    FAISS_THROW_IF_NOT_MSG (key >= 0 && key < direct_map.size(),
+                            "invalid key");
    long list_no = direct_map[key] >> 32;
    long offset = direct_map[key] & 0xffffffff;
    reconstruct_from_offset (list_no, offset, recons);
@@ -430,37 +509,6 @@ void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
 }
-double IndexIVF::imbalance_factor () const
-{
-    std::vector<int> hist (nlist);
-    for (int i = 0; i < nlist; i++) {
-        hist[i] = invlists->list_size(i);
-    }
-    return faiss::imbalance_factor (nlist, hist.data());
-}
-void IndexIVF::print_stats () const
-{
-    std::vector<int> sizes(40);
-    for (int i = 0; i < nlist; i++) {
-        for (int j = 0; j < sizes.size(); j++) {
-            if ((invlists->list_size(i) >> j) == 0) {
-                sizes[j]++;
-                break;
-            }
-        }
-    }
-    for (int i = 0; i < sizes.size(); i++) {
-        if (sizes[i]) {
-            printf ("list size in < %d: %d instances\n",
-                    1 << i, sizes[i]);
-        }
-    }
-}
 void IndexIVF::check_compatible_for_merge (const IndexIVF &other) const
 {
    // minimal sanity checks
@@ -581,5 +629,15 @@ void IndexIVFStats::reset()
 IndexIVFStats indexIVF_stats;
+/// Default implementation of range scanning: ignores all of its arguments
+/// and always throws with the message "scan_codes_range not implemented".
+void InvertedListScanner::scan_codes_range (size_t ,
+                       const uint8_t *,
+                       const idx_t *,
+                       float ,
+                       RangeQueryResult &) const
+{
+    FAISS_THROW_MSG ("scan_codes_range not implemented");
+}
 } // namespace faiss
--- a/IndexIVF.h
+++ b/IndexIVF.h
@@ -160,14 +160,15 @@ struct IndexIVF: Index, Level1Quantizer {
                                     ) const;
    /** assign the vectors, then call search_preassigned */
-    virtual void search (idx_t n, const float *x, idx_t k,
+    void search (idx_t n, const float *x, idx_t k,
                 float *distances, idx_t *labels) const override;
+    void range_search (idx_t n, const float* x, float radius,
+                       RangeSearchResult* result) const override;
    /// get a scanner for this index (store_pairs means ignore labels)
    virtual InvertedListScanner *get_InvertedListScanner (
-                 bool store_pairs=false) const {
+        bool store_pairs=false) const;
-        return nullptr;
-    }
    void reconstruct (idx_t key, float* recons) const override;
@@ -242,18 +243,14 @@ struct IndexIVF: Index, Level1Quantizer {
     */
    void make_direct_map (bool new_maintain_direct_map=true);
-    /// 1= perfectly balanced, >1: imbalanced
-    double imbalance_factor () const;
-    /// display some stats about the inverted lists
-    void print_stats () const;
    /// replace the inverted lists, old one is deallocated if own_invlists
    void replace_invlists (InvertedLists *il, bool own=false);
    IndexIVF ();
 };
+class RangeQueryResult;
 /** Object that handles a query. The inverted lists to scan are
 * provided externally. The object has a lot of state, but
 * distance_to_code and scan_codes can be called in multiple
@@ -271,8 +268,8 @@ struct InvertedListScanner {
    /// compute a single query-to-code distance
    virtual float distance_to_code (const uint8_t *code) const = 0;
-    /** compute the distances to codes. (distances, labels) should be
+    /** scan a set of codes, compute distances to current query and
-     * organized ad a min- or max-heap
+     * update heap of results if necessary.
     *
     * @param n      number of codes to scan
     * @param codes  codes to scan (n * code_size)
@@ -280,6 +277,7 @@ struct InvertedListScanner {
     * @param distances  heap distances (size k)
     * @param labels     heap labels (size k)
     * @param k          heap size
+     * @return number of heap updates performed
     */
    virtual size_t scan_codes (size_t n,
                               const uint8_t *codes,
@@ -287,6 +285,16 @@ struct InvertedListScanner {
                               float *distances, idx_t *labels,
                               size_t k) const = 0;
+    /** scan a set of codes, compute distances to current query and
+     * update results if distances are below radius
+     *
+     * (default implementation fails) */
+    virtual void scan_codes_range (size_t n,
+                                   const uint8_t *codes,
+                                   const idx_t *ids,
+                                   float radius,
+                                   RangeQueryResult &result) const;
    virtual ~InvertedListScanner () {}
 };

--- a/IndexIVFFlat.cpp
+++ b/IndexIVFFlat.cpp
@@ -137,6 +137,25 @@ struct IVFFlatScanner: InvertedListScanner {
        return nup;
    }
+    /// Scan one inverted list and add to res every entry whose distance to
+    /// the current query passes the test C::cmp (radius, dis).
+    /// The codes are read as list_size consecutive rows of d floats.
+    void scan_codes_range (size_t list_size,
+                           const uint8_t *codes,
+                           const idx_t *ids,
+                           float radius,
+                           RangeQueryResult & res) const override
+    {
+        const float *vecs = (const float*)codes;
+        for (size_t ofs = 0; ofs < list_size; ofs++) {
+            const float *y = vecs + d * ofs;
+            float dis;
+            if (metric == METRIC_INNER_PRODUCT) {
+                dis = fvec_inner_product (xi, y, d);
+            } else {
+                dis = fvec_L2sqr (xi, y, d);
+            }
+            if (!C::cmp (radius, dis)) {
+                continue;
+            }
+            // with store_pairs the reported id is built from the list
+            // number and the offset in the list; ids is not read then
+            long id = store_pairs ? (list_no << 32 | ofs) : ids[ofs];
+            res.add (dis, id);
+        }
+    }
 };
@@ -168,57 +187,6 @@ InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
 }
-void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
-                                 RangeSearchResult *result) const
-{
-    idx_t * keys = new idx_t [nx * nprobe];
-    ScopeDeleter<idx_t> del (keys);
-    quantizer->assign (nx, x, keys, nprobe);
-#pragma omp parallel
-    {
-        RangeSearchPartialResult pres(result);
-        for (size_t i = 0; i < nx; i++) {
-            const float * xi = x + i * d;
-            const long * keysi = keys + i * nprobe;
-            RangeSearchPartialResult::QueryResult & qres =
-                pres.new_result (i);
-            for (size_t ik = 0; ik < nprobe; ik++) {
-                long key = keysi[ik];  /* select the list  */
-                if (key < 0 || key >= (long) nlist) {
-                    fprintf (stderr, "Invalid key=%ld  at ik=%ld nlist=%ld\n",
-                             key, ik, nlist);
-                    throw;
-                }
-                const size_t list_size = invlists->list_size(key);
-                InvertedLists::ScopedCodes scodes (invlists, key);
-                const float * list_vecs = (const float*)scodes.get();
-                InvertedLists::ScopedIds ids (invlists, key);
-                for (size_t j = 0; j < list_size; j++) {
-                    const float * yj = list_vecs + d * j;
-                    if (metric_type == METRIC_L2) {
-                        float disij = fvec_L2sqr (xi, yj, d);
-                        if (disij < radius) {
-                            qres.add (disij, ids[j]);
-                        }
-                    } else if (metric_type == METRIC_INNER_PRODUCT) {
-                        float disij = fvec_inner_product(xi, yj, d);
-                        if (disij > radius) {
-                            qres.add (disij, ids[j]);
-                        }
-                    }
-                }
-            }
-        }
-        pres.finalize ();
-    }
-}
 void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
 {
@@ -272,18 +240,6 @@ IndexIVFFlatDedup::IndexIVFFlatDedup (
    IndexIVFFlat (quantizer, d, nlist_, metric_type)
 {}
-// from Python's stringobject.c
-static uint64_t hash_bytes (const uint8_t *bytes, long n) {
-    const uint8_t *p = bytes;
-    uint64_t x = (uint64_t)(*p) << 7;
-    long len = n;
-    while (--len >= 0) {
-        x = (1000003*x) ^ *p++;
-    }
-    x ^= n;
-    return x;
-}
 void IndexIVFFlatDedup::train(idx_t n, const float* x)
 {

--- a/IndexIVFFlat.h
+++ b/IndexIVFFlat.h
@@ -39,24 +39,10 @@ struct IndexIVFFlat: IndexIVF {
                        const idx_t *list_nos,
                        uint8_t * codes) const override;
-    /*
-    void search_preassigned (idx_t n, const float *x, idx_t k,
-                             const idx_t *assign,
-                             const float *centroid_dis,
-                             float *distances, idx_t *labels,
-                             bool store_pairs,
-                             const IVFSearchParameters *params=nullptr
-                             ) const override;
-    */
    InvertedListScanner *get_InvertedListScanner (bool store_pairs)
        const override;
-    void range_search(
-        idx_t n,
-        const float* x,
-        float radius,
-        RangeSearchResult* result) const override;
    /** Update a subset of vectors.
     *
     * The index must have a direct_map

--- a/IndexPQ.cpp
+++ b/IndexPQ.cpp
@@ -796,7 +796,7 @@ struct MinSumK {
            // enqueue followers
            long ii = ti;
            for (int m = 0; m < M; m++) {
-                long n = ii & ((1 << nbit) - 1);
+                long n = ii & ((1L << nbit) - 1);
                ii >>= nbit;
                if (n + 1 >= N) continue;
@@ -819,8 +819,8 @@ struct MinSumK {
            }
            long ti = 0;
            for (int m = 0; m < M; m++) {
-                long n = ii & ((1 << nbit) - 1);
+                long n = ii & ((1L << nbit) - 1);
-                ti += ssx[m].get_ord(n) << (nbit * m);
+                ti += long(ssx[m].get_ord(n)) << (nbit * m);
                ii >>= nbit;
            }
            terms[k] = ti;

--- a/IndexReplicas.cpp
+++ b/IndexReplicas.cpp
+/**
+ * Copyright (c) 2015-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD+Patents license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include "IndexReplicas.h"
+#include "FaissAssert.h"
+namespace faiss {
+/// Builds an empty replica set. own_fields starts out false, so the
+/// destructor will not delete the replicas unless the flag is switched on.
+template<class IndexClass>
+IndexReplicasTemplate<IndexClass>::IndexReplicasTemplate():
+    own_fields(false)
+{}
+/// Deletes the attached replicas, but only when own_fields is set.
+template<class IndexClass>
+IndexReplicasTemplate<IndexClass>::~IndexReplicasTemplate() {
+  if (!own_fields) {
+    return;
+  }
+  for (auto& entry : this->indices_) {
+    delete entry.first;
+  }
+}
+/// Attaches `index` as a new replica, paired with a freshly created
+/// WorkerThread. The first replica defines d, ntotal, verbose, is_trained
+/// and metric_type of this object; every later replica is checked against
+/// the first one on d, ntotal and metric_type.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::addIndex(IndexClass* index) {
+  if (indices_.empty()) {
+    // First replica: adopt its parameters as our own
+    // FIXME: this is a little bit weird
+    this->d = index->d;
+    this->ntotal = index->ntotal;
+    this->verbose = index->verbose;
+    this->is_trained = index->is_trained;
+    this->metric_type = index->metric_type;
+  } else {
+    // Later replicas: make sure the parameters agree with the first one
+    auto& existing = indices_.front().first;
+    FAISS_THROW_IF_NOT_FMT(index->d == existing->d,
+                           "IndexReplicas::addIndex: dimension mismatch for "
+                           "newly added index; prior index has dim %d, "
+                           "new index has %d",
+                           existing->d, index->d);
+    FAISS_THROW_IF_NOT_FMT(index->ntotal == existing->ntotal,
+                           "IndexReplicas::addIndex: newly added index does "
+                           "not have same number of vectors as prior index; "
+                           "prior index has %ld vectors, new index has %ld",
+                           existing->ntotal, index->ntotal);
+    FAISS_THROW_IF_NOT_MSG(index->metric_type == existing->metric_type,
+                           "IndexReplicas::addIndex: newly added index is "
+                           "of different metric type than old index");
+  }
+  std::unique_ptr<WorkerThread> worker(new WorkerThread);
+  this->indices_.emplace_back(std::make_pair(index, std::move(worker)));
+}
+/// Detaches `index`: its worker thread is stopped and waited for, then the
+/// entry is erased. The index object itself is not deleted here.
+/// Reports an error through FAISS_THROW_MSG when `index` is not a replica.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::removeIndex(IndexClass* index) {
+  auto it = this->indices_.begin();
+  while (it != indices_.end() && it->first != index) {
+    ++it;
+  }
+  if (it != indices_.end()) {
+    // This is our index; stop the worker thread before removing it,
+    // to ensure that it has finished before function exit
+    it->second->stop();
+    it->second->waitForThreadExit();
+    this->indices_.erase(it);
+    return;
+  }
+  // could not find our index
+  FAISS_THROW_MSG("IndexReplicas::removeIndex: index not found");
+}
+/// Queues `f` on the worker thread of every replica, then blocks until
+/// each queued call has completed. Requires at least one replica.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::runOnIndex(std::function<void(IndexClass*)> f) {
+  FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
+  std::vector<std::future<bool>> pending;
+  for (auto& entry : this->indices_) {
+    auto target = entry.first;
+    auto task = [target, f](){ f(target); };
+    pending.emplace_back(entry.second->add(std::move(task)));
+  }
+  // Blocking wait for completion
+  for (auto& result : pending) {
+    result.get();
+  }
+}
+/// Resets every replica (on its worker thread), then zeroes our own ntotal.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::reset() {
+  auto resetOne = [](IndexClass* replica){ replica->reset(); };
+  runOnIndex(resetOne);
+  this->ntotal = 0;
+}
+/// Trains every replica on the same n training vectors stored at x.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::train(idx_t n, const component_t* x) {
+  auto trainOne = [n, x](IndexClass* replica){ replica->train(n, x); };
+  runOnIndex(trainOne);
+}
+/// Adds the same n vectors to every replica, then bumps our ntotal by n.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::add(idx_t n, const component_t* x) {
+  auto addOne = [n, x](IndexClass* replica){ replica->add(n, x); };
+  runOnIndex(addOne);
+  this->ntotal += n;
+}
+/// Reconstructs vector number `n` into `x`, delegating to the first
+/// replica only. Requires at least one replica.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::reconstruct(idx_t n, component_t* x) const {
+  FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
+  auto firstReplica = indices_.front().first;
+  firstReplica->reconstruct (n, x);
+}
+/// Splits the n queries into contiguous slices of ceil(n / #replicas)
+/// queries, hands each slice to a different replica's worker thread, and
+/// blocks until every slice has been processed. Each replica writes
+/// straight into the caller's `distances` / `labels` arrays (k entries
+/// per query). Requires at least one replica; returns at once if n == 0.
+template<class IndexClass>
+void IndexReplicasTemplate<IndexClass>::search(
+              idx_t n,
+              const component_t* x,
+              idx_t k,
+              distance_t* distances,
+              idx_t* labels) const {
+  FAISS_THROW_IF_NOT_MSG(!indices_.empty(), "no replicas in index");
+  if (n == 0) {
+    return;
+  }
+  auto dim = indices_.front().first->d;
+  // Components per query vector: ceil(dim / 8) when component_t is one
+  // byte wide, dim otherwise. Loop-invariant, so computed once.
+  const size_t components_per_vec =
+    sizeof(component_t) == 1 ? (dim + 7) / 8 : dim;
+  // Convert the replica count to idx_t once, so that the partition
+  // arithmetic below stays signed. (The former cast only applied to the
+  // numerator and the division silently fell back to unsigned.)
+  const idx_t numReplicas = (idx_t) indices_.size();
+  // Partition the query by the number of indices we have
+  const idx_t queriesPerIndex = (n + numReplicas - 1) / numReplicas;
+  FAISS_ASSERT(n / queriesPerIndex <= numReplicas);
+  std::vector<std::future<bool>> v;
+  for (idx_t i = 0; i < numReplicas; ++i) {
+    const idx_t base = i * queriesPerIndex;
+    if (base >= n) {
+      // fewer slices than replicas: the remaining replicas stay idle
+      break;
+    }
+    const idx_t numForIndex = std::min(queriesPerIndex, n - base);
+    auto queryStart = x + base * components_per_vec;
+    auto distancesStart = distances + base * k;
+    auto labelsStart = labels + base * k;
+    auto indexPtr = indices_[i].first;
+    auto fn =
+      [indexPtr, numForIndex, queryStart, k, distancesStart, labelsStart]() {
+        indexPtr->search(numForIndex, queryStart,
+                         k, distancesStart, labelsStart);
+      };
+    v.emplace_back(indices_[i].second->add(std::move(fn)));
+  }
+  // Blocking wait for completion
+  for (auto& f : v) {
+    f.get();
+  }
+}
+// explicit instantiations
+template struct IndexReplicasTemplate<Index>;
+template struct IndexReplicasTemplate<IndexBinary>;
+} // namespace
--- a/IndexReplicas.h
+++ b/IndexReplicas.h
+/**
+ * Copyright (c) 2015-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD+Patents license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include "Index.h"
+#include "IndexBinary.h"
+#include "WorkerThread.h"
+#include <memory>
+#include <vector>
+namespace faiss {
+/// Takes individual index instances (replicas), splits a batch of queries
+/// into slices, sends one slice to each replica, and joins the results
+/// together when done.
+/// Each replica is paired with its own WorkerThread.
+template<class IndexClass>
+class IndexReplicasTemplate : public IndexClass {
+ public:
+  using idx_t = typename IndexClass::idx_t;
+  using component_t = typename IndexClass::component_t;
+  using distance_t = typename IndexClass::distance_t;
+  IndexReplicasTemplate();
+  ~IndexReplicasTemplate() override;
+  /// Adds an index that is managed by ourselves.
+  /// WARNING: once an index is added to this object, it becomes unsafe
+  /// to touch it from any thread other than the one that manages it,
+  /// until we are shut down. Use runOnIndex to perform work on it
+  /// instead.
+  void addIndex(IndexClass* index);
+  /// Remove an index that is managed by ourselves.
+  /// This will flush all pending work on that index, and then shut
+  /// down its managing thread, and will remove the index.
+  void removeIndex(IndexClass* index);
+  /// Run a function on all indices, in the thread that the index is
+  /// managed in.
+  void runOnIndex(std::function<void(IndexClass*)> f);
+  /// faiss::Index API
+  /// All indices receive the same call
+  void reset() override;
+  /// faiss::Index API
+  /// All indices receive the same call
+  virtual void train(idx_t n, const component_t* x) override;
+  /// faiss::Index API
+  /// All indices receive the same call
+  virtual void add(idx_t n, const component_t* x) override;
+  /// faiss::Index API
+  /// Query is partitioned into a slice for each sub-index
+  /// split by ceil(n / #indices) for our sub-indices
+  virtual void search(idx_t n,
+              const component_t* x,
+              idx_t k,
+              distance_t* distances,
+              idx_t* labels) const override;
+  /// reconstructs from the first index
+  virtual void reconstruct(idx_t, component_t *v) const override;
+  /// whether the managed indices are owned by this object
+  bool own_fields;
+  /// number of managed indices
+  int count() const {return indices_.size(); }
+  /// i-th managed index (no bounds check)
+  IndexClass* at(int i) {return indices_[i].first; }
+  const IndexClass* at(int i) const {return indices_[i].first; }
+ private:
+  /// Collection of Index instances, with their managing worker thread.
+  /// Declared mutable, so const members can also use the worker threads.
+  mutable std::vector<std::pair<IndexClass*,
+                                std::unique_ptr<WorkerThread> > > indices_;
+};
+using IndexReplicas = IndexReplicasTemplate<Index>;
+using IndexBinaryReplicas = IndexReplicasTemplate<IndexBinary>;
+} // namespace
--- a/IndexScalarQuantizer.cpp
+++ b/IndexScalarQuantizer.cpp
--- a/IndexScalarQuantizer.h
+++ b/IndexScalarQuantizer.h
@@ -28,6 +28,7 @@ namespace faiss {
 * (default).
 */
+struct SQDistanceComputer;
 struct ScalarQuantizer {
@@ -37,6 +38,7 @@ struct ScalarQuantizer {
        QT_8bit_uniform,     ///< same, shared range for all dimensions
        QT_4bit_uniform,
        QT_fp16,
+        QT_8bit_direct,      /// fast indexing of uint8s
    };
    QuantizerType qtype;
@@ -79,25 +81,13 @@ struct ScalarQuantizer {
    /// decode a vector from a given code (or n vectors if third argument)
    void decode (const uint8_t *code, float *x, size_t n) const;
-    // fast, non thread-safe way of computing vector-to-code and
-    // code-to-code distances.
-    struct DistanceComputer {
-        /// vector-to-code distance computation
+    SQDistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
-        virtual float compute_distance (const float *x,
-                                        const uint8_t *code) const = 0;
-        /// code-to-code distance computation
-        virtual float compute_code_distance (const uint8_t *code1,
-                                             const uint8_t *code2) const = 0;
-        virtual ~DistanceComputer () {}
-    };
-    DistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
        const;
 };
+struct DistanceComputer;
 struct IndexScalarQuantizer: Index {
    /// Used to encode the vectors
@@ -137,6 +127,8 @@ struct IndexScalarQuantizer: Index {
    void reconstruct(idx_t key, float* recons) const override;
+    DistanceComputer *get_distance_computer () const;
 };
@@ -148,6 +140,7 @@ struct IndexScalarQuantizer: Index {
 struct IndexIVFScalarQuantizer: IndexIVF {
    ScalarQuantizer sq;
+    bool by_residual;
    IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
                            ScalarQuantizer::QuantizerType qtype,

--- a/IndexShards.cpp
+++ b/IndexShards.cpp
+/**
+ * Copyright (c) 2015-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD+Patents license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include "IndexShards.h"
+#include <cstdio>
+#include <functional>
+#include "FaissAssert.h"
+#include "Heap.h"
+#include "WorkerThread.h"
+namespace faiss {
+// subroutines
+namespace {
+typedef Index::idx_t idx_t;
+// shift every valid (non-negative) label by `translation`;
+// negative labels are left untouched
+void translate_labels (long n, idx_t *labels, long translation)
+{
+    if (translation != 0) {
+        for (long pos = 0; pos < n; pos++) {
+            if (labels[pos] >= 0) {
+                labels[pos] += translation;
+            }
+        }
+    }
+}
+/** merge result tables from several shards.
+ *
+ * For every query, a heap holding at most one candidate per shard is
+ * used to pick the k output entries; whenever a shard's candidate is
+ * consumed, the next entry of that shard's result row is pushed.
+ * NOTE(review): this presumably relies on each shard's k results being
+ * already sorted best-first -- confirm against the sub-index search().
+ *
+ * @param n              number of queries
+ * @param k              number of results per query
+ * @param nshard         number of shards
+ * @param distances      output distances, size n * k
+ * @param labels         output labels, size n * k
+ * @param all_distances  size nshard * n * k
+ * @param all_labels     idem
+ * @param translations   label translations to apply, size nshard
+ */
+template <class IndexClass, class C>
+void merge_tables (long n, long k, long nshard,
+                   typename IndexClass::distance_t *distances,
+                   idx_t *labels,
+                   const typename IndexClass::distance_t *all_distances,
+                   idx_t *all_labels,
+                   const long *translations)
+{
+    if(k == 0) {
+        return;
+    }
+    using distance_t = typename IndexClass::distance_t;
+    // distance between the result tables of two consecutive shards
+    long stride = n * k;
+#pragma omp parallel
+    {
+        // scratch buffers, allocated once per thread of the parallel region
+        std::vector<int> buf (2 * nshard);
+        // pointer[s]: position of the next unread result of shard s
+        int * pointer = buf.data();
+        // heap payload: the shard each heap value comes from
+        int * shard_ids = pointer + nshard;
+        std::vector<distance_t> buf2 (nshard);
+        distance_t * heap_vals = buf2.data();
+#pragma omp for
+        for (long i = 0; i < n; i++) {
+            // the heap maps values to the shard where they are
+            // produced.
+            const distance_t *D_in = all_distances + i * k;
+            const idx_t *I_in = all_labels + i * k;
+            int heap_size = 0;
+            // seed the heap with the first result of every shard that
+            // has a valid (non-negative) label
+            for (long s = 0; s < nshard; s++) {
+                pointer[s] = 0;
+                if (I_in[stride * s] >= 0)
+                    heap_push<C> (++heap_size, heap_vals, shard_ids,
+                                 D_in[stride * s], s);
+            }
+            distance_t *D = distances + i * k;
+            idx_t *I = labels + i * k;
+            for (int j = 0; j < k; j++) {
+                if (heap_size == 0) {
+                    // no candidate left: fill the slot with label -1 and
+                    // the neutral distance of the comparator
+                    I[j] = -1;
+                    D[j] = C::neutral();
+                } else {
+                    // pop best element
+                    int s = shard_ids[0];
+                    int & p = pointer[s];
+                    D[j] = heap_vals[0];
+                    I[j] = I_in[stride * s + p] + translations[s];
+                    heap_pop<C> (heap_size--, heap_vals, shard_ids);
+                    // advance in shard s and push its next valid result
+                    p++;
+                    if (p < k && I_in[stride * s + p] >= 0)
+                        heap_push<C> (++heap_size, heap_vals, shard_ids,
+                                     D_in[stride * s + p], s);
+                }
+            }
+        }
+    }
+}
+/** Calls f(no, index) for every entry of `indexes`, `no` being the
+ * position of the index in the vector.
+ *
+ * @param threaded  false: the calls are made one after the other in the
+ *                  calling thread; true: one WorkerThread is created per
+ *                  index, each call is queued on its own thread, and the
+ *                  function blocks until all of them have completed
+ * @param f         function to apply
+ * @param indexes   indexes to process, must not be empty (taken by const
+ *                  reference so that the vector is not copied per call)
+ */
+template<class IndexClass>
+void runOnIndexes(bool threaded,
+                  std::function<void(int no, IndexClass*)> f,
+                  const std::vector<IndexClass *> &indexes)
+{
+    FAISS_THROW_IF_NOT_MSG(!indexes.empty(), "no shards in index");
+    const int nindex = (int) indexes.size();
+    if (!threaded) {
+        for (int no = 0; no < nindex; no++) {
+            f(no, indexes[no]);
+        }
+        return;
+    }
+    std::vector<std::unique_ptr<WorkerThread> > threads;
+    std::vector<std::future<bool>> v;
+    threads.reserve(nindex);
+    v.reserve(nindex);
+    for (int no = 0; no < nindex; no++) {
+        IndexClass *index = indexes[no];
+        threads.emplace_back(new WorkerThread());
+        WorkerThread *wt = threads.back().get();
+        v.emplace_back(wt->add([no, index, f](){ f(no, index); }));
+    }
+    // Blocking wait for completion
+    for (auto& func : v) {
+        func.get();
+    }
+}
+} // anonymous namespace
+/// Creates a shard container of dimension d without any shard attached.
+/// own_fields starts out false: the shards are not deleted by default.
+template<class IndexClass>
+IndexShardsTemplate<IndexClass>::IndexShardsTemplate (
+        idx_t d, bool threaded, bool successive_ids):
+    IndexClass (d),
+    own_fields (false),
+    threaded (threaded),
+    successive_ids (successive_ids)
+{}
+/// Appends `idx` to the shard list, then refreshes this object's fields
+/// from the updated list.
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::add_shard (IndexClass *idx)
+{
+    this->shard_indexes.push_back (idx);
+    this->sync_with_shard_indexes ();
+}
+/// Refreshes d, metric_type, is_trained and ntotal from the shards: the
+/// first shard provides the reference values, the others must agree on
+/// metric_type and d, and ntotal is the sum over all shards.
+/// Does nothing when there is no shard.
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::sync_with_shard_indexes ()
+{
+    if (shard_indexes.empty()) {
+        return;
+    }
+    // the first shard provides the reference parameters
+    IndexClass * first = shard_indexes.front();
+    this->d = first->d;
+    this->metric_type = first->metric_type;
+    this->is_trained = first->is_trained;
+    this->ntotal = first->ntotal;
+    // the other shards are validated and their sizes accumulated
+    for (int s = 1; s < shard_indexes.size(); s++) {
+        IndexClass * index = shard_indexes[s];
+        FAISS_THROW_IF_NOT (this->metric_type == index->metric_type);
+        FAISS_THROW_IF_NOT (this->d == index->d);
+        this->ntotal += index->ntotal;
+    }
+}
+/// Trains every shard on the complete training set (n vectors at x),
+/// then refreshes this object's fields from the shards.
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::train (idx_t n, const component_t *x)
+{
+    auto train_one_shard = [n, x](int no, IndexClass *index) {
+        if (index->verbose) {
+            printf ("begin train shard %d on %ld points\n", no, n);
+        }
+        index->train(n, x);
+        if (index->verbose) {
+            printf ("end train shard %d\n", no);
+        }
+    };
+    runOnIndexes<IndexClass> (threaded, train_one_shard, shard_indexes);
+    sync_with_shard_indexes ();
+}
+/// Adds n vectors without caller-provided ids: forwards to add_with_ids()
+/// with a null id array.
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::add (idx_t n, const component_t *x)
+{
+    const idx_t *no_ids = nullptr;
+    add_with_ids (n, x, no_ids);
+}
+/** Distributes n vectors over the shards: shard `no` receives the slice
+ * [no * n / nshard, (no + 1) * n / nshard) of the input.
+ *
+ * - successive_ids set: xids must be null and this object must be empty
+ *   (ntotal == 0); the shards are filled with add().
+ * - otherwise the shards are filled with add_with_ids(), using xids if
+ *   given, or else the ids ntotal, ntotal + 1, ..., ntotal + n - 1.
+ *
+ * ntotal is increased by n at the end.
+ */
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::add_with_ids (idx_t n, const component_t * x, const idx_t *xids)
+{
+    FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
+                   "It makes no sense to pass in ids and "
+                   "request them to be shifted");
+    if (successive_ids) {
+        // xids is necessarily null here, the check above covers it
+        FAISS_THROW_IF_NOT_MSG(this->ntotal == 0,
+                       "when adding to IndexShards with sucessive_ids, "
+                       "only add() in a single pass is supported");
+    }
+    long nshard = shard_indexes.size();
+    const idx_t *ids = xids;
+    ScopeDeleter<idx_t> del;
+    if (!ids && !successive_ids) {
+        // no ids provided: generate sequential ids starting at ntotal
+        idx_t *aids = new idx_t[n];
+        for (idx_t i = 0; i < n; i++)
+            aids[i] = this->ntotal + i;
+        ids = aids;
+        del.set (ids);
+    }
+    // components per vector: ceil(d / 8) when component_t is one byte
+    // wide, d otherwise
+    size_t components_per_vec =
+        sizeof(component_t) == 1 ? (this->d + 7) / 8 : this->d;
+    auto add_func = [n, ids, x, nshard, components_per_vec]
+        (int no, IndexClass *index) {
+        // slice of the input handled by this shard
+        idx_t i0 = no * n / nshard;
+        idx_t i1 = (no + 1) * n / nshard;
+        auto x0 = x + i0 * components_per_vec;
+        if (index->verbose) {
+            printf ("begin add shard %d on %ld points\n", no, n);
+        }
+        if (ids) {
+            index->add_with_ids (i1 - i0, x0, ids + i0);
+        } else {
+            index->add (i1 - i0, x0);
+        }
+        if (index->verbose) {
+            printf ("end add shard %d on %ld points\n", no, i1 - i0);
+        }
+    };
+    runOnIndexes<IndexClass> (threaded, add_func, shard_indexes);
+    this->ntotal += n;
+}
+/// Resets every shard in turn, then refreshes this object's fields from
+/// the shards.
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::reset ()
+{
+    for (IndexClass *shard : shard_indexes) {
+        shard->reset ();
+    }
+    sync_with_shard_indexes ();
+}
+/** Runs the n queries on every shard, then merges the per-shard result
+ * tables into `distances` / `labels` (k entries per query).
+ *
+ * When successive_ids is set, the labels coming from shard s are shifted
+ * by the total ntotal of the shards that precede it. The merge keeps the
+ * CMin ordering for METRIC_L2 and the CMax ordering otherwise.
+ */
+template<class IndexClass>
+void IndexShardsTemplate<IndexClass>::search (
+           idx_t n, const component_t *x, idx_t k,
+           distance_t *distances, idx_t *labels) const
+{
+    long nshard = shard_indexes.size();
+    // Each buffer gets its ScopeDeleter right after allocation, so that
+    // the first one is not leaked if the second allocation throws.
+    distance_t *all_distances = new distance_t [nshard * k * n];
+    ScopeDeleter<distance_t> del (all_distances);
+    idx_t *all_labels = new idx_t [nshard * k * n];
+    ScopeDeleter<idx_t> del2 (all_labels);
+    auto query_func = [n, k, x, all_distances, all_labels]
+        (int no, IndexClass *index) {
+        if (index->verbose) {
+            printf ("begin query shard %d on %ld points\n", no, n);
+        }
+        // shard `no` writes into its own k * n slice of the tables
+        index->search (n, x, k,
+                       all_distances + no * k * n,
+                       all_labels + no * k * n);
+        if (index->verbose) {
+            printf ("end query shard %d\n", no);
+        }
+    };
+    runOnIndexes<IndexClass> (threaded, query_func, shard_indexes);
+    // label offset of each shard; all zero unless successive_ids is set
+    std::vector<long> translations (nshard, 0);
+    if (successive_ids) {
+        translations[0] = 0;
+        for (int s = 0; s + 1 < nshard; s++)
+            translations [s + 1] = translations [s] +
+                shard_indexes [s]->ntotal;
+    }
+    if (this->metric_type == METRIC_L2) {
+        merge_tables<IndexClass, CMin<distance_t, int> > (
+             n, k, nshard, distances, labels,
+             all_distances, all_labels, translations.data ());
+    } else {
+        merge_tables<IndexClass, CMax<distance_t, int> > (
+             n, k, nshard, distances, labels,
+             all_distances, all_labels, translations.data ());
+    }
+}
+/// Deletes the shards, but only when own_fields is set.
+template<class IndexClass>
+IndexShardsTemplate<IndexClass>::~IndexShardsTemplate ()
+{
+    if (!own_fields) {
+        return;
+    }
+    for (IndexClass *shard : shard_indexes) {
+        delete shard;
+    }
+}
+// explicit instantiations
+template struct IndexShardsTemplate<Index>;
+template struct IndexShardsTemplate<IndexBinary>;
+} // namespace faiss
--- a/IndexShards.h
+++ b/IndexShards.h
+/**
+ * Copyright (c) 2015-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD+Patents license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#pragma once
+#include <vector>
+#include "Index.h"
+#include "IndexBinary.h"
+namespace faiss {
+/** Index that concatenates the results from several sub-indexes
+ *
+ * Added vectors are distributed over the sub-indexes (shards); a search
+ * queries every shard and merges the per-shard results.
+ */
+template<class IndexClass>
+struct IndexShardsTemplate : IndexClass {
+    using idx_t = typename IndexClass::idx_t;
+    using component_t = typename IndexClass::component_t;
+    using distance_t = typename IndexClass::distance_t;
+    /// the sub-indexes, in the order in which they were added
+    std::vector<IndexClass*> shard_indexes;
+    bool own_fields;      /// should the sub-indexes be deleted along with this?
+    /// use one thread per sub-index instead of processing them in sequence
+    bool threaded;
+    /// shift the ids returned by each sub-index by the size of the
+    /// sub-indexes that precede it
+    bool successive_ids;
+    /**
+     * @param threaded     do we use one thread per sub_index or do
+     *                     queries sequentially?
+     * @param successive_ids should we shift the returned ids by
+     *                     the size of each sub-index or return them
+     *                     as they are?
+     */
+    explicit IndexShardsTemplate (idx_t d, bool threaded = false,
+                                  bool successive_ids = true);
+    /// append a sub-index and re-synchronize with the shard list
+    void add_shard (IndexClass *);
+    // update d, metric_type, is_trained and ntotal. Call if you change
+    // something in the shard indexes.
+    void sync_with_shard_indexes ();
+    /// i-th sub-index (no bounds check)
+    IndexClass *at(int i) {return shard_indexes[i]; }
+    /// supported only for sub-indices that implement add_with_ids
+    void add(idx_t n, const component_t* x) override;
+    /**
+     * Cases (successive_ids, xids):
+     * - true, non-NULL       ERROR: it makes no sense to pass in ids and
+     *                        request them to be shifted
+     * - true, NULL           OK, but should be called only once (calls add()
+     *                        on sub-indexes).
+     * - false, non-NULL      OK: will call add_with_ids with passed in xids
+     *                        distributed evenly over shards
+     * - false, NULL          OK: will call add_with_ids on each sub-index,
+     *                        starting at ntotal
+     */
+    void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
+    /// queries all sub-indexes and merges their results
+    void search(
+        idx_t n, const component_t* x, idx_t k,
+        distance_t* distances, idx_t* labels) const override;
+    /// trains every sub-index on the same training set
+    void train(idx_t n, const component_t* x) override;
+    /// resets every sub-index
+    void reset() override;
+    ~IndexShardsTemplate() override;
+};
+using IndexShards = IndexShardsTemplate<Index>;
+using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
+} // namespace faiss
--- a/InvertedLists.cpp
+++ b/InvertedLists.cpp
@@ -97,6 +97,34 @@ void InvertedLists::merge_from (InvertedLists *oivf, size_t add_id) {
    }
 }
+/// Collects the size of each of the nlist lists into a histogram and
+/// returns faiss::imbalance_factor() of that histogram.
+double InvertedLists::imbalance_factor () const {
+    std::vector<int> list_sizes;
+    list_sizes.reserve(nlist);
+    for (size_t list_no = 0; list_no < nlist; list_no++) {
+        list_sizes.push_back(list_size(list_no));
+    }
+    return faiss::imbalance_factor(nlist, list_sizes.data());
+}
+/// Prints a histogram of the list sizes in power-of-two buckets: a list
+/// of size s is counted in the first bucket j (0 <= j < 40) for which
+/// s < 2^j; larger lists are not counted. Empty buckets are not printed.
+void InvertedLists::print_stats () const {
+    std::vector<int> sizes(40);
+    for (size_t i = 0; i < nlist; i++) {
+        // query the size once per list rather than once per bucket
+        const auto sz = list_size(i);
+        for (size_t j = 0; j < sizes.size(); j++) {
+            if ((sz >> j) == 0) {
+                sizes[j]++;
+                break;
+            }
+        }
+    }
+    for (size_t i = 0; i < sizes.size(); i++) {
+        if (sizes[i]) {
+            // bucket bounds go up to 2^39: shift a long, since `1 << i`
+            // overflows int from i = 31 on
+            printf("list size in < %ld: %d instances\n", 1L << i, sizes[i]);
+        }
+    }
+}
 /*****************************************
 * ArrayInvertedLists implementation
 ******************************************/

--- a/InvertedLists.h
+++ b/InvertedLists.h
@@ -101,6 +101,16 @@ struct InvertedLists {
    virtual ~InvertedLists ();
+    /*************************
+     * statistics            */
+    /// 1= perfectly balanced, >1: imbalanced
+    double imbalance_factor () const;
+    /// display some stats about the inverted lists
+    void print_stats () const;
    /**************************************
     * Scoped inverted lists (for automatic deallocation)
     *

--- a/MetaIndexes.cpp
+++ b/MetaIndexes.cpp
--- a/MetaIndexes.h
+++ b/MetaIndexes.h
@@ -11,13 +11,10 @@
 #ifndef META_INDEXES_H
 #define META_INDEXES_H
 #include <vector>
 #include <unordered_map>
 #include "Index.h"
+#include "IndexShards.h"
 namespace faiss {
@@ -78,65 +75,6 @@ struct IndexIDMap2 : IndexIDMap {
    IndexIDMap2 () {}
 };
-/** Index that concatenates the results from several sub-indexes
- *
- */
-struct IndexShards : Index {
-    std::vector<Index*> shard_indexes;
-    bool own_fields;      /// should the sub-indexes be deleted along with this?
-    bool threaded;
-    bool successive_ids;
-    /**
-     * @param threaded     do we use one thread per sub_index or do
-     *                     queries sequentially?
-     * @param successive_ids should we shift the returned ids by
-     *                     the size of each sub-index or return them
-     *                     as they are?
-     */
-    explicit IndexShards (idx_t d, bool threaded = false,
-                         bool successive_ids = true);
-    void add_shard (Index *);
-    // update metric_type and ntotal. Call if you changes something in
-    // the shard indexes.
-    void sync_with_shard_indexes ();
-    Index *at(int i) {return shard_indexes[i]; }
-    /// supported only for sub-indices that implement add_with_ids
-    void add(idx_t n, const float* x) override;
-    /**
-     * Cases (successive_ids, xids):
-     * - true, non-NULL       ERROR: it makes no sense to pass in ids and
-     *                        request them to be shifted
-     * - true, NULL           OK, but should be called only once (calls add()
-     *                        on sub-indexes).
-     * - false, non-NULL      OK: will call add_with_ids with passed in xids
-     *                        distributed evenly over shards
-     * - false, NULL          OK: will call add_with_ids on each sub-index,
-     *                        starting at ntotal
-     */
-    void add_with_ids(idx_t n, const float* x, const long* xids) override;
-    void search(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels) const override;
-    void train(idx_t n, const float* x) override;
-    void reset() override;
-    ~IndexShards() override;
-};
 /** splits input vectors in segments and assigns each segment to a sub-index
 * used to distribute a MultiIndexQuantizer
 */

--- a/ProductQuantizer.cpp
+++ b/ProductQuantizer.cpp
@@ -379,10 +379,74 @@ void ProductQuantizer::compute_code_from_distance_table (const float *tab,
    }
 }
+/** Encodes n vectors, using assign_index to assign the sub-vectors.
+ *
+ * For each sub-quantizer m, assign_index is reset and filled with the
+ * ksub centroids of m; the m-th sub-vectors of the input are then
+ * assigned by blocks of at most 65536 vectors, and the resulting indices
+ * are written to the m-th slot of each code.
+ *
+ * @param x      input vectors, size n * d
+ * @param codes  output codes, size n * code_size
+ * @param n      number of vectors
+ */
+void ProductQuantizer::compute_codes_with_assign_index (
+                const float * x,
+                uint8_t * codes,
+                size_t n)
+{
+    FAISS_THROW_IF_NOT (assign_index && assign_index->d == dsub);
+    // scratch buffers: allocated once and shared by all sub-quantizers
+    // and blocks (they do not depend on m)
+    size_t bs = 65536;
+    float * xslice = new float[bs * dsub];
+    ScopeDeleter<float> del (xslice);
+    idx_t *assign = new idx_t[bs];
+    ScopeDeleter<idx_t> del2 (assign);
+    for (size_t m = 0; m < M; m++) {
+        assign_index->reset ();
+        assign_index->add (ksub, get_centroids (m, 0));
+        for (size_t i0 = 0; i0 < n; i0 += bs) {
+            size_t i1 = std::min(i0 + bs, n);
+            // gather the m-th sub-vector of each vector of the block
+            for (size_t i = i0; i < i1; i++) {
+                memcpy (xslice + (i - i0) * dsub,
+                        x + i * d + m * dsub,
+                        dsub * sizeof(float));
+            }
+            assign_index->assign (i1 - i0, xslice, assign);
+            // scatter the assignments into the codes.
+            // NOTE(review): nothing is written for a byte_per_idx other
+            // than 1 or 2 -- confirm that no other value can occur.
+            switch (byte_per_idx) {
+            case 1:
+                {
+                    uint8_t *c = codes + code_size * i0 + m;
+                    for (size_t i = i0; i < i1; i++) {
+                        *c = assign[i - i0];
+                        c += M;
+                    }
+                }
+                break;
+           case 2:
+               {
+                   uint16_t *c = (uint16_t*)(codes + code_size * i0 + m * 2);
+                   for (size_t i = i0; i < i1; i++) {
+                       *c = assign[i - i0];
+                       c += M;
+                   }
+               }
+               break;
+            }
+        }
+    }
+}
 void ProductQuantizer::compute_codes (const float * x,
                                      uint8_t * codes,
                                      size_t n)  const
 {
+    // process by blocks to avoid using too much RAM
+    size_t bs = 256 * 1024;
+    if (n > bs) {
+        for (size_t i0 = 0; i0 < n; i0 += bs) {
+            size_t i1 = std::min(i0 + bs, n);
+            compute_codes (x + d * i0, codes + code_size * i0, i1 - i0);
+        }
+        return;
+    }
    if (dsub < 16) { // simple direct computation
 #pragma omp parallel for
@@ -525,15 +589,6 @@ static void pq_knn_search_with_tables (
    }
 }
-    /*
-static inline void pq_estimators_from_tables (const ProductQuantizer * pq,
-                                              const CT * codes,
-                                              size_t ncodes,
-                                              const float * dis_table,
-                                              size_t k,
-                                              float * heap_dis,
-                                              long * heap_ids)
-    */
 void ProductQuantizer::search (const float * __restrict x,
                               size_t nx,
                               const uint8_t * codes,

--- a/ProductQuantizer.h
+++ b/ProductQuantizer.h
@@ -23,6 +23,8 @@ namespace faiss {
 /** Product Quantizer. Implemented only for METRIC_L2 */
 struct ProductQuantizer {
+    using idx_t = Index::idx_t;
    size_t d;              ///< size of the input vectors
    size_t M;              ///< number of subquantizers
    size_t nbits;          ///< number of bits per quantization index
@@ -86,6 +88,13 @@ struct ProductQuantizer {
                        uint8_t * codes,
                        size_t n) const ;
+    /// speed up code assignment using assign_index
+    /// (non-const because the index is changed)
+    void compute_codes_with_assign_index (
+                const float * x,
+                uint8_t * codes,
+                size_t n);
    /// decode a vector from a given code (or n vectors if third argument)
    void decode (const uint8_t *code, float *x) const;
    void decode (const uint8_t *code, float *x, size_t n) const;

--- a/VectorTransform.cpp
+++ b/VectorTransform.cpp
@@ -239,7 +239,7 @@ void RandomRotationMatrix::init (int seed)
    is_trained = true;
 }
-void RandomRotationMatrix::train (Index::idx_t n, const float *x)
+void RandomRotationMatrix::train (Index::idx_t /*n*/, const float */*x*/)
 {
    // initialize with some arbitrary seed
    init (12345);
@@ -671,11 +671,11 @@ void OPQMatrix::train (Index::idx_t n, const float *x)
        xproj (d2 * n), pq_recons (d2 * n), xxr (d * n),
        tmp(d * d * 4);
-    std::vector<uint8_t> codes (M * n);
    ProductQuantizer pq_default (d2, M, 8);
-    ProductQuantizer &pq_regular =
+    ProductQuantizer &pq_regular = pq ? *pq : pq_default;
-        pq ? *pq : pq_default;
+    std::vector<uint8_t> codes (pq_regular.code_size * n);
    double t0 = getmillisecs();
    for (int iter = 0; iter < niter; iter++) {
@@ -691,10 +691,18 @@ void OPQMatrix::train (Index::idx_t n, const float *x)
        pq_regular.cp.max_points_per_centroid = 1000;
        pq_regular.cp.niter = iter == 0 ? niter_pq_0 : niter_pq;
-        pq_regular.cp.verbose = verbose;
+        pq_regular.verbose = verbose;
        pq_regular.train (n, xproj.data());
+        if (verbose) {
+            printf("    encode / decode\n");
+        }
+        if (pq_regular.assign_index) {
+            pq_regular.compute_codes_with_assign_index
+                (xproj.data(), codes.data(), n);
+        } else {
            pq_regular.compute_codes (xproj.data(), codes.data(), n);
+        }
        pq_regular.decode (codes.data(), pq_recons.data(), n);
        float pq_err = fvec_L2sqr (pq_recons.data(), xproj.data(), n * d2) / n;
@@ -710,6 +718,9 @@ void OPQMatrix::train (Index::idx_t n, const float *x)
            FINTEGER di = d, d2i = d2, ni = n;
            float one = 1, zero = 0;
+            if (verbose) {
+                printf("    X * recons\n");
+            }
            // torch.mm(xtrain:t(), pq_recons)
            sgemm_ ("Not", "Transposed",
                    &d2i, &di, &ni,
@@ -788,6 +799,58 @@ void NormalizationTransform::reverse_transform (idx_t n, const float* xt,
    memcpy (x, xt, sizeof (xt[0]) * n * d_in);
 }
+/*********************************************
+ * CenteringTransform
+ *********************************************/
+/// Build a centering transform whose input and output dimension are both d.
+CenteringTransform::CenteringTransform (int d)
+    : VectorTransform (d, d)
+{
+    // apply_noalloc / reverse_transform refuse to run until train() is called
+    is_trained = false;
+}
+/// Estimate the per-component mean from n training vectors.
+/// x is read as n consecutive rows of d_in floats.
+/// Throws (FAISS_THROW_IF_NOT_MSG) when n <= 0.
+void CenteringTransform::train(Index::idx_t n, const float *x) {
+    FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector");
+    // assign() instead of resize(): resize() leaves existing entries
+    // untouched, so calling train() a second time would accumulate the new
+    // sums on top of the previously computed mean.
+    mean.assign (d_in, 0);
+    for (idx_t i = 0; i < n; i++) {
+        for (size_t j = 0; j < d_in; j++) {
+            mean[j] += *x++;
+        }
+    }
+    // turn the per-component sums into averages
+    for (size_t j = 0; j < d_in; j++) {
+        mean[j] /= n;
+    }
+    is_trained = true;
+}
+/// Write x - mean into xt for each of the n input vectors (d_in floats each).
+/// Throws if the transform has not been trained.
+void CenteringTransform::apply_noalloc
+      (idx_t n, const float* x, float* xt) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    const float *src = x;
+    float *dst = xt;
+    for (idx_t row = 0; row < n; row++) {
+        for (size_t col = 0; col < d_in; col++) {
+            dst[col] = src[col] - mean[col];
+        }
+        // advance both cursors to the next vector
+        src += d_in;
+        dst += d_in;
+    }
+}
+/// Inverse of apply_noalloc: write xt + mean into x for each of the n vectors.
+/// Throws if the transform has not been trained.
+void CenteringTransform::reverse_transform (idx_t n, const float* xt,
+                                                float* x) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    const float *src = xt;
+    float *dst = x;
+    for (idx_t row = 0; row < n; row++) {
+        for (size_t col = 0; col < d_in; col++) {
+            dst[col] = src[col] + mean[col];
+        }
+        // advance both cursors to the next vector
+        src += d_in;
+        dst += d_in;
+    }
+}
 /*********************************************
 * IndexPreTransform
 *********************************************/
@@ -956,6 +1019,16 @@ void IndexPreTransform::search (idx_t n, const float *x, idx_t k,
    index->search (n, xt, k, distances, labels);
 }
+/// Range search on the wrapped index: the queries are passed through
+/// apply_chain first, the radius is forwarded unchanged.
+void IndexPreTransform::range_search (idx_t n, const float* x, float radius,
+                                      RangeSearchResult* result) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    const float *transformed = apply_chain (n, x);
+    // when apply_chain returned x itself there is no buffer of ours to hand
+    // to the ScopeDeleter
+    const float *owned = nullptr;
+    if (transformed != x) {
+        owned = transformed;
+    }
+    ScopeDeleter<float> guard (owned);
+    index->range_search (n, transformed, radius, result);
+}
 void IndexPreTransform::reset () {
    index->reset();

--- a/VectorTransform.h
+++ b/VectorTransform.h
@@ -246,6 +246,25 @@ struct NormalizationTransform: VectorTransform {
    void reverse_transform(idx_t n, const float* xt, float* x) const override;
 };
+/** Subtract the mean of each component from the vectors.
+ *
+ * The mean is estimated by train(); reverse_transform() adds it back.
+ */
+struct CenteringTransform: VectorTransform {
+    /// Per-component mean, size d_in = d_out
+    std::vector<float> mean;
+    /// d: dimension of both the input and the output vectors
+    explicit CenteringTransform (int d = 0);
+    /// train on n vectors (x holds n * d_in floats): computes the mean
+    void train(Index::idx_t n, const float* x) override;
+    /// subtract the mean from each of the n vectors of x, result in xt
+    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+    /// add the mean to each of the n vectors of xt, result in x
+    void reverse_transform (idx_t n, const float * xt,
+                            float *x) const override;
+};
 /** Index that applies a LinearTransform transform on vectors before
@@ -285,6 +304,12 @@ struct IndexPreTransform: Index {
        float* distances,
        idx_t* labels) const override;
+    /* range search; no attempt is made to change the radius, it is
+     * passed on as given */
+    void range_search (idx_t n, const float* x, float radius,
+                       RangeSearchResult* result) const override;
    void reconstruct (idx_t key, float * recons) const override;
    void reconstruct_n (idx_t i0, idx_t ni, float *recons)

--- a/WorkerThread.cpp
+++ b/WorkerThread.cpp
+/**
+ * Copyright (c) 2015-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD+Patents license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include "WorkerThread.h"
+#include "FaissAssert.h"
+namespace faiss {
+/// Starts the worker thread and returns only once it is processing tasks.
+WorkerThread::WorkerThread() :
+    wantStop_(false) {
+  startThread();
+  // Make sure that the thread has started before continuing:
+  // queue an empty task and block on its future until it has been run
+  add([](){}).get();
+}
+/// Requests the worker to stop, then joins it before the members it uses
+/// are destroyed.
+WorkerThread::~WorkerThread() {
+  stop();
+  waitForThreadExit();
+}
+/// Launches the std::thread that runs threadMain() on this object.
+void
+WorkerThread::startThread() {
+  thread_ = std::thread([this](){ threadMain(); });
+}
+/// Asks the worker to stop: sets wantStop_ under mutex_ and wakes the
+/// worker in case it is waiting on monitor_. Does not wait for the thread
+/// to exit (see waitForThreadExit()).
+void
+WorkerThread::stop() {
+  std::lock_guard<std::mutex> guard(mutex_);
+  wantStop_ = true;
+  monitor_.notify_one();
+}
+/// Queue f for execution on the worker thread.
+/// The returned future yields true once f has been run, or false
+/// immediately if a stop was already requested and f was not queued.
+std::future<bool>
+WorkerThread::add(std::function<void()> f) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  std::promise<bool> done;
+  std::future<bool> result = done.get_future();
+  if (wantStop_) {
+    // The worker thread has been stopped, or we want to stop; nothing
+    // else can be scheduled, so report "did not execute" right away
+    done.set_value(false);
+    return result;
+  }
+  queue_.emplace_back(std::make_pair(std::move(f), std::move(done)));
+  // Wake up our thread
+  monitor_.notify_one();
+  return result;
+}
+/// Entry point of the worker thread: runs threadLoop() until a stop is
+/// requested, then executes whatever is still in queue_.
+void
+WorkerThread::threadMain() {
+  threadLoop();
+  // Call all pending tasks
+  // (add() rejects new work once wantStop_ is set, so queue_ is iterated
+  // here without taking mutex_)
+  FAISS_ASSERT(wantStop_);
+  for (auto& f : queue_) {
+    f.first();
+    // fulfil the future handed out by add()
+    f.second.set_value(true);
+  }
+}
+/// Worker loop: pops tasks from the front of queue_ one at a time and runs
+/// them; returns as soon as wantStop_ is observed.
+void
+WorkerThread::threadLoop() {
+  while (true) {
+    std::pair<std::function<void()>, std::promise<bool>> data;
+    {
+      // mutex_ is held only while inspecting / popping the queue
+      std::unique_lock<std::mutex> lock(mutex_);
+      // re-check the predicate after every wakeup
+      while (!wantStop_ && queue_.empty()) {
+        monitor_.wait(lock);
+      }
+      if (wantStop_) {
+        // tasks still queued are left for threadMain() to run
+        return;
+      }
+      data = std::move(queue_.front());
+      queue_.pop_front();
+    }
+    // run the task outside the lock, then fulfil its future
+    data.first();
+    data.second.set_value(true);
+  }
+}
+/// Blocks until the worker thread has exited.
+void
+WorkerThread::waitForThreadExit() {
+  try {
+    thread_.join();
+  } catch (...) {
+    // deliberately ignored: this is called from the destructor, which
+    // must not let an exception from join() escape
+  }
+}
+} // namespace
--- a/WorkerThread.h
+++ b/WorkerThread.h
--- a/benchs/bench_all_ivf/bench_kmeans.py
+++ b/benchs/bench_all_ivf/bench_kmeans.py
--- a/benchs/bench_all_ivf/parse_bench_all_ivf.py
+++ b/benchs/bench_all_ivf/parse_bench_all_ivf.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD+Patents license found in the
 # LICENSE file in the root directory of this source tree.
-#! /usr/bin/python2
+#! /usr/bin/env python2
 import os
 import numpy as np

--- a/benchs/bench_all_ivf/run_on_cluster_generic.bash
+++ b/benchs/bench_all_ivf/run_on_cluster_generic.bash
@@ -23,14 +23,14 @@ function run_on_1machine () {
    # To be implemented
 }
-function run_on_1machine () {
+function run_on_8gpu () {
    # To be implemented
 }
 # prepare output directories
+# set this to some directory where all indexes can be written.
-basedir=/mnt/vol/gfsai-east/ai-group/users/matthijs/bench_all_ivf
+basedir=XXXXX
 logdir=$basedir/logs
 indexdir=$basedir/indexes

--- a/benchs/bench_gpu_1bn.py
+++ b/benchs/bench_gpu_1bn.py
@@ -654,7 +654,7 @@ def get_populated_index(preproc):
        print "Copy CPU index to %d sharded GPU indexes" % replicas
-        index = faiss.IndexProxy()
+        index = faiss.IndexReplicas()
        for i in range(replicas):
            gpu0 = ngpu * i / replicas

--- a/benchs/bench_hnsw.py
+++ b/benchs/bench_hnsw.py
--- a/benchs/kmeans_mnist.py
+++ b/benchs/kmeans_mnist.py
@@ -66,7 +66,7 @@ def train_kmeans(x, k, ngpu):
    else:
        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(ngpu)]
-        index = faiss.IndexProxy()
+        index = faiss.IndexReplicas()
        for sub_index in indexes:
            index.addIndex(sub_index)

--- a/benchs/link_and_code/README.md
+++ b/benchs/link_and_code/README.md
 README for the link & code implementation
 =========================================

--- a/benchs/link_and_code/neighbor_codec.py
+++ b/benchs/link_and_code/neighbor_codec.py
--- a/gpu/GpuAutoTune.cpp
+++ b/gpu/GpuAutoTune.cpp
--- a/gpu/GpuIndex.cu
+++ b/gpu/GpuIndex.cu
--- a/gpu/GpuIndex.h
+++ b/gpu/GpuIndex.h
--- a/gpu/GpuIndexBinaryFlat.cu
+++ b/gpu/GpuIndexBinaryFlat.cu
--- a/gpu/GpuIndexFlat.cu
+++ b/gpu/GpuIndexFlat.cu
--- a/gpu/GpuIndexFlat.h
+++ b/gpu/GpuIndexFlat.h
--- a/gpu/GpuIndexIVF.cu
+++ b/gpu/GpuIndexIVF.cu
--- a/gpu/GpuIndexIVF.h
+++ b/gpu/GpuIndexIVF.h
--- a/gpu/GpuIndexIVFFlat.cu
+++ b/gpu/GpuIndexIVFFlat.cu
--- a/gpu/GpuIndexIVFFlat.h
+++ b/gpu/GpuIndexIVFFlat.h
--- a/gpu/GpuIndexIVFPQ.cu
+++ b/gpu/GpuIndexIVFPQ.cu
--- a/gpu/GpuIndexIVFPQ.h
+++ b/gpu/GpuIndexIVFPQ.h
--- a/gpu/StandardGpuResources.cpp
+++ b/gpu/StandardGpuResources.cpp
--- a/gpu/StandardGpuResources.h
+++ b/gpu/StandardGpuResources.h
--- a/gpu/impl/BinaryDistance.cu
+++ b/gpu/impl/BinaryDistance.cu
--- a/gpu/impl/Distance.cu
+++ b/gpu/impl/Distance.cu
--- a/gpu/impl/IVFFlat.cu
+++ b/gpu/impl/IVFFlat.cu
--- a/gpu/impl/IVFPQ.cu
+++ b/gpu/impl/IVFPQ.cu
--- a/gpu/impl/IVFUtilsSelect1.cu
+++ b/gpu/impl/IVFUtilsSelect1.cu
--- a/gpu/impl/IVFUtilsSelect2.cu
+++ b/gpu/impl/IVFUtilsSelect2.cu
--- a/gpu/impl/L2Select.cu
+++ b/gpu/impl/L2Select.cu
--- a/gpu/perf/IndexWrapper-inl.h
+++ b/gpu/perf/IndexWrapper-inl.h
--- a/gpu/perf/IndexWrapper.h
+++ b/gpu/perf/IndexWrapper.h
--- a/gpu/perf/PerfSelect.cu
+++ b/gpu/perf/PerfSelect.cu
--- a/gpu/perf/slow.py
+++ b/gpu/perf/slow.py
--- a/gpu/test/TestGpuIndexBinaryFlat.cpp
+++ b/gpu/test/TestGpuIndexBinaryFlat.cpp
--- a/gpu/test/TestGpuIndexFlat.cpp
+++ b/gpu/test/TestGpuIndexFlat.cpp
--- a/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/gpu/test/TestGpuIndexIVFFlat.cpp
--- a/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/gpu/test/TestGpuIndexIVFPQ.cpp
--- a/gpu/test/TestGpuMemoryException.cpp
+++ b/gpu/test/TestGpuMemoryException.cpp
--- a/gpu/test/TestGpuSelect.cu
+++ b/gpu/test/TestGpuSelect.cu
--- a/gpu/test/test_gpu_index.py
+++ b/gpu/test/test_gpu_index.py
--- a/gpu/test/test_pytorch_faiss.py
+++ b/gpu/test/test_pytorch_faiss.py
--- a/gpu/utils/BlockSelectFloat.cu
+++ b/gpu/utils/BlockSelectFloat.cu
--- a/gpu/utils/BlockSelectHalf.cu
+++ b/gpu/utils/BlockSelectHalf.cu
--- a/gpu/utils/DeviceDefs.cuh
+++ b/gpu/utils/DeviceDefs.cuh
--- a/gpu/utils/DeviceMemory.cpp
+++ b/gpu/utils/DeviceMemory.cpp
--- a/gpu/utils/DeviceTensor-inl.cuh
+++ b/gpu/utils/DeviceTensor-inl.cuh
--- a/gpu/utils/DeviceUtils.cu
+++ b/gpu/utils/DeviceUtils.cu
--- a/gpu/utils/DeviceUtils.h
+++ b/gpu/utils/DeviceUtils.h
--- a/gpu/utils/DeviceVector.cuh
+++ b/gpu/utils/DeviceVector.cuh
--- a/gpu/utils/MemorySpace.cpp
+++ b/gpu/utils/MemorySpace.cpp
--- a/gpu/utils/MemorySpace.h
+++ b/gpu/utils/MemorySpace.h
--- a/gpu/utils/StackDeviceMemory.cpp
+++ b/gpu/utils/StackDeviceMemory.cpp
--- a/gpu/utils/ThrustAllocator.cuh
+++ b/gpu/utils/ThrustAllocator.cuh
--- a/gpu/utils/WarpSelectFloat.cu
+++ b/gpu/utils/WarpSelectFloat.cu
--- a/gpu/utils/WarpSelectHalf.cu
+++ b/gpu/utils/WarpSelectHalf.cu
--- a/gpu/utils/blockselect/BlockSelectFloatF2048.cu
+++ b/gpu/utils/blockselect/BlockSelectFloatF2048.cu
--- a/gpu/utils/blockselect/BlockSelectFloatT2048.cu
+++ b/gpu/utils/blockselect/BlockSelectFloatT2048.cu
--- a/gpu/utils/blockselect/BlockSelectHalfF2048.cu
+++ b/gpu/utils/blockselect/BlockSelectHalfF2048.cu
--- a/gpu/utils/blockselect/BlockSelectHalfT2048.cu
+++ b/gpu/utils/blockselect/BlockSelectHalfT2048.cu
--- a/gpu/utils/blockselect/BlockSelectImpl.cuh
+++ b/gpu/utils/blockselect/BlockSelectImpl.cuh
--- a/gpu/utils/warpselect/WarpSelectFloatF2048.cu
+++ b/gpu/utils/warpselect/WarpSelectFloatF2048.cu
--- a/gpu/utils/warpselect/WarpSelectFloatT2048.cu
+++ b/gpu/utils/warpselect/WarpSelectFloatT2048.cu
--- a/gpu/utils/warpselect/WarpSelectHalfF2048.cu
+++ b/gpu/utils/warpselect/WarpSelectHalfF2048.cu
--- a/gpu/utils/warpselect/WarpSelectHalfT2048.cu
+++ b/gpu/utils/warpselect/WarpSelectHalfT2048.cu
--- a/index_io.cpp
+++ b/index_io.cpp
--- a/index_io.h
+++ b/index_io.h
--- a/python/faiss.py
+++ b/python/faiss.py
--- a/python/swigfaiss.py
+++ b/python/swigfaiss.py
--- a/python/swigfaiss.swig
+++ b/python/swigfaiss.swig
--- a/python/swigfaiss_gpu.py
+++ b/python/swigfaiss_gpu.py
--- a/python/swigfaiss_gpu_wrap.cpp
+++ b/python/swigfaiss_gpu_wrap.cpp
--- a/python/swigfaiss_wrap.cpp
+++ b/python/swigfaiss_wrap.cpp
--- a/tests/test_build_blocks.py
+++ b/tests/test_build_blocks.py
--- a/tests/test_dealloc_invlists.cpp
+++ b/tests/test_dealloc_invlists.cpp
--- a/tests/test_index.py
+++ b/tests/test_index.py
--- a/tests/test_index_accuracy.py
+++ b/tests/test_index_accuracy.py
--- a/tests/test_index_binary.py
+++ b/tests/test_index_binary.py
--- a/tests/test_index_composite.py
+++ b/tests/test_index_composite.py
--- a/tests/test_omp_threads.cpp
+++ b/tests/test_omp_threads.cpp
--- a/tests/test_omp_threads_py.py
+++ b/tests/test_omp_threads_py.py
--- a/utils.cpp
+++ b/utils.cpp
--- a/utils.h
+++ b/utils.h