Commit 9933892e authored by matthijs

sync with FB version 2017-01-09

- adding HNSW indexing method

- simultaneous search and reconstruction for IndexIVFPQ
parent 5b45b055
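
For reference, a minimal usage sketch of the new index type (editor's illustration, not part of the commit; nb, nq, xb, xq are placeholder arguments):

    #include <vector>
    #include "IndexHNSW.h"

    void hnsw_demo (size_t nb, const float *xb, size_t nq, const float *xq) {
        int d = 64, M = 32;                    // dimension, links per node
        faiss::IndexHNSWFlat index (d, M);     // flat storage: no training needed
        index.hnsw.efConstruction = 40;        // graph quality at add time
        index.add (nb, xb);
        index.hnsw.efSearch = 64;              // expansion factor at query time
        std::vector<float> D (nq * 10);
        std::vector<faiss::Index::idx_t> I (nq * 10);
        index.search (nq, xq, 10, D.data(), I.data());
    }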
@@ -23,6 +23,7 @@
#include "IndexIVFPQ.h"
#include "MetaIndexes.h"
#include "IndexScalarQuantizer.h"
+#include "IndexHNSW.h"

namespace faiss {
@@ -321,6 +322,11 @@ static void init_pq_ParameterRange (const ProductQuantizer & pq,
ParameterRange &ParameterSpace::add_range(const char * name)
{
+    for (auto & pr : parameter_ranges) {
+        if (pr.name == name) {
+            return pr;
+        }
+    }
    parameter_ranges.push_back (ParameterRange ());
    parameter_ranges.back ().name = name;
    return parameter_ranges.back ();
@@ -353,6 +359,12 @@ void ParameterSpace::initialize (const Index * index)
            pr.values.push_back (nprobe);
        }
    }
+        if (dynamic_cast<const IndexHNSW*>(ix->quantizer)) {
+            ParameterRange & pr = add_range("efSearch");
+            for (int i = 2; i <= 9; i++) {
+                pr.values.push_back (1 << i);
+            }
+        }
    }
    if (DC (IndexPQ)) {
        ParameterRange & pr = add_range("ht");
@@ -361,7 +373,9 @@ void ParameterSpace::initialize (const Index * index)
    if (DC (IndexIVFPQ)) {
        ParameterRange & pr = add_range("ht");
        init_pq_ParameterRange (ix->pq, pr);
+    }
+    if (DC (IndexIVF)) {
        const MultiIndexQuantizer *miq =
            dynamic_cast<const MultiIndexQuantizer *> (ix->quantizer);
        if (miq) {
@@ -378,6 +392,12 @@ void ParameterSpace::initialize (const Index * index)
            pr.values.push_back (1 << i);
        }
    }
+    if (dynamic_cast<const IndexHNSW*>(index)) {
+        ParameterRange & pr = add_range("efSearch");
+        for (int i = 2; i <= 9; i++) {
+            pr.values.push_back (1 << i);
+        }
+    }
}
#undef DC
@@ -489,7 +509,7 @@ void ParameterSpace::set_index_parameter (
        }
    }
    if (name == "max_codes") {
-        if (DC (IndexIVFPQ)) {
+        if (DC (IndexIVF)) {
            ix->max_codes = finite(val) ? size_t(val) : 0;
            return;
        }
@@ -683,7 +703,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
    for (char *tok = strtok_r (description, " ,", &ptr);
         tok;
         tok = strtok_r (nullptr, " ,", &ptr)) {
-        int d_out, opq_M, nbit, M, M2;
+        int d_out, opq_M, nbit, M, M2, pq_m, ncent;
        std::string stok(tok);
        // to avoid mem leaks with exceptions:
@@ -793,7 +813,6 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            index_pq->do_polysemous_training = do_polysemous_training;
            index_1 = index_pq;
        }
    } else if (stok == "RFlat") {
        make_IndexRefineFlat = true;
    } else {
@@ -841,7 +860,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        index_pt->own_fields = true;
        // add from back
        while (vts.chain.size() > 0) {
-            index_pt->prepend_transform (vts.chain.back());
+            index_pt->prepend_transform (vts.chain.back ());
            vts.chain.pop_back ();
        }
        index = index_pt;
...
@@ -162,7 +162,7 @@ struct ParameterSpace {
    /// print a description on stdout
    void display () const;

-    /// add a new parameter
+    /// add a new parameter (or return it if it exists)
    ParameterRange &add_range(const char * name);

    /// initialize with reasonable parameters for the index
...
@@ -65,8 +65,9 @@ static double imbalance_factor (int n, int k, long *assign) {
void Clustering::train (idx_t nx, const float *x_in, Index & index) {
-    FAISS_THROW_IF_NOT_MSG (nx >= k,
-        "need at least as many training points as clusters");
+    FAISS_THROW_IF_NOT_FMT (nx >= k,
+        "Number of training points (%ld) should be at least "
+        "as large as number of clusters (%ld)", nx, k);
    double t0 = getmillisecs();
@@ -100,12 +101,26 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    }

+    if (nx == k) {
+        if (verbose) {
+            printf("Number of training points (%ld) same as number of "
+                   "clusters, just copying\n", nx);
+        }
+        // this is a corner case, just copy training set to clusters
+        centroids.resize (d * k);
+        memcpy (centroids.data(), x_in, sizeof (*x_in) * d * k);
+        return;
+    }

    if (verbose)
        printf("Clustering %d points in %ldD to %ld clusters, "
               "redo %d times, %d iterations\n",
               int(nx), d, k, nredo, niter);

    idx_t * assign = new idx_t[nx];
    ScopeDeleter<idx_t> del (assign);
    float * dis = new float[nx];
...
@@ -179,7 +179,7 @@ void maxheap_push (size_t k, T * bh_val, long * bh_ids, T val, long ids)
 * Heap initialization
 *******************************************************************/

-/* Initialization phase for the heap (with inconditionnal pushes).
+/* Initialization phase for the heap (with unconditional pushes).
 * Store k0 elements in a heap containing up to k values. Note that
 * (bh_val, bh_ids) can be the same as (x, ids) */
template <class C> inline
...
@@ -11,8 +11,19 @@
#include "IndexFlat.h"
#include "FaissAssert.h"
+#include <cstring>

namespace faiss {

+Index::~Index ()
+{
+}
+
+void Index::train(idx_t /*n*/, const float* /*x*/) {
+    // does nothing by default
+}
+
void Index::range_search (idx_t , const float *, float,
                          RangeSearchResult *) const
@@ -52,6 +63,25 @@ void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const {
}

+void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                    float *distances, idx_t *labels,
+                                    float *recons) const {
+    search (n, x, k, distances, labels);
+    for (idx_t i = 0; i < n; ++i) {
+        for (idx_t j = 0; j < k; ++j) {
+            idx_t ij = i * k + j;
+            idx_t key = labels[ij];
+            float* reconstructed = recons + ij * d;
+            if (key < 0) {
+                // fill with NaNs (memset with 0xFF bytes yields float NaNs)
+                memset(reconstructed, -1, sizeof(*reconstructed) * d);
+            } else {
+                reconstruct (key, reconstructed);
+            }
+        }
+    }
+}
+
void Index::compute_residual (const float * x,
                              float * residual, idx_t key) const {
...
@@ -71,14 +71,14 @@ struct Index {
    /// type of metric this index uses for search
    MetricType metric_type;

-    explicit Index (idx_t d = 0, MetricType metric = METRIC_INNER_PRODUCT):
+    explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
        d(d),
        ntotal(0),
        verbose(false),
        is_trained(true),
        metric_type (metric) {}

-    virtual ~Index () { }
+    virtual ~Index ();

    /** Perform training on a representative set of vectors
@@ -86,9 +86,7 @@ struct Index {
     * @param n      nb of training vectors
     * @param x      training vectors, size n * d
     */
-    virtual void train(idx_t /*n*/, const float* /*x*/) {
-        // does nothing by default
-    }
+    virtual void train(idx_t n, const float* x);

    /** Add n vectors of dimension d to the index.
     *
@@ -164,6 +162,17 @@ struct Index {
     */
    virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;

+    /** Similar to search, but also reconstructs the stored vectors (or an
+     * approximation in the case of lossy coding) for the search results.
+     *
+     * If there are not enough results for a query, the resulting array
+     * is padded with -1s.
+     *
+     * @param recons      reconstructed vectors, size (n, k, d)
+     **/
+    virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                         float *distances, idx_t *labels,
+                                         float *recons) const;
+
    /** Computes a residual vector after indexing encoding.
     *
...
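
A short sketch of the new search_and_reconstruct call declared above (editor's illustration, not part of the commit; assumes a populated index):

    #include <vector>
    #include "Index.h"

    void demo_search_and_reconstruct (const faiss::Index & index,
                                      size_t nq, const float *xq) {
        faiss::Index::idx_t k = 5;
        std::vector<float> D (nq * k);
        std::vector<faiss::Index::idx_t> I (nq * k);
        std::vector<float> R (nq * k * index.d);   // layout (n, k, d)
        index.search_and_reconstruct (nq, xq, k, D.data(), I.data(), R.data());
        // reconstruction of result j of query i: R.data() + (i * k + j) * index.d
    }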
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "IndexHNSW.h"
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <cstdio>
#include <cmath>
#include <omp.h>
#include <unordered_set>
#include <queue>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdint.h>
#include <immintrin.h>
#include "utils.h"
#include "Heap.h"
#include "FaissAssert.h"
#include "IndexFlat.h"
#include "IndexIVFPQ.h"
extern "C" {
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
n, FINTEGER *k, const float *alpha, const float *a,
FINTEGER *lda, const float *b, FINTEGER *
ldb, float *beta, float *c, FINTEGER *ldc);
}
namespace faiss {
/**************************************************************
* Auxiliary structures
**************************************************************/
/// set implementation optimized for fast access.
struct VisitedTable {
std::vector<uint8_t> visited;
int visno;
VisitedTable(int size):
visited(size), visno(1)
{}
/// set flag #no to true
void set(int no) {
visited[no] = visno;
}
/// get flag #no
bool get(int no) const {
return visited[no] == visno;
}
/// reset all flags to false
void advance() {
visno++;
if (visno == 250) {
// 250 rather than 255 because sometimes we use visno and visno+1
memset (visited.data(), 0, sizeof(visited[0]) * visited.size());
visno = 1;
}
}
};
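// Editor's note: the generation counter makes clearing the table O(1):
// advance() bumps visno so that all previously written entries compare
// unequal, and a full memset is needed only when the uint8_t counter
// approaches wraparound. E.g. after set(3); advance(); get(3) is false
// without ever touching visited[3].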
namespace {
typedef HNSW::idx_t idx_t;
typedef HNSW::storage_idx_t storage_idx_t;
typedef HNSW::DistanceComputer DistanceComputer;
// typedef ::faiss::VisitedTable VisitedTable;
/// to sort pairs of (id, distance) from nearest to farthest or the reverse
struct NodeDistCloser {
float d;
int id;
NodeDistCloser(float d, int id): d(d), id(id) {}
bool operator<(const NodeDistCloser &obj1) const { return d < obj1.d; }
};
struct NodeDistFarther {
float d;
int id;
NodeDistFarther(float d, int id): d(d), id(id) {}
bool operator<(const NodeDistFarther &obj1) const { return d > obj1.d; }
};
/** Bounded heap of candidates: O(1) access to the largest distance,
 *  linear-time extraction of the smallest */
struct MinimaxHeap {
int n;
int k;
int nvalid;
std::vector<storage_idx_t> ids;
std::vector<float> dis;
typedef faiss::CMax<float, storage_idx_t> HC;
explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
void push(storage_idx_t i, float v)
{
if (k == n) {
if (v >= dis[0]) return;
faiss::heap_pop<HC> (k--, dis.data(), ids.data());
nvalid--;
}
faiss::heap_push<HC> (++k, dis.data(), ids.data(), v, i);
nvalid++;
}
float max() const
{
return dis[0];
}
int size() const {return nvalid;}
void clear() {nvalid = k = 0; }
int pop_min(float *vmin_out = nullptr)
{
assert(k > 0);
// returns min. This is an O(n) operation
int i = k - 1;
while (i >= 0) {
if (ids[i] != -1) break;
i--;
}
if (i == -1) return -1;
int imin = i;
float vmin = dis[i];
i--;
while(i >= 0) {
if (ids[i] != -1 && dis[i] < vmin) {
vmin = dis[i];
imin = i;
}
i--;
}
if (vmin_out) *vmin_out = vmin;
int ret = ids[imin];
ids[imin] = -1;
nvalid --;
return ret;
}
int count_below(float thresh) {
int n_below = 0;
for(int i = 0; i < k; i++) {
if (dis[i] < thresh)
n_below++;
}
return n_below;
}
};
/**************************************************************
* Addition subroutines
**************************************************************/
/** Enumerate vertices from farthest to nearest from query, keep a
* neighbor only if there is no previous neighbor that is closer to
* that vertex than the query.
*/
void shrink_neighbor_list(DistanceComputer & qdis,
std::priority_queue<NodeDistFarther> &input,
std::vector<NodeDistFarther> &output,
int max_size)
{
while (input.size() > 0) {
NodeDistFarther v1 = input.top();
input.pop();
float dist_v1_q = v1.d;
bool good = true;
for (NodeDistFarther v2 : output) {
float dist_v1_v2 = qdis.symmetric_dis(v2.id, v1.id);
if (dist_v1_v2 < dist_v1_q) {
good = false;
break;
}
}
if (good) {
output.push_back(v1);
if (output.size() >= max_size)
return;
}
}
}
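// Editor's note, a worked micro-example of the rule above: with neighbors at
// distances 1.0 (a) and 1.2 (b) from the query, if dist(a, b) = 0.5 < 1.2
// then b is dropped: a already covers b's direction, and the edge is better
// spent on a more diverse neighbor (the HNSW paper's selection heuristic).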
/// remove neighbors from the list to make it smaller than max_size
void shrink_neighbor_list(DistanceComputer & qdis,
std::priority_queue<NodeDistCloser> &resultSet1,
int max_size)
{
if (resultSet1.size() < max_size) {
return;
}
std::priority_queue<NodeDistFarther> resultSet;
std::vector<NodeDistFarther> returnlist;
while (resultSet1.size() > 0) {
resultSet.emplace(resultSet1.top().d, resultSet1.top().id);
resultSet1.pop();
}
shrink_neighbor_list (qdis, resultSet, returnlist, max_size);
for (NodeDistFarther curen2 : returnlist) {
resultSet1.emplace(curen2.d, curen2.id);
}
}
/// add a link between two elements, possibly shrinking the list
/// of links to make room for it.
void add_link(HNSW & hnsw,
DistanceComputer & qdis,
storage_idx_t src, storage_idx_t dest,
int level)
{
size_t begin, end;
hnsw.neighbor_range(src, level, &begin, &end);
if (hnsw.neighbors[end - 1] == -1) {
// there is enough room, find a slot to add it
size_t i = end;
while(i > begin) {
if (hnsw.neighbors[i - 1] != -1) break;
i--;
}
hnsw.neighbors[i] = dest;
return;
}
// otherwise we let them fight out which to keep
// copy to resultSet...
std::priority_queue<NodeDistCloser> resultSet;
resultSet.emplace(qdis.symmetric_dis(src, dest), dest);
for (size_t i = begin; i < end; i++) { // copy all existing neighbors
storage_idx_t neigh = hnsw.neighbors[i];
resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
}
shrink_neighbor_list(qdis, resultSet, end - begin);
// ...and back
size_t i = begin;
while (resultSet.size()) {
hnsw.neighbors[i++] = resultSet.top().id;
resultSet.pop();
}
// they may have shrunk more than just by 1 element
while(i < end) {
hnsw.neighbors[i++] = -1;
}
}
/// search neighbors on a single level, starting from an entry point
void search_neighbors_to_add(HNSW & hnsw,
DistanceComputer &qdis,
std::priority_queue<NodeDistCloser> &results,
int entry_point,
float d_entry_point,
int level,
VisitedTable &vt)
{
// top is nearest candidate
std::priority_queue<NodeDistFarther> candidates;
NodeDistFarther ev(d_entry_point, entry_point);
candidates.push(ev);
results.emplace(d_entry_point, entry_point);
vt.set(entry_point);
while (!candidates.empty()) {
// get nearest
const NodeDistFarther &currEv = candidates.top();
if (currEv.d > results.top().d) {
break;
}
int currNode = currEv.id;
candidates.pop();
// loop over neighbors
size_t begin, end;
hnsw.neighbor_range(currNode, level, &begin, &end);
for(size_t i = begin; i < end; i++) {
storage_idx_t nodeId = hnsw.neighbors[i];
if (nodeId < 0) break;
if (vt.get(nodeId)) continue;
vt.set(nodeId);
float dis = qdis(nodeId);
NodeDistFarther evE1(dis, nodeId);
if (results.size() < hnsw.efConstruction ||
results.top().d > dis) {
results.emplace(dis, nodeId);
candidates.emplace(dis, nodeId);
if (results.size() > hnsw.efConstruction) {
results.pop();
}
}
}
}
vt.advance();
}
/// Finds neighbors and builds links with them, starting from an entry
/// point. The own neighbor list is assumed to be locked.
void add_links_starting_from(HNSW & hnsw,
DistanceComputer &ptdis,
storage_idx_t pt_id,
storage_idx_t nearest,
float d_nearest,
int level,
omp_lock_t * locks,
VisitedTable &vt)
{
std::priority_queue<NodeDistCloser> link_targets;
search_neighbors_to_add(
hnsw, ptdis, link_targets, nearest, d_nearest,
level, vt);
// but we can afford only this many neighbors
int M = hnsw.nb_neighbors(level);
shrink_neighbor_list(ptdis, link_targets, M);
while (!link_targets.empty()) {
int other_id = link_targets.top().id;
omp_set_lock(&locks[other_id]);
add_link(hnsw, ptdis, other_id, pt_id, level);
omp_unset_lock(&locks[other_id]);
add_link(hnsw, ptdis, pt_id, other_id, level);
link_targets.pop();
}
}
/**************************************************************
* Searching subroutines
**************************************************************/
/// greedily update a nearest vector at a given level
void greedy_update_nearest(const HNSW & hnsw,
DistanceComputer & qdis,
int level,
storage_idx_t & nearest,
float & d_nearest)
{
for(;;) {
storage_idx_t prev_nearest = nearest;
size_t begin, end;
hnsw.neighbor_range(nearest, level, &begin, &end);
for(size_t i = begin; i < end; i++) {
storage_idx_t v = hnsw.neighbors[i];
if (v < 0) break;
float dis = qdis(v);
if (dis < d_nearest) {
nearest = v;
d_nearest = dis;
}
}
if (nearest == prev_nearest) {
return;
}
}
}
/** Do a BFS on the candidates list */
int search_from_candidates(const HNSW & hnsw,
DistanceComputer & qdis, int k,
idx_t *I, float * D,
MinimaxHeap &candidates,
VisitedTable &vt,
int level, int nres_in = 0)
{
int nres = nres_in;
int ndis = 0;
for (int i = 0; i < candidates.size(); i++) {
idx_t v1 = candidates.ids[i];
float d = candidates.dis[i];
FAISS_ASSERT(v1 >= 0);
if (nres < k) {
faiss::maxheap_push (++nres, D, I, d, v1);
} else if (d < D[0]) {
faiss::maxheap_pop (nres--, D, I);
faiss::maxheap_push (++nres, D, I, d, v1);
}
vt.set(v1);
}
bool do_dis_check = hnsw.check_relative_distance;
int nstep = 0;
while (candidates.size() > 0) {
float d0 = 0;
int v0 = candidates.pop_min(&d0);
if (do_dis_check) {
// tricky stopping condition: more than ef distances
// smaller than d0 have already been processed
int n_dis_below = candidates.count_below(d0);
if(n_dis_below >= hnsw.efSearch) {
break;
}
}
size_t begin, end;
hnsw.neighbor_range(v0, level, &begin, &end);
for (size_t j = begin; j < end; j++) {
int v1 = hnsw.neighbors[j];
if (v1 < 0) break;
if (vt.get(v1)) {
continue;
}
vt.set(v1);
ndis++;
float d = qdis(v1);
if (nres < k) {
faiss::maxheap_push (++nres, D, I, d, v1);
} else if (d < D[0]) {
faiss::maxheap_pop (nres--, D, I);
faiss::maxheap_push (++nres, D, I, d, v1);
}
candidates.push(v1, d);
}
nstep++;
if (!do_dis_check && nstep > hnsw.efSearch) {
break;
}
}
if (level == 0) {
#pragma omp critical
{
hnsw_stats.n1 ++;
if (candidates.size() == 0)
hnsw_stats.n2 ++;
hnsw_stats.n3 += ndis;
}
}
return nres;
}
} // anonymous namespace
/**************************************************************
* HNSW structure implementation
**************************************************************/
int HNSW::nb_neighbors(int layer_no) const
{
return cum_nneighbor_per_level[layer_no + 1] -
cum_nneighbor_per_level[layer_no];
}
void HNSW::set_nb_neighbors(int level_no, int n)
{
FAISS_THROW_IF_NOT(levels.size() == 0);
int cur_n = nb_neighbors(level_no);
for (int i = level_no + 1; i < cum_nneighbor_per_level.size(); i++) {
cum_nneighbor_per_level[i] += n - cur_n;
}
}
int HNSW::cum_nb_neighbors(int layer_no) const
{
return cum_nneighbor_per_level[layer_no];
}
void HNSW::neighbor_range(idx_t no, int layer_no,
size_t * begin, size_t * end) const
{
size_t o = offsets[no];
*begin = o + cum_nb_neighbors(layer_no);
*end = o + cum_nb_neighbors(layer_no + 1);
}
HNSW::HNSW(int M): rng(12345) {
set_default_probas(M, 1.0 / log(M));
max_level = -1;
entry_point = -1;
efSearch = 16;
check_relative_distance = true;
efConstruction = 40;
upper_beam = 1;
offsets.push_back(0);
}
int HNSW::random_level()
{
double f = rng.rand_float();
// could be a bit faster with bisection
for (int level = 0; level < assign_probas.size(); level++) {
if (f < assign_probas[level]) {
return level;
}
f -= assign_probas[level];
}
// happens with exponentially low probability
return assign_probas.size() - 1;
}
void HNSW::set_default_probas(int M, float levelMult)
{
int nn = 0;
cum_nneighbor_per_level.push_back (0);
for (int level = 0; ;level++) {
float proba = exp(-level / levelMult) * (1 - exp(-1 / levelMult));
if (proba < 1e-9) break;
assign_probas.push_back(proba);
nn += level == 0 ? M * 2 : M;
cum_nneighbor_per_level.push_back (nn);
}
}
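// Editor's note: this yields a geometric level distribution,
//   P(level) = exp(-level / levelMult) * (1 - exp(-1 / levelMult)),
// which sums to 1 over level = 0, 1, 2, ...  With levelMult = 1 / log(M)
// (see the constructor), P(level >= l) = M^-l: each layer is M times sparser.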
void HNSW::clear_neighbor_tables(int level)
{
for (int i = 0; i < levels.size(); i++) {
size_t begin, end;
neighbor_range(i, level, &begin, &end);
for (size_t j = begin; j < end; j++)
neighbors[j] = -1;
}
}
void HNSW::reset() {
max_level = -1;
entry_point = -1;
offsets.clear();
offsets.push_back(0);
levels.clear();
neighbors.clear();
}
void HNSW::print_neighbor_stats(int level) const
{
FAISS_THROW_IF_NOT (level < cum_nneighbor_per_level.size());
printf("stats on level %d, max %d neighbors per vertex:\n",
level, nb_neighbors(level));
size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0;
#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \
reduction(+: tot_reciprocal) reduction(+: n_node)
for (int i = 0; i < levels.size(); i++) {
if (levels[i] > level) {
n_node++;
size_t begin, end;
neighbor_range(i, level, &begin, &end);
std::unordered_set<int> neighset;
for (size_t j = begin; j < end; j++) {
if (neighbors [j] < 0) break;
neighset.insert(neighbors[j]);
}
int n_neigh = neighset.size();
int n_common = 0;
int n_reciprocal = 0;
for (size_t j = begin; j < end; j++) {
storage_idx_t i2 = neighbors[j];
if (i2 < 0) break;
FAISS_ASSERT(i2 != i);
size_t begin2, end2;
neighbor_range(i2, level, &begin2, &end2);
for (size_t j2 = begin2; j2 < end2; j2++) {
storage_idx_t i3 = neighbors[j2];
if (i3 < 0) break;
if (i3 == i) {
n_reciprocal++;
continue;
}
if (neighset.count(i3)) {
neighset.erase(i3);
n_common++;
}
}
}
tot_neigh += n_neigh;
tot_common += n_common;
tot_reciprocal += n_reciprocal;
}
}
float normalizer = n_node;
printf(" nb of nodes at that level %ld\n", n_node);
printf(" neighbors per node: %.2f (%ld)\n", tot_neigh / normalizer, tot_neigh);
printf(" nb of reciprocal neighbors: %.2f\n", tot_reciprocal / normalizer);
printf(" nb of neighbors that are also neighbor-of-neighbors: %.2f (%ld)\n",
tot_common / normalizer, tot_common);
}
HNSWStats hnsw_stats;
void HNSWStats::reset ()
{
memset(this, 0, sizeof(*this));
}
/**************************************************************
* Building, parallel
**************************************************************/
void HNSW::add_with_locks(
DistanceComputer & ptdis, int pt_level, int pt_id,
std::vector<omp_lock_t> & locks,
VisitedTable &vt)
{
// greedy search on upper levels
storage_idx_t nearest;
#pragma omp critical
{
nearest = entry_point;
if (nearest == -1) {
max_level = pt_level;
entry_point = pt_id;
}
}
if (nearest < 0) {
return;
}
omp_set_lock(&locks[pt_id]);
int level = max_level; // level at which we start adding neighbors
float d_nearest = ptdis(nearest);
for(; level > pt_level; level--) {
greedy_update_nearest(*this, ptdis, level, nearest, d_nearest);
}
for(; level >= 0; level--) {
add_links_starting_from(*this, ptdis, pt_id, nearest, d_nearest,
level, locks.data(), vt);
}
omp_unset_lock(&locks[pt_id]);
if (pt_level > max_level) {
max_level = pt_level;
entry_point = pt_id;
}
}
/**************************************************************
* Searching
**************************************************************/
void HNSW::search(DistanceComputer & qdis,
int k, idx_t *I, float * D,
VisitedTable &vt) const
{
if (upper_beam == 1) {
// greedy search on upper levels
storage_idx_t nearest = entry_point;
float d_nearest = qdis(nearest);
for(int level = max_level; level >= 1; level--) {
greedy_update_nearest(*this, qdis, level, nearest, d_nearest);
}
int candidates_size = std::max(efSearch, k);
MinimaxHeap candidates(candidates_size);
candidates.push(nearest, d_nearest);
search_from_candidates (
*this, qdis, k, I, D, candidates, vt, 0);
vt.advance();
} else {
int candidates_size = upper_beam;
MinimaxHeap candidates(candidates_size);
std::vector<idx_t> I_to_next(candidates_size);
std::vector<float> D_to_next(candidates_size);
int nres = 1;
I_to_next[0] = entry_point;
D_to_next[0] = qdis(entry_point);
for(int level = max_level; level >= 0; level--) {
// copy I, D -> candidates
candidates.clear();
for (int i = 0; i < nres; i++) {
candidates.push(I_to_next[i], D_to_next[i]);
}
if (level == 0) {
nres = search_from_candidates (
*this, qdis, k, I, D, candidates, vt, 0);
} else {
nres = search_from_candidates (
*this, qdis, candidates_size,
I_to_next.data(), D_to_next.data(),
candidates, vt, level);
}
vt.advance();
}
}
}
/**************************************************************
* add / search blocks of descriptors
**************************************************************/
namespace {
int prepare_level_tab (HNSW & hnsw, size_t n, bool preset_levels = false)
{
size_t n0 = hnsw.offsets.size() - 1;
if (preset_levels) {
FAISS_ASSERT (n0 + n == hnsw.levels.size());
} else {
FAISS_ASSERT (n0 == hnsw.levels.size());
for (int i = 0; i < n; i++) {
int pt_level = hnsw.random_level();
hnsw.levels.push_back(pt_level + 1);
}
}
int max_level = 0;
for (int i = 0; i < n; i++) {
int pt_level = hnsw.levels[i + n0] - 1;
if (pt_level > max_level) max_level = pt_level;
hnsw.offsets.push_back(hnsw.offsets.back() +
hnsw.cum_nb_neighbors(pt_level + 1));
hnsw.neighbors.resize(hnsw.offsets.back(), -1);
}
return max_level;
}
void hnsw_add_vertices(IndexHNSW &index_hnsw,
size_t n0,
size_t n, const float *x,
bool verbose,
bool preset_levels = false) {
HNSW & hnsw = index_hnsw.hnsw;
size_t ntotal = n0 + n;
double t0 = getmillisecs();
if (verbose) {
printf("hnsw_add_vertices: adding %ld elements on top of %ld "
"(preset_levels=%d)\n",
n, n0, int(preset_levels));
}
int max_level = prepare_level_tab (index_hnsw.hnsw, n, preset_levels);
if (verbose) {
printf(" max_level = %d\n", max_level);
}
std::vector<omp_lock_t> locks(ntotal);
for(int i = 0; i < ntotal; i++)
omp_init_lock(&locks[i]);
// add vectors from highest to lowest level
std::vector<int> hist;
std::vector<int> order(n);
{ // make buckets with vectors of the same level
// build histogram
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
while (pt_level >= hist.size())
hist.push_back(0);
hist[pt_level] ++;
}
// accumulate
std::vector<int> offsets(hist.size() + 1, 0);
for (int i = 0; i < hist.size() - 1; i++) {
offsets[i + 1] = offsets[i] + hist[i];
}
// bucket sort
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
order[offsets[pt_level]++] = pt_id;
}
}
{ // perform add
RandomGenerator rng2(789);
int i1 = n;
for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
int i0 = i1 - hist[pt_level];
if (verbose) {
printf("Adding %d elements at level %d\n",
i1 - i0, pt_level);
}
// random permutation to get rid of dataset order bias
for (int j = i0; j < i1; j++)
std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
#pragma omp parallel
{
VisitedTable vt (ntotal);
DistanceComputer *dis = index_hnsw.get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
#pragma omp for schedule(dynamic)
for (int i = i0; i < i1; i++) {
storage_idx_t pt_id = order[i];
dis->set_query (x + (pt_id - n0) * dis->d);
hnsw.add_with_locks (
*dis, pt_level, pt_id, locks,
vt);
if (prev_display >= 0 && i - i0 > prev_display + 10000) {
prev_display = i - i0;
printf(" %d / %d\r", i - i0, i1 - i0);
fflush(stdout);
}
}
}
i1 = i0;
}
FAISS_ASSERT(i1 == 0);
}
if (verbose)
printf("Done in %.3f ms\n", getmillisecs() - t0);
for(int i = 0; i < ntotal; i++)
omp_destroy_lock(&locks[i]);
}
} // anonymous namespace
void HNSW::fill_with_random_links(size_t n)
{
int max_level = prepare_level_tab (*this, n);
RandomGenerator rng2(456);
for (int level = max_level - 1; level >= 0; level--) {
std::vector<int> elts;
for (int i = 0; i < n; i++) {
if (levels[i] > level) {
elts.push_back(i);
}
}
printf ("linking %ld elements in level %d\n",
elts.size(), level);
if (elts.size() == 1) continue;
for (int ii = 0; ii < elts.size(); ii++) {
int i = elts[ii];
size_t begin, end;
neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
int other = 0;
do {
other = elts[rng2.rand_int(elts.size())];
} while(other == i);
neighbors[j] = other;
}
}
}
}
/**************************************************************
* IndexHNSW implementation
**************************************************************/
IndexHNSW::IndexHNSW(int d, int M):
Index(d, METRIC_L2),
hnsw(M),
own_fields(false),
storage(nullptr),
reconstruct_from_neighbors(nullptr)
{}
IndexHNSW::IndexHNSW(Index *storage, int M):
Index(storage->d, METRIC_L2),
hnsw(M),
own_fields(false),
storage(storage),
reconstruct_from_neighbors(nullptr)
{}
IndexHNSW::~IndexHNSW() {
if (own_fields) {
delete storage;
}
}
void IndexHNSW::train(idx_t n, const float* x)
{
// hnsw structure does not require training
storage->train (n, x);
is_trained = true;
}
void IndexHNSW::search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
#pragma omp parallel
{
VisitedTable vt (ntotal);
DistanceComputer *dis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
size_t nreorder = 0;
#pragma omp for
for(int i = 0; i < n; i++) {
idx_t * idxi = labels + i * k;
float * simi = distances + i * k;
dis->set_query(x + i * d);
maxheap_heapify (k, simi, idxi);
hnsw.search (*dis, k, idxi, simi, vt);
maxheap_reorder (k, simi, idxi);
if (reconstruct_from_neighbors &&
reconstruct_from_neighbors->k_reorder != 0) {
int k_reorder = reconstruct_from_neighbors->k_reorder;
if (k_reorder == -1 || k_reorder > k) k_reorder = k;
nreorder += reconstruct_from_neighbors->compute_distances(
k_reorder, idxi, x + i * d, simi);
// sort top k_reorder
maxheap_heapify (k_reorder, simi, idxi, simi, idxi, k_reorder);
maxheap_reorder (k_reorder, simi, idxi);
}
}
#pragma omp critical
{
hnsw_stats.nreorder += nreorder;
}
}
}
void IndexHNSW::add(idx_t n, const float *x)
{
FAISS_THROW_IF_NOT(is_trained);
int n0 = ntotal;
storage->add(n, x);
ntotal = storage->ntotal;
hnsw_add_vertices (*this, n0, n, x, verbose,
hnsw.levels.size() == ntotal);
}
void IndexHNSW::reset()
{
hnsw.reset();
storage->reset();
ntotal = 0;
}
void IndexHNSW::reconstruct (idx_t key, float* recons) const
{
storage->reconstruct(key, recons);
}
void IndexHNSW::shrink_level_0_neighbors(int new_size)
{
#pragma omp parallel
{
DistanceComputer *dis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
#pragma omp for
for (idx_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
std::priority_queue<NodeDistFarther> initial_list;
for (size_t j = begin; j < end; j++) {
int v1 = hnsw.neighbors[j];
if (v1 < 0) break;
initial_list.emplace(dis->symmetric_dis(i, v1), v1);
// initial_list.emplace(qdis(v1), v1);
}
std::vector<NodeDistFarther> shrunk_list;
shrink_neighbor_list (*dis, initial_list, shrunk_list, new_size);
for (size_t j = begin; j < end; j++) {
if (j - begin < shrunk_list.size())
hnsw.neighbors[j] = shrunk_list[j - begin].id;
else
hnsw.neighbors[j] = -1;
}
}
}
}
void IndexHNSW::search_level_0(
idx_t n, const float *x, idx_t k,
const storage_idx_t *nearest, const float *nearest_d,
float *distances, idx_t *labels, int nprobe,
int search_type) const
{
storage_idx_t ntotal = hnsw.levels.size();
#pragma omp parallel
{
DistanceComputer *qdis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(qdis);
VisitedTable vt (ntotal);
#pragma omp for
for(idx_t i = 0; i < n; i++) {
idx_t * idxi = labels + i * k;
float * simi = distances + i * k;
qdis->set_query(x + i * d);
maxheap_heapify (k, simi, idxi);
if (search_type == 1) {
int nres = 0;
for(int j = 0; j < nprobe; j++) {
storage_idx_t cj = nearest[i * nprobe + j];
if (cj < 0) break;
if (vt.get(cj)) continue;
int candidates_size = std::max(hnsw.efSearch, int(k));
MinimaxHeap candidates(candidates_size);
candidates.push(cj, nearest_d[i * nprobe + j]);
nres = search_from_candidates (
hnsw, *qdis, k, idxi, simi,
candidates, vt, 0, nres);
}
} else if (search_type == 2) {
int candidates_size = std::max(hnsw.efSearch, int(k));
candidates_size = std::max(candidates_size, nprobe);
MinimaxHeap candidates(candidates_size);
for(int j = 0; j < nprobe; j++) {
storage_idx_t cj = nearest[i * nprobe + j];
if (cj < 0) break;
candidates.push(cj, nearest_d[i * nprobe + j]);
}
search_from_candidates (
hnsw, *qdis, k, idxi, simi,
candidates, vt, 0);
}
vt.advance();
maxheap_reorder (k, simi, idxi);
}
}
}
void IndexHNSW::init_level_0_from_knngraph(
int k, const float *D, const idx_t *I)
{
int dest_size = hnsw.nb_neighbors (0);
#pragma omp parallel for
for (idx_t i = 0; i < ntotal; i++) {
DistanceComputer *qdis = get_distance_computer();
float vec[d];
storage->reconstruct(i, vec);
qdis->set_query(vec);
std::priority_queue<NodeDistFarther> initial_list;
for (size_t j = 0; j < k; j++) {
int v1 = I[i * k + j];
if (v1 == i) continue;
if (v1 < 0) break;
initial_list.emplace(D[i * k + j], v1);
}
std::vector<NodeDistFarther> shrunk_list;
shrink_neighbor_list (*qdis, initial_list, shrunk_list, dest_size);
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
if (j - begin < shrunk_list.size())
hnsw.neighbors[j] = shrunk_list[j - begin].id;
else
hnsw.neighbors[j] = -1;
}
}
}
void IndexHNSW::init_level_0_from_entry_points(
int n, const storage_idx_t *points,
const storage_idx_t *nearests)
{
std::vector<omp_lock_t> locks(ntotal);
for(int i = 0; i < ntotal; i++)
omp_init_lock(&locks[i]);
#pragma omp parallel
{
VisitedTable vt (ntotal);
DistanceComputer *dis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
float vec[storage->d];
#pragma omp for schedule(dynamic)
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = points[i];
storage_idx_t nearest = nearests[i];
storage->reconstruct (pt_id, vec);
dis->set_query (vec);
add_links_starting_from(hnsw, *dis, pt_id, nearest, (*dis)(nearest),
0, locks.data(), vt);
if (verbose && i % 10000 == 0) {
printf(" %d / %d\r", i, n);
fflush(stdout);
}
}
}
if (verbose) {
printf("\n");
}
for(int i = 0; i < ntotal; i++)
omp_destroy_lock(&locks[i]);
}
void IndexHNSW::reorder_links()
{
int M = hnsw.nb_neighbors(0);
#pragma omp parallel
{
std::vector<float> distances (M);
std::vector<size_t> order (M);
std::vector<storage_idx_t> tmp (M);
DistanceComputer *dis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
#pragma omp for
for(storage_idx_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
storage_idx_t nj = hnsw.neighbors[j];
if (nj < 0) {
end = j;
break;
}
distances[j - begin] = dis->symmetric_dis(i, nj);
tmp [j - begin] = nj;
}
fvec_argsort (end - begin, distances.data(), order.data());
for (size_t j = begin; j < end; j++) {
hnsw.neighbors[j] = tmp[order[j - begin]];
}
}
}
}
void IndexHNSW::link_singletons()
{
printf("search for singletons\n");
std::vector<bool> seen(ntotal);
for (size_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
storage_idx_t ni = hnsw.neighbors[j];
if (ni >= 0) seen[ni] = true;
}
}
int n_sing = 0, n_sing_l1 = 0;
std::vector<storage_idx_t> singletons;
for (storage_idx_t i = 0; i < ntotal; i++) {
if (!seen[i]) {
singletons.push_back(i);
n_sing++;
if (hnsw.levels[i] > 1)
n_sing_l1++;
}
}
printf(" Found %d / %ld singletons (%d appear in a level above)\n",
n_sing, ntotal, n_sing_l1);
std::vector<float>recons(singletons.size() * d);
for (int i = 0; i < singletons.size(); i++) {
FAISS_ASSERT(!"not implemented");
}
}
// storage that explicitly reconstructs vectors before computing distances
struct GenericDistanceComputer: HNSW::DistanceComputer {
const Index & storage;
std::vector<float> buf;
const float *q;
GenericDistanceComputer(const Index & storage): storage(storage)
{
d = storage.d;
buf.resize(d * 2);
}
float operator () (storage_idx_t i) override
{
storage.reconstruct(i, buf.data());
return fvec_L2sqr(q, buf.data(), d);
}
float symmetric_dis(storage_idx_t i, storage_idx_t j) override
{
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float *x) override {
q = x;
}
};
HNSW::DistanceComputer * IndexHNSW::get_distance_computer () const
{
return new GenericDistanceComputer (*storage);
}
/**************************************************************
* ReconstructFromNeighbors implementation
**************************************************************/
ReconstructFromNeighbors::ReconstructFromNeighbors(
const IndexHNSW & index, size_t k, size_t nsq):
index(index), k(k), nsq(nsq) {
M = index.hnsw.nb_neighbors(0);
FAISS_ASSERT(k <= 256);
code_size = k == 1 ? 0 : nsq;
ntotal = 0;
d = index.d;
FAISS_ASSERT(d % nsq == 0);
dsub = d / nsq;
k_reorder = -1;
}
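// Editor's note: the decoder below approximates a stored vector as a weighted
// combination of its own coded version and its M level-0 neighbors,
//   x_hat = beta[0] * decode(x) + sum_{j=1..M} beta[j] * decode(neighbor_j),
// with an independent beta row (picked from a k-entry codebook, one byte
// per subvector in codes) for each of the nsq subvectors.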
void ReconstructFromNeighbors::reconstruct(storage_idx_t i, float *x, float *tmp) const
{
const HNSW & hnsw = index.hnsw;
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
if (k == 1 || nsq == 1) {
const float * beta;
if (k == 1) {
beta = codebook.data();
} else {
int idx = codes[i];
beta = codebook.data() + idx * (M + 1);
}
float w0 = beta[0]; // weight of image itself
index.storage->reconstruct(i, tmp);
for (int l = 0; l < d; l++)
x[l] = w0 * tmp[l];
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0) ji = i;
float w = beta[j - begin + 1];
index.storage->reconstruct(ji, tmp);
for (int l = 0; l < d; l++)
x[l] += w * tmp[l];
}
} else if (nsq == 2) {
int idx0 = codes[2 * i];
int idx1 = codes[2 * i + 1];
const float *beta0 = codebook.data() + idx0 * (M + 1);
const float *beta1 = codebook.data() + (idx1 + k) * (M + 1);
index.storage->reconstruct(i, tmp);
float w0;
w0 = beta0[0];
for (int l = 0; l < dsub; l++)
x[l] = w0 * tmp[l];
w0 = beta1[0];
for (int l = dsub; l < d; l++)
x[l] = w0 * tmp[l];
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0) ji = i;
index.storage->reconstruct(ji, tmp);
float w;
w = beta0[j - begin + 1];
for (int l = 0; l < dsub; l++)
x[l] += w * tmp[l];
w = beta1[j - begin + 1];
for (int l = dsub; l < d; l++)
x[l] += w * tmp[l];
}
} else {
const float *betas[nsq];
{
const float *b = codebook.data();
const uint8_t *c = &codes[i * code_size];
for (int sq = 0; sq < nsq; sq++) {
betas[sq] = b + (*c++) * (M + 1);
b += (M + 1) * k;
}
}
index.storage->reconstruct(i, tmp);
{
int d0 = 0;
for (int sq = 0; sq < nsq; sq++) {
float w = *(betas[sq]++);
int d1 = d0 + dsub;
for (int l = d0; l < d1; l++) {
x[l] = w * tmp[l];
}
d0 = d1;
}
}
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0) ji = i;
index.storage->reconstruct(ji, tmp);
int d0 = 0;
for (int sq = 0; sq < nsq; sq++) {
float w = *(betas[sq]++);
int d1 = d0 + dsub;
for (int l = d0; l < d1; l++) {
x[l] += w * tmp[l];
}
d0 = d1;
}
}
}
}
void ReconstructFromNeighbors::reconstruct_n(storage_idx_t n0,
storage_idx_t ni,
float *x) const
{
#pragma omp parallel
{
std::vector<float> tmp(index.d);
#pragma omp for
for (storage_idx_t i = 0; i < ni; i++) {
reconstruct(n0 + i, x + i * index.d, tmp.data());
}
}
}
size_t ReconstructFromNeighbors::compute_distances(size_t n, const idx_t *shortlist,
const float *query, float *distances) const
{
std::vector<float> tmp(2 * index.d);
size_t ncomp = 0;
for (int i = 0; i < n; i++) {
if (shortlist[i] < 0) break;
reconstruct(shortlist[i], tmp.data(), tmp.data() + index.d);
distances[i] = fvec_L2sqr(query, tmp.data(), index.d);
ncomp++;
}
return ncomp;
}
void ReconstructFromNeighbors::get_neighbor_table(storage_idx_t i, float *tmp1) const
{
const HNSW & hnsw = index.hnsw;
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
size_t d = index.d;
index.storage->reconstruct(i, tmp1);
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0) ji = i;
index.storage->reconstruct(ji, tmp1 + (j - begin + 1) * d);
}
}
/// called by add_codes
void ReconstructFromNeighbors::estimate_code(
const float *x, storage_idx_t i, uint8_t *code) const
{
// fill in tmp table with the neighbor values
float *tmp1 = new float[d * (M + 1) + (d * k)];
float *tmp2 = tmp1 + d * (M + 1);
ScopeDeleter<float> del(tmp1);
// collect coordinates of base
get_neighbor_table (i, tmp1);
for (int sq = 0; sq < nsq; sq++) {
int d0 = sq * dsub;
int d1 = d0 + dsub;
{
FINTEGER ki = k, di = d, m1 = M + 1;
FINTEGER dsubi = dsub;
float zero = 0, one = 1;
sgemm_ ("N", "N", &dsubi, &ki, &m1, &one,
tmp1 + d0, &di,
codebook.data() + sq * (m1 * k), &m1,
&zero, tmp2, &dsubi);
}
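        // Editor's note: tmp1 holds the point and its M neighbors as a
        // column-major d x (M + 1) matrix; the sgemm above multiplies its
        // rows [d0, d1) by the (M + 1) x k beta codebook of subquantizer sq,
        // leaving the k candidate subvector reconstructions in tmp2.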
float min = HUGE_VAL;
int argmin = -1;
for (int j = 0; j < k; j++) {
float dis = fvec_L2sqr(x + d0, tmp2 + j * dsub, dsub);
if (dis < min) {
min = dis;
argmin = j;
}
}
code[sq] = argmin;
}
}
void ReconstructFromNeighbors::add_codes(size_t n, const float *x)
{
if (k == 1) { // nothing to encode
ntotal += n;
return;
}
codes.resize(codes.size() + code_size * n);
#pragma omp parallel for
for (int i = 0; i < n; i++) {
estimate_code(x + i * index.d, ntotal + i,
codes.data() + (ntotal + i) * code_size);
}
ntotal += n;
FAISS_ASSERT (codes.size() == ntotal * code_size);
}
/**************************************************************
* IndexHNSWFlat implementation
**************************************************************/
struct FlatL2Dis: HNSW::DistanceComputer {
Index::idx_t nb;
const float *q;
const float *b;
size_t ndis;
float operator () (storage_idx_t i) override
{
ndis++;
return (fvec_L2sqr(q, b + i * d, d));
}
float symmetric_dis(storage_idx_t i, storage_idx_t j) override
{
return (fvec_L2sqr(b + j * d, b + i * d, d));
}
FlatL2Dis(const IndexFlatL2 & storage, const float *q = nullptr):
q(q)
{
nb = storage.ntotal;
d = storage.d;
b = storage.xb.data();
ndis = 0;
}
void set_query(const float *x) override {
q = x;
}
virtual ~FlatL2Dis () {
#pragma omp critical
{
hnsw_stats.ndis += ndis;
}
}
};
IndexHNSWFlat::IndexHNSWFlat()
{
is_trained = true;
}
IndexHNSWFlat::IndexHNSWFlat(int d, int M):
IndexHNSW(new IndexFlatL2(d), M)
{
own_fields = true;
is_trained = true;
}
HNSW::DistanceComputer * IndexHNSWFlat::get_distance_computer () const
{
return new FlatL2Dis (*dynamic_cast<IndexFlatL2*> (storage));
}
/**************************************************************
* IndexHNSWPQ implementation
**************************************************************/
struct PQDis: HNSW::DistanceComputer {
Index::idx_t nb;
const uint8_t *codes;
size_t code_size;
const ProductQuantizer & pq;
const float *sdc;
std::vector<float> precomputed_table;
size_t ndis;
float operator () (storage_idx_t i) override
{
const uint8_t *code = codes + i * code_size;
const float *dt = precomputed_table.data();
float accu = 0;
for (int j = 0; j < pq.M; j++) {
accu += dt[*code++];
dt += 256;
}
ndis++;
return accu;
}
float symmetric_dis(storage_idx_t i, storage_idx_t j) override
{
const float * sdci = sdc;
float accu = 0;
const uint8_t *codei = codes + i * code_size;
const uint8_t *codej = codes + j * code_size;
for (int l = 0; l < pq.M; l++) {
accu += sdci[(*codei++) + (*codej++) * 256];
sdci += 256 * 256;
}
return accu;
}
PQDis(const IndexPQ & storage, const float *q = nullptr):
pq(storage.pq)
{
precomputed_table.resize(pq.M * pq.ksub);
nb = storage.ntotal;
d = storage.d;
codes = storage.codes.data();
code_size = pq.code_size;
FAISS_ASSERT(pq.ksub == 256);
FAISS_ASSERT(pq.sdc_table.size() == pq.ksub * pq.ksub * pq.M);
sdc = pq.sdc_table.data();
ndis = 0;
}
void set_query(const float *x) override {
pq.compute_distance_table(x, precomputed_table.data());
}
virtual ~PQDis () {
#pragma omp critical
{
hnsw_stats.ndis += ndis;
}
}
};
IndexHNSWPQ::IndexHNSWPQ() {}
IndexHNSWPQ::IndexHNSWPQ(int d, int pq_m, int M):
IndexHNSW(new IndexPQ(d, pq_m, 8), M)
{
own_fields = true;
is_trained = false;
}
void IndexHNSWPQ::train(idx_t n, const float* x)
{
IndexHNSW::train (n, x);
(dynamic_cast<IndexPQ*> (storage))->pq.compute_sdc_table();
}
HNSW::DistanceComputer * IndexHNSWPQ::get_distance_computer () const
{
return new PQDis (*dynamic_cast<IndexPQ*> (storage));
}
/**************************************************************
* IndexHNSWSQ implementation
**************************************************************/
struct SQDis: HNSW::DistanceComputer {
Index::idx_t nb;
const uint8_t *codes;
size_t code_size;
const ScalarQuantizer & sq;
const float *q;
ScalarQuantizer::DistanceComputer * dc;
float operator () (storage_idx_t i) override
{
const uint8_t *code = codes + i * code_size;
return dc->compute_distance (q, code);
}
float symmetric_dis(storage_idx_t i, storage_idx_t j) override
{
const uint8_t *codei = codes + i * code_size;
const uint8_t *codej = codes + j * code_size;
return dc->compute_code_distance (codei, codej);
}
SQDis(const IndexScalarQuantizer & storage, const float *q = nullptr):
sq(storage.sq)
{
nb = storage.ntotal;
d = storage.d;
codes = storage.codes.data();
code_size = sq.code_size;
dc = sq.get_distance_computer();
}
void set_query(const float *x) override {
q = x;
}
virtual ~SQDis () {
delete dc;
}
};
IndexHNSWSQ::IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M):
IndexHNSW (new IndexScalarQuantizer (d, qtype), M)
{
own_fields = true;
}
IndexHNSWSQ::IndexHNSWSQ() {}
HNSW::DistanceComputer * IndexHNSWSQ::get_distance_computer () const
{
return new SQDis (*dynamic_cast<IndexScalarQuantizer*> (storage));
}
/**************************************************************
* IndexHNSW2Level implementation
**************************************************************/
IndexHNSW2Level::IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M):
IndexHNSW (new Index2Layer (quantizer, nlist, m_pq), M)
{
own_fields = true;
is_trained = false;
}
IndexHNSW2Level::IndexHNSW2Level() {}
struct Distance2Level: HNSW::DistanceComputer {
const Index2Layer & storage;
std::vector<float> buf;
const float *q;
const float *pq_l1_tab, *pq_l2_tab;
Distance2Level(const Index2Layer & storage): storage(storage)
{
d = storage.d;
FAISS_ASSERT(storage.pq.dsub == 4);
pq_l2_tab = storage.pq.centroids.data();
buf.resize(2 * d);
}
float symmetric_dis(storage_idx_t i, storage_idx_t j) override
{
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float *x) override {
q = x;
}
};
// well optimized for xNN+PQNN
struct DistanceXPQ4: Distance2Level {
int M, k;
DistanceXPQ4(const Index2Layer & storage):
Distance2Level (storage)
{
const IndexFlat *quantizer =
dynamic_cast<IndexFlat*> (storage.q1.quantizer);
FAISS_ASSERT(quantizer);
M = storage.pq.M;
pq_l1_tab = quantizer->xb.data();
}
float operator () (storage_idx_t i) override
{
const uint8_t *code = storage.codes.data() + i * storage.code_size;
long key = 0;
memcpy (&key, code, storage.code_size_1);
code += storage.code_size_1;
// walking pointers
const float *qa = q;
const __m128 *l1_t = (const __m128 *)(pq_l1_tab + d * key);
const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int m = 0; m < M; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = l1_t[m] + pq_l2_t[*code++];
__m128 diff = qi - recons;
accu += diff * diff;
pq_l2_t += 256;
qa += 4;
}
accu = _mm_hadd_ps (accu, accu);
accu = _mm_hadd_ps (accu, accu);
return _mm_cvtss_f32 (accu);
}
};
// well optimized for 2xNN+PQNN
struct Distance2xXPQ4: Distance2Level {
int M_2, mi_nbits;
Distance2xXPQ4(const Index2Layer & storage):
Distance2Level (storage)
{
const MultiIndexQuantizer *mi =
dynamic_cast<MultiIndexQuantizer*> (storage.q1.quantizer);
FAISS_ASSERT(mi);
FAISS_ASSERT(storage.pq.M % 2 == 0);
M_2 = storage.pq.M / 2;
mi_nbits = mi->pq.nbits;
pq_l1_tab = mi->pq.centroids.data();
}
float operator () (storage_idx_t i) override
{
const uint8_t *code = storage.codes.data() + i * storage.code_size;
long key01 = 0;
memcpy (&key01, code, storage.code_size_1);
code += storage.code_size_1;
// walking pointers
const float *qa = q;
const __m128 *pq_l1_t = (const __m128 *)pq_l1_tab;
const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int mi_m = 0; mi_m < 2; mi_m++) {
long l1_idx = key01 & ((1L << mi_nbits) - 1);
const __m128 * pq_l1 = pq_l1_t + M_2 * l1_idx;
for (int m = 0; m < M_2; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = pq_l1[m] + pq_l2_t[*code++];
__m128 diff = qi - recons;
accu += diff * diff;
pq_l2_t += 256;
qa += 4;
}
pq_l1_t += M_2 << mi_nbits;
key01 >>= mi_nbits;
}
accu = _mm_hadd_ps (accu, accu);
accu = _mm_hadd_ps (accu, accu);
return _mm_cvtss_f32 (accu);
}
};
HNSW::DistanceComputer * IndexHNSW2Level::get_distance_computer () const
{
const Index2Layer *storage2l =
dynamic_cast<Index2Layer*>(storage);
if (storage2l) {
const MultiIndexQuantizer *mi =
dynamic_cast<MultiIndexQuantizer*> (storage2l->q1.quantizer);
if (mi && storage2l->pq.M % 2 == 0 && storage2l->pq.dsub == 4) {
return new Distance2xXPQ4(*storage2l);
}
const IndexFlat *fl =
dynamic_cast<IndexFlat*> (storage2l->q1.quantizer);
if (fl && storage2l->pq.dsub == 4) {
return new DistanceXPQ4(*storage2l);
}
}
// IVFPQ and cases not handled above
return new GenericDistanceComputer (*storage);
}
namespace {
// same as search_from_candidates but uses the visited table to store
// two states per node:
// visno     -> is in result list
// visno + 1 -> in result list + in candidates
int search_from_candidates_2(const HNSW & hnsw,
DistanceComputer & qdis, int k,
idx_t *I, float * D,
MinimaxHeap &candidates,
VisitedTable &vt,
int level, int nres_in = 0)
{
int nres = nres_in;
int ndis = 0;
for (int i = 0; i < candidates.size(); i++) {
idx_t v1 = candidates.ids[i];
float d = candidates.dis[i];
FAISS_ASSERT(v1 >= 0);
vt.visited[v1] = vt.visno + 1;
}
bool do_dis_check = hnsw.check_relative_distance;
int nstep = 0;
while (candidates.size() > 0) {
float d0 = 0;
int v0 = candidates.pop_min(&d0);
if (do_dis_check) {
// tricky stopping condition: more than ef distances
// smaller than d0 have already been processed
int n_dis_below = candidates.count_below(d0);
if(n_dis_below >= hnsw.efSearch) {
break;
}
}
size_t begin, end;
hnsw.neighbor_range(v0, level, &begin, &end);
for (size_t j = begin; j < end; j++) {
int v1 = hnsw.neighbors[j];
if (v1 < 0) break;
if (vt.visited[v1] == vt.visno + 1) {
// nothing to do
} else {
ndis++;
float d = qdis(v1);
candidates.push(v1, d);
// never seen before --> add to heap
if (vt.visited[v1] < vt.visno) {
if (nres < k) {
faiss::maxheap_push (++nres, D, I, d, v1);
} else if (d < D[0]) {
faiss::maxheap_pop (nres--, D, I);
faiss::maxheap_push (++nres, D, I, d, v1);
}
}
vt.visited[v1] = vt.visno + 1;
}
}
nstep++;
if (!do_dis_check && nstep > hnsw.efSearch) {
break;
}
}
if (level == 0) {
#pragma omp critical
{
hnsw_stats.n1 ++;
if (candidates.size() == 0)
hnsw_stats.n2 ++;
}
}
return nres;
}
} // anonymous namespace
void IndexHNSW2Level::search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
if (dynamic_cast<const Index2Layer*>(storage)) {
IndexHNSW::search (n, x, k, distances, labels);
} else { // "mixed" search
const IndexIVFPQ *index_ivfpq =
dynamic_cast<const IndexIVFPQ*>(storage);
int nprobe = index_ivfpq->nprobe;
long * coarse_assign = new long [n * nprobe];
ScopeDeleter<long> del (coarse_assign);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
index_ivfpq->quantizer->search (n, x, nprobe, coarse_dis, coarse_assign);
index_ivfpq->search_preassigned (
n, x, k, coarse_assign, coarse_dis, distances, labels, false);
#pragma omp parallel
{
VisitedTable vt (ntotal);
DistanceComputer *dis = get_distance_computer();
ScopeDeleter1<DistanceComputer> del(dis);
int candidates_size = hnsw.upper_beam;
MinimaxHeap candidates(candidates_size);
#pragma omp for
for(int i = 0; i < n; i++) {
idx_t * idxi = labels + i * k;
float * simi = distances + i * k;
dis->set_query(x + i * d);
// mark all inverted list elements as visited
for (int j = 0; j < nprobe; j++) {
idx_t key = coarse_assign[j + i * nprobe];
if (key < 0) break;
const std::vector<idx_t> & ids = index_ivfpq->ids[key];
for (int jj = 0; jj < ids.size(); jj++) {
vt.set (ids[jj]);
}
}
candidates.clear();
// copy the upper_beam elements to candidates list
int search_policy = 2;
if (search_policy == 1) {
for (int j = 0 ; j < hnsw.upper_beam && j < k; j++) {
if (idxi[j] < 0) break;
candidates.push (idxi[j], simi[j]);
// search_from_candidates adds them back
idxi[j] = -1;
simi[j] = HUGE_VAL;
}
// reorder from sorted to heap
maxheap_heapify (k, simi, idxi, simi, idxi, k);
search_from_candidates (
hnsw, *dis, k, idxi, simi,
candidates, vt, 0, k);
vt.advance();
} else if (search_policy == 2) {
for (int j = 0 ; j < hnsw.upper_beam && j < k; j++) {
if (idxi[j] < 0) break;
candidates.push (idxi[j], simi[j]);
}
// reorder from sorted to heap
maxheap_heapify (k, simi, idxi, simi, idxi, k);
search_from_candidates_2 (
hnsw, *dis, k, idxi, simi,
candidates, vt, 0, k);
vt.advance ();
vt.advance ();
}
maxheap_reorder (k, simi, idxi);
}
}
}
}
void IndexHNSW2Level::flip_to_ivf ()
{
Index2Layer *storage2l =
dynamic_cast<Index2Layer*>(storage);
FAISS_THROW_IF_NOT (storage2l);
IndexIVFPQ * index_ivfpq =
new IndexIVFPQ (storage2l->q1.quantizer,
d, storage2l->q1.nlist,
storage2l->pq.M, 8);
index_ivfpq->pq = storage2l->pq;
index_ivfpq->is_trained = storage2l->is_trained;
index_ivfpq->precompute_table();
index_ivfpq->own_fields = storage2l->q1.own_fields;
storage2l->transfer_to_IVFPQ(*index_ivfpq);
index_ivfpq->make_direct_map (true);
storage = index_ivfpq;
delete storage2l;
}
} // namespace faiss
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <vector>
#include <omp.h>
#include "IndexFlat.h"
#include "IndexPQ.h"
#include "IndexScalarQuantizer.h"
#include "utils.h"
namespace faiss {
/** Implementation of the Hierarchical Navigable Small World
* datastructure.
*
* Efficient and robust approximate nearest neighbor search using
* Hierarchical Navigable Small World graphs
*
* Yu. A. Malkov, D. A. Yashunin, arXiv 2017
*
 * This implementation is heavily influenced by the NMSlib
 * implementation by Yury Malkov and Leonid Boytsov
* (https://github.com/searchivarius/nmslib)
*
* The HNSW object stores only the neighbor link structure, see
* IndexHNSW below for the full index object.
*/
struct VisitedTable;
struct HNSW {
/// internal storage of vectors (32 bits: this is expensive)
typedef int storage_idx_t;
/// Faiss results are 64-bit
typedef faiss::Index::idx_t idx_t;
/** The HNSW structure does not store vectors, it only accesses
* them through this class.
*
 * Functions are guaranteed to be accessed from only one thread. */
struct DistanceComputer {
idx_t d;
/// called before computing distances
virtual void set_query (const float *x) = 0;
/// compute distance of vector i to current query
virtual float operator () (storage_idx_t i) = 0;
/// compute distance between two stored vectors
virtual float symmetric_dis(storage_idx_t i, storage_idx_t j) = 0;
virtual ~DistanceComputer () {}
};
/// assignment probability to each layer (sum=1)
std::vector<double> assign_probas;
/// number of neighbors stored per layer (cumulative), should not
/// be changed after first add
std::vector<int> cum_nneighbor_per_level;
/// level of each vector (base level = 1), size = ntotal
std::vector<int> levels;
/// offsets[i] is the offset in the neighbors array where vector i is stored
/// size ntotal + 1
std::vector<size_t> offsets;
/// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
/// for all levels. this is where all storage goes.
std::vector<storage_idx_t> neighbors;
/// entry point in the search structure (one of the points with maximum level)
storage_idx_t entry_point;
faiss::RandomGenerator rng;
/// maximum level
int max_level;
/// expansion factor at construction time
int efConstruction;
/// expansion factor at search time
int efSearch;
/// during search: do we check whether the next best distance is good enough?
bool check_relative_distance;
/// number of entry points in levels > 0.
int upper_beam;
// methods that initialize the tree sizes
/// initialize the assign_probas and cum_nneighbor_per_level to
/// have 2*M links on level 0 and M links on levels > 0
void set_default_probas(int M, float levelMult);
/// set nb of neighbors for this level (before adding anything)
void set_nb_neighbors(int level_no, int n);
// methods that access the tree sizes
/// nb of neighbors for this level
int nb_neighbors(int layer_no) const;
/// cumulative nb up to (and excluding) this level
int cum_nb_neighbors(int layer_no) const;
/// range of entries in the neighbors table of vertex no at layer_no
void neighbor_range(idx_t no, int layer_no,
size_t * begin, size_t * end) const;
/// only mandatory parameter: nb of neighbors
explicit HNSW(int M = 32);
/// pick a random level for a new point
int random_level();
/// add n random levels to table (for debugging...)
void fill_with_random_links(size_t n);
/** add point pt_id on all levels <= pt_level and build the link
* structure for them. */
void add_with_locks(DistanceComputer & ptdis, int pt_level, int pt_id,
std::vector<omp_lock_t> & locks,
VisitedTable &vt);
/// search interface
void search(DistanceComputer & qdis, int k,
idx_t *I, float * D,
VisitedTable &vt) const;
void reset();
void clear_neighbor_tables(int level);
void print_neighbor_stats(int level) const;
};
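/* Illustrative sketch (not library code): with the default constructor
 * HNSW hnsw(32), set_default_probas allots 2*M = 64 links on level 0 and
 * M = 32 links on each upper level, so
 *
 *   hnsw.nb_neighbors(0)     == 64
 *   hnsw.nb_neighbors(1)     == 32
 *   hnsw.cum_nb_neighbors(2) == 96  // table entries below level 2
 *
 * and neighbor_range(no, l, &begin, &end) yields the slice of `neighbors`
 * that holds node no's level-l links.
 */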
struct HNSWStats {
size_t n1, n2, n3;
size_t ndis;
size_t nreorder;
bool view;
HNSWStats () {reset (); }
void reset ();
};
// global var that collects them all
extern HNSWStats hnsw_stats;
class IndexHNSW;
struct ReconstructFromNeighbors {
typedef Index::idx_t idx_t;
typedef HNSW::storage_idx_t storage_idx_t;
const IndexHNSW & index;
size_t M; // number of neighbors
size_t k; // number of codebook entries
size_t nsq; // number of subvectors
size_t code_size;
int k_reorder; // nb to reorder. -1 = all
std::vector<float> codebook; // size nsq * k * (M + 1)
std::vector<uint8_t> codes; // size ntotal * code_size
size_t ntotal;
size_t d, dsub; // derived values
ReconstructFromNeighbors(const IndexHNSW & index,
size_t k=256, size_t nsq=1);
/// codes must be added in the correct order and the IndexHNSW
/// must be populated and sorted
void add_codes(size_t n, const float *x);
size_t compute_distances(size_t n, const idx_t *shortlist,
const float *query, float *distances) const;
/// called by add_codes
void estimate_code(const float *x, storage_idx_t i, uint8_t *code) const;
/// called by compute_distances
void reconstruct(storage_idx_t i, float *x, float *tmp) const;
void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float *x) const;
/// get the (M+1)-by-d table of neighbor coordinates for vector i
void get_neighbor_table(storage_idx_t i, float *out) const;
};
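/* Size sketch (illustrative, under the field comments above): each stored
 * vector is approximated from its M level-0 neighbors; with nsq subvectors
 * (dsub = d / nsq being the derived subvector size) and k codebook entries
 * per subvector, the codebook holds nsq * k * (M + 1) weights and `codes`
 * grows by code_size bytes per added vector.
 */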
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexHNSW: Index {
typedef HNSW::storage_idx_t storage_idx_t;
    // the link structure
HNSW hnsw;
// the sequential storage
bool own_fields;
Index * storage;
ReconstructFromNeighbors *reconstruct_from_neighbors;
explicit IndexHNSW (int d = 0, int M = 32);
explicit IndexHNSW (Index * storage, int M = 32);
~IndexHNSW() override;
// get a DistanceComputer object for this kind of storage
virtual HNSW::DistanceComputer * get_distance_computer() const = 0;
void add(idx_t n, const float *x) override;
/// Trains the storage if needed
void train(idx_t n, const float* x) override;
/// entry point for search
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset () override;
void shrink_level_0_neighbors(int size);
/** Perform search only on level 0, given the starting points for
* each vertex.
*
* @param search_type 1:perform one search per nprobe, 2: enqueue
* all entry points
*/
void search_level_0(idx_t n, const float *x, idx_t k,
const storage_idx_t *nearest, const float *nearest_d,
float *distances, idx_t *labels, int nprobe = 1,
int search_type = 1) const;
/// alternative graph building
void init_level_0_from_knngraph(
int k, const float *D, const idx_t *I);
/// alternative graph building
void init_level_0_from_entry_points(
int npt, const storage_idx_t *points,
const storage_idx_t *nearests);
// reorder links from nearest to farthest
void reorder_links();
void link_singletons();
};
/** Flat index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWFlat: IndexHNSW {
IndexHNSWFlat();
IndexHNSWFlat(int d, int M);
HNSW::DistanceComputer * get_distance_computer() const override;
};
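/* Minimal usage sketch (assumes only the API declared above; xb, xq,
 * distances and labels are caller-allocated arrays, not library state):
 *
 *   faiss::IndexHNSWFlat index (d, 32);   // M = 32 links per node
 *   index.hnsw.efConstruction = 40;       // build-time expansion factor
 *   index.add (nb, xb);                   // builds the graph
 *   index.hnsw.efSearch = 64;             // search-time expansion factor
 *   index.search (nq, xq, k, distances, labels);
 */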
/** PQ index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWPQ: IndexHNSW {
IndexHNSWPQ();
IndexHNSWPQ(int d, int pq_m, int M);
void train(idx_t n, const float* x) override;
HNSW::DistanceComputer * get_distance_computer() const override;
};
/** SQ index topped with a HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWSQ: IndexHNSW {
IndexHNSWSQ();
IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
HNSW::DistanceComputer * get_distance_computer() const override;
};
/** 2-level code structure with fast random access
*/
struct IndexHNSW2Level: IndexHNSW {
IndexHNSW2Level();
IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M);
HNSW::DistanceComputer * get_distance_computer() const override;
void flip_to_ivf();
/// entry point for search
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
};
} // namespace faiss
...@@ -23,6 +23,81 @@ ...@@ -23,6 +23,81 @@
namespace faiss { namespace faiss {
/*****************************************
* Level1Quantizer implementation
******************************************/
Level1Quantizer::Level1Quantizer (Index * quantizer, size_t nlist):
quantizer (quantizer),
nlist (nlist),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr)
{
cp.niter = 10;
}
Level1Quantizer::Level1Quantizer ():
quantizer (nullptr),
nlist (0),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr)
{}
Level1Quantizer::~Level1Quantizer ()
{
if (own_fields) delete quantizer;
}
void Level1Quantizer::train_q1 (size_t n, const float *x, bool verbose, MetricType metric_type)
{
size_t d = quantizer->d;
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training level-1 quantizer on %ld vectors in %ldD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %ldD%s\n",
n, d,
                clustering_index ? " (user-provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
}
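// Configuration sketch (hypothetical values; fields declared in IndexIVF.h).
// The three quantizer_trains_alone modes handled above are:
//
//   ivf.quantizer_trains_alone = 0; // k-means, assigning with `quantizer`
//                                   // (or clustering_index when set)
//   ivf.quantizer_trains_alone = 1; // hand the training set to quantizer->train()
//   ivf.quantizer_trains_alone = 2; // k-means on a flat L2 index, then add
//                                   // the centroids to the quantizer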
/***************************************** /*****************************************
* IndexIVF implementation * IndexIVF implementation
******************************************/ ******************************************/
...@@ -31,13 +106,9 @@ namespace faiss { ...@@ -31,13 +106,9 @@ namespace faiss {
IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
MetricType metric): MetricType metric):
Index (d, metric), Index (d, metric),
nlist (nlist), Level1Quantizer (quantizer, nlist),
nprobe (1), nprobe (1),
quantizer (quantizer), max_codes (0),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr),
ids (nlist),
maintain_direct_map (false) maintain_direct_map (false)
{ {
FAISS_THROW_IF_NOT (d == quantizer->d); FAISS_THROW_IF_NOT (d == quantizer->d);
...@@ -49,16 +120,13 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, ...@@ -49,16 +120,13 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
// here we set a low # iterations because this is typically used // here we set a low # iterations because this is typically used
// for large clusterings (nb this is not used for the MultiIndex, // for large clusterings (nb this is not used for the MultiIndex,
// for which quantizer_trains_alone = true) // for which quantizer_trains_alone = true)
cp.niter = 10;
cp.verbose = verbose;
code_size = 0; // let sub-classes set this code_size = 0; // let sub-classes set this
codes.resize(nlist); ids.resize (nlist);
codes.resize (nlist);
} }
IndexIVF::IndexIVF (): IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr), nprobe (1), max_codes (0),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false) maintain_direct_map (false)
{} {}
...@@ -109,6 +177,78 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k, ...@@ -109,6 +177,78 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k,
} }
void IndexIVF::reconstruct (idx_t key, float* recons) const
{
FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
"direct map is not initialized");
long list_no = direct_map[key] >> 32;
long offset = direct_map[key] & 0xffffffff;
reconstruct_from_offset (list_no, offset, recons);
}
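// The direct map packs an entry's location into a single 64-bit value; a
// sketch of the convention decoded above (illustrative, not a library helper):
//
//   long packed  = ((long) list_no << 32) | offset;  // requires offset < 2^32
//   long list_no = packed >> 32;
//   long offset  = packed & 0xffffffff;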
void IndexIVF::reconstruct_n (idx_t i0, idx_t ni, float* recons) const
{
FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
for (long list_no = 0; list_no < nlist; list_no++) {
const std::vector<long>& idlist = ids[list_no];
for (long offset = 0; offset < idlist.size(); offset++) {
long id = idlist[offset];
if (!(id >= i0 && id < i0 + ni)) {
continue;
}
float* reconstructed = recons + (id - i0) * d;
reconstruct_from_offset (list_no, offset, reconstructed);
}
}
}
void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
// search_preassigned() with `store_pairs` enabled to obtain the list_no
// and offset into `codes` for reconstruction
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true /* store_pairs */);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
int list_no = key >> 32;
int offset = key & 0xffffffff;
// Update label to the actual id
labels[ij] = ids[list_no][offset];
reconstruct_from_offset (list_no, offset, reconstructed);
}
}
}
}
void IndexIVF::reconstruct_from_offset (long list_no, long offset,
float* recons) const
{
FAISS_THROW_MSG ("reconstruct_from_offset not implemented");
}
void IndexIVF::reset () void IndexIVF::reset ()
{ {
ntotal = 0; ntotal = 0;
...@@ -156,48 +296,11 @@ long IndexIVF::remove_ids (const IDSelector & sel) ...@@ -156,48 +296,11 @@ long IndexIVF::remove_ids (const IDSelector & sel)
void IndexIVF::train (idx_t n, const float *x) void IndexIVF::train (idx_t n, const float *x)
{ {
if (quantizer->is_trained && (quantizer->ntotal == nlist)) { if (verbose)
if (verbose) printf ("Training level-1 quantizer\n");
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone == 1) { train_q1 (n, x, verbose, metric_type);
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
if (verbose) if (verbose)
printf ("Training IVF residual\n"); printf ("Training IVF residual\n");
...@@ -337,7 +440,6 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -337,7 +440,6 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
IndexIVF::~IndexIVF() IndexIVF::~IndexIVF()
{ {
if (own_fields) delete quantizer;
} }
...@@ -408,13 +510,13 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids, ...@@ -408,13 +510,13 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
ntotal += n_add; ntotal += n_add;
} }
void IndexIVFFlatStats::reset() void IndexIVFStats::reset()
{ {
memset ((void*)this, 0, sizeof (*this)); memset ((void*)this, 0, sizeof (*this));
} }
IndexIVFFlatStats indexIVFFlat_stats; IndexIVFStats indexIVF_stats;
namespace { namespace {
...@@ -437,6 +539,7 @@ void search_knn_inner_product (const IndexIVFFlat & ivf, ...@@ -437,6 +539,7 @@ void search_knn_inner_product (const IndexIVFFlat & ivf,
float * __restrict simi = res->get_val (i); float * __restrict simi = res->get_val (i);
long * __restrict idxi = res->get_ids (i); long * __restrict idxi = res->get_ids (i);
minheap_heapify (k, simi, idxi); minheap_heapify (k, simi, idxi);
size_t nscan = 0;
for (size_t ik = 0; ik < ivf.nprobe; ik++) { for (size_t ik = 0; ik < ivf.nprobe; ik++) {
long key = keysi[ik]; /* select the list */ long key = keysi[ik]; /* select the list */
...@@ -462,13 +565,16 @@ void search_knn_inner_product (const IndexIVFFlat & ivf, ...@@ -462,13 +565,16 @@ void search_knn_inner_product (const IndexIVFFlat & ivf,
minheap_push (k, simi, idxi, ip, id); minheap_push (k, simi, idxi, ip, id);
} }
} }
ndis += list_size; nscan += list_size;
if (ivf.max_codes && nscan >= ivf.max_codes)
break;
} }
ndis += nscan;
minheap_reorder (k, simi, idxi); minheap_reorder (k, simi, idxi);
} }
indexIVFFlat_stats.nq += nx; indexIVF_stats.nq += nx;
indexIVFFlat_stats.nlist += nlistv; indexIVF_stats.nlist += nlistv;
indexIVFFlat_stats.ndis += ndis; indexIVF_stats.ndis += ndis;
} }
...@@ -490,6 +596,8 @@ void search_knn_L2sqr (const IndexIVFFlat &ivf, ...@@ -490,6 +596,8 @@ void search_knn_L2sqr (const IndexIVFFlat &ivf,
long * __restrict idxi = res->get_ids (i); long * __restrict idxi = res->get_ids (i);
maxheap_heapify (k, disi, idxi); maxheap_heapify (k, disi, idxi);
size_t nscan = 0;
for (size_t ik = 0; ik < ivf.nprobe; ik++) { for (size_t ik = 0; ik < ivf.nprobe; ik++) {
long key = keysi[ik]; /* select the list */ long key = keysi[ik]; /* select the list */
if (key < 0) { if (key < 0) {
...@@ -514,13 +622,16 @@ void search_knn_L2sqr (const IndexIVFFlat &ivf, ...@@ -514,13 +622,16 @@ void search_knn_L2sqr (const IndexIVFFlat &ivf,
maxheap_push (k, disi, idxi, disij, id); maxheap_push (k, disi, idxi, disij, id);
} }
} }
ndis += list_size; nscan += list_size;
if (ivf.max_codes && nscan >= ivf.max_codes)
break;
} }
ndis += nscan;
maxheap_reorder (k, disi, idxi); maxheap_reorder (k, disi, idxi);
} }
indexIVFFlat_stats.nq += nx; indexIVF_stats.nq += nx;
indexIVFFlat_stats.nlist += nlistv; indexIVF_stats.nlist += nlistv;
indexIVFFlat_stats.ndis += ndis; indexIVF_stats.ndis += ndis;
} }
...@@ -639,20 +750,11 @@ void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x) ...@@ -639,20 +750,11 @@ void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
} }
void IndexIVFFlat::reconstruct_from_offset (long list_no, long offset,
float* recons) const
void IndexIVFFlat::reconstruct (idx_t key, float * recons) const
{ {
FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal, memcpy (recons, &codes[list_no][offset * code_size], d * sizeof(recons[0]));
"direct map is not initialized");
int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff;
memcpy (recons, &codes[list_no][ofs * code_size], d * sizeof(recons[0]));
} }
} // namespace faiss } // namespace faiss
...@@ -24,6 +24,38 @@ ...@@ -24,6 +24,38 @@
namespace faiss { namespace faiss {
/** Encapsulates a quantizer object for the IndexIVF
*
* The class isolates the fields that are independent of the storage
* of the lists (especially training)
*/
struct Level1Quantizer {
Index * quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train_q1 (size_t n, const float *x, bool verbose,
MetricType metric_type);
Level1Quantizer (Index * quantizer, size_t nlist);
Level1Quantizer ();
~Level1Quantizer ();
};
/** Index based on an inverted file (IVF) /** Index based on an inverted file (IVF)
* *
...@@ -42,22 +74,9 @@ namespace faiss { ...@@ -42,22 +74,9 @@ namespace faiss {
* Sub-classes implement a post-filtering of the index that refines * Sub-classes implement a post-filtering of the index that refines
 * the distance estimation from the query to database vectors. * the distance estimation from the query to database vectors.
*/ */
struct IndexIVF: Index { struct IndexIVF: Index, Level1Quantizer {
size_t nlist; ///< number of possible key values
size_t nprobe; ///< number of probes at query time size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
Index * quantizer; ///< quantizer that maps vectors to inverted lists
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
...@@ -74,7 +93,7 @@ struct IndexIVF: Index { ...@@ -74,7 +93,7 @@ struct IndexIVF: Index {
* be deleted while the IndexIVF is in use. * be deleted while the IndexIVF is in use.
*/ */
IndexIVF (Index * quantizer, size_t d, size_t nlist, IndexIVF (Index * quantizer, size_t d, size_t nlist,
MetricType metric = METRIC_INNER_PRODUCT); MetricType metric = METRIC_L2);
void reset() override; void reset() override;
...@@ -115,6 +134,42 @@ struct IndexIVF: Index { ...@@ -115,6 +134,42 @@ struct IndexIVF: Index {
virtual void search (idx_t n, const float *x, idx_t k, virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override; float *distances, idx_t *labels) const override;
void reconstruct (idx_t key, float* recons) const override;
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d
*/
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d)
*/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset (long list_no, long offset,
float* recons) const;
/// Dataset manipulation functions /// Dataset manipulation functions
...@@ -157,18 +212,17 @@ struct IndexIVF: Index { ...@@ -157,18 +212,17 @@ struct IndexIVF: Index {
}; };
struct IndexIVFFlatStats { struct IndexIVFStats {
size_t nq; // nb of queries run size_t nq; // nb of queries run
size_t nlist; // nb of inverted lists scanned size_t nlist; // nb of inverted lists scanned
    size_t ndis;     // nb of distances computed size_t ndis;     // nb of distances computed
size_t npartial; // nb of bound computations (IndexIVFFlatIPBounds)
IndexIVFFlatStats () {reset (); } IndexIVFStats () {reset (); }
void reset (); void reset ();
}; };
// global var that collects them all // global var that collects them all
extern IndexIVFFlatStats indexIVFFlat_stats; extern IndexIVFStats indexIVF_stats;
...@@ -182,7 +236,7 @@ struct IndexIVFFlat: IndexIVF { ...@@ -182,7 +236,7 @@ struct IndexIVFFlat: IndexIVF {
IndexIVFFlat ( IndexIVFFlat (
Index * quantizer, size_t d, size_t nlist_, Index * quantizer, size_t d, size_t nlist_,
MetricType = METRIC_INNER_PRODUCT); MetricType = METRIC_L2);
/// same as add_with_ids, with precomputed coarse quantizer /// same as add_with_ids, with precomputed coarse quantizer
virtual void add_core (idx_t n, const float * x, const long *xids, virtual void add_core (idx_t n, const float * x, const long *xids,
...@@ -213,7 +267,8 @@ struct IndexIVFFlat: IndexIVF { ...@@ -213,7 +267,8 @@ struct IndexIVFFlat: IndexIVF {
*/ */
void update_vectors (int nv, idx_t *idx, const float *v); void update_vectors (int nv, idx_t *idx, const float *v);
void reconstruct(idx_t key, float* recons) const override; void reconstruct_from_offset (long list_no, long offset,
float* recons) const override;
IndexIVFFlat () {} IndexIVFFlat () {}
}; };
......
...@@ -53,7 +53,6 @@ IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist, ...@@ -53,7 +53,6 @@ IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist,
by_residual = true; by_residual = true;
use_precomputed_table = 0; use_precomputed_table = 0;
scan_table_threshold = 0; scan_table_threshold = 0;
max_codes = 0; // means unlimited
polysemous_training = nullptr; polysemous_training = nullptr;
do_polysemous_training = false; do_polysemous_training = false;
...@@ -192,6 +191,23 @@ void IndexIVFPQ::add_with_ids (idx_t n, const float * x, const long *xids) ...@@ -192,6 +191,23 @@ void IndexIVFPQ::add_with_ids (idx_t n, const float * x, const long *xids)
void IndexIVFPQ::add_core_o (idx_t n, const float * x, const long *xids, void IndexIVFPQ::add_core_o (idx_t n, const float * x, const long *xids,
float *residuals_2, const long *precomputed_idx) float *residuals_2, const long *precomputed_idx)
{ {
idx_t bs = 32768;
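    // add in fixed-size batches so the temporary assignment and residual
    // buffers stay bounded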
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("IndexIVFPQ::add_core_o: adding %ld:%ld / %ld\n",
i0, i1, n);
}
add_core_o (i1 - i0, x + i0 * d,
xids ? xids + i0 : nullptr,
residuals_2 ? residuals_2 + i0 * d : nullptr,
precomputed_idx ? precomputed_idx + i0 : nullptr);
}
return;
}
FAISS_THROW_IF_NOT (is_trained); FAISS_THROW_IF_NOT (is_trained);
double t0 = getmillisecs (); double t0 = getmillisecs ();
const long * idx; const long * idx;
...@@ -271,50 +287,22 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const long *xids, ...@@ -271,50 +287,22 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const long *xids,
ntotal += n; ntotal += n;
} }
void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
{
FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
std::vector<float> centroid (d);
for (int key = 0; key < nlist; key++) {
const std::vector<long> & idlist = ids[key];
const uint8_t * code_line = codes[key].data();
for (long ofs = 0; ofs < idlist.size(); ofs++) {
long id = idlist[ofs];
if (!(id >= i0 && id < i0 + ni)) continue;
float *r = recons + d * (id - i0);
if (by_residual) {
quantizer->reconstruct (key, centroid.data());
pq.decode (code_line + ofs * pq.code_size, r);
for (int j = 0; j < d; j++) {
r[j] += centroid[j];
}
} else {
pq.decode (code_line + ofs * pq.code_size, r);
}
}
}
}
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const void IndexIVFPQ::reconstruct_from_offset (long list_no, long offset,
float* recons) const
{ {
FAISS_THROW_IF_NOT (direct_map.size() == ntotal); const uint8_t* code = &(codes[list_no][offset * code_size]);
int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff;
quantizer->reconstruct (list_no, recons); if (by_residual) {
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]); std::vector<float> centroid(d);
quantizer->reconstruct (list_no, centroid.data());
for (size_t m = 0; m < pq.M; m++) { pq.decode (code, recons);
float * out = recons + m * pq.dsub; for (int i = 0; i < d; ++i) {
const float * cent = pq.get_centroids (m, code[m]); recons[i] += centroid[i];
for (size_t i = 0; i < pq.dsub; i++) { }
out[i] += cent[i]; } else {
} pq.decode (code, recons);
} }
} }
...@@ -1029,53 +1017,6 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k, ...@@ -1029,53 +1017,6 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
} }
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ () IndexIVFPQ::IndexIVFPQ ()
{ {
// initialize some runtime values // initialize some runtime values
...@@ -1083,7 +1024,6 @@ IndexIVFPQ::IndexIVFPQ () ...@@ -1083,7 +1024,6 @@ IndexIVFPQ::IndexIVFPQ ()
scan_table_threshold = 0; scan_table_threshold = 0;
do_polysemous_training = false; do_polysemous_training = false;
polysemous_ht = 0; polysemous_ht = 0;
max_codes = 0;
polysemous_training = nullptr; polysemous_training = nullptr;
} }
...@@ -1209,29 +1149,22 @@ void IndexIVFPQR::add_core (idx_t n, const float *x, const long *xids, ...@@ -1209,29 +1149,22 @@ void IndexIVFPQR::add_core (idx_t n, const float *x, const long *xids,
} }
void IndexIVFPQR::search ( void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k,
idx_t n, const float *x, idx_t k, const idx_t *idx,
float *distances, idx_t *labels) const const float *L1_dis,
float *distances, idx_t *labels,
bool store_pairs) const
{ {
FAISS_THROW_IF_NOT (is_trained);
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * L1_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (L1_dis);
uint64_t t0; uint64_t t0;
TIC;
quantizer->search (n, x, nprobe, L1_dis, idx);
indexIVFPQ_stats.assign_cycles += TOC;
TIC; TIC;
size_t k_coarse = long(k * k_factor); size_t k_coarse = long(k * k_factor);
idx_t *coarse_labels = new idx_t [k_coarse * n]; idx_t *coarse_labels = new idx_t [k_coarse * n];
ScopeDeleter<idx_t> del3 (coarse_labels); ScopeDeleter<idx_t> del1 (coarse_labels);
{ // query with quantizer levels 1 and 2. { // query with quantizer levels 1 and 2.
float *coarse_distances = new float [k_coarse * n]; float *coarse_distances = new float [k_coarse * n];
ScopeDeleter<float> del(coarse_distances); ScopeDeleter<float> del(coarse_distances);
search_preassigned (n, x, k_coarse, IndexIVFPQ::search_preassigned (n, x, k_coarse,
idx, L1_dis, coarse_distances, coarse_labels, idx, L1_dis, coarse_distances, coarse_labels,
true); true);
} }
...@@ -1287,7 +1220,8 @@ void IndexIVFPQR::search ( ...@@ -1287,7 +1220,8 @@ void IndexIVFPQR::search (
if (dis < heap_sim[0]) { if (dis < heap_sim[0]) {
maxheap_pop (k, heap_sim, heap_ids); maxheap_pop (k, heap_sim, heap_ids);
maxheap_push (k, heap_sim, heap_ids, dis, id); long id_or_pair = store_pairs ? sl : id;
maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair);
} }
n_refine ++; n_refine ++;
} }
...@@ -1298,25 +1232,21 @@ void IndexIVFPQR::search ( ...@@ -1298,25 +1232,21 @@ void IndexIVFPQR::search (
indexIVFPQ_stats.refine_cycles += TOC; indexIVFPQ_stats.refine_cycles += TOC;
} }
void IndexIVFPQR::reconstruct_n (idx_t i0, idx_t ni, float *recons) const void IndexIVFPQR::reconstruct_from_offset (long list_no, long offset,
float* recons) const
{ {
std::vector<float> r3 (d); IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons);
IndexIVFPQ::reconstruct_n (i0, ni, recons);
for (idx_t i = i0; i < i0 + ni; i++) {
float *r = recons + i * d;
refine_pq.decode (&refine_codes [i * refine_pq.code_size], r3.data());
for (int j = 0; j < d; j++) idx_t id = ids[list_no][offset];
r[j] += r3[j]; assert (0 <= id && id < ntotal);
std::vector<float> r3(d);
refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data());
for (int i = 0; i < d; ++i) {
recons[i] += r3[i];
} }
} }
void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id) void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id)
{ {
IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in); IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in);
...@@ -1335,6 +1265,206 @@ long IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) { ...@@ -1335,6 +1265,206 @@ long IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) {
return 0; return 0;
} }
/*************************************
* Index2Layer implementation
*************************************/
Index2Layer::Index2Layer (Index * quantizer, size_t nlist,
int M,
MetricType metric):
Index (quantizer->d, metric),
q1 (quantizer, nlist),
pq (quantizer->d, M, 8)
{
is_trained = false;
for (int nbyte = 0; nbyte < 7; nbyte++) {
if ((1L << (8 * nbyte)) >= nlist) {
code_size_1 = nbyte;
break;
}
}
code_size_2 = pq.code_size;
code_size = code_size_1 + code_size_2;
}
Index2Layer::Index2Layer ()
{
code_size = code_size_1 = code_size_2 = 0;
}
Index2Layer::~Index2Layer ()
{}
void Index2Layer::train(idx_t n, const float* x)
{
if (verbose) {
printf ("training level-1 quantizer %ld vectors in %dD\n",
n, d);
}
q1.train_q1 (n, x, verbose, metric_type);
if (verbose) {
printf("computing residuals\n");
}
const float * x_in = x;
x = fvecs_maybe_subsample (
d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
x, verbose, pq.cp.seed);
ScopeDeleter<float> del_x (x_in == x ? nullptr : x);
std::vector<idx_t> assign(n); // assignement to coarse centroids
q1.quantizer->assign (n, x, assign.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, assign[i]);
}
if (verbose)
printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n",
pq.M, pq.ksub, n, d);
pq.verbose = verbose;
pq.train (n, residuals.data());
is_trained = true;
}
void Index2Layer::add(idx_t n, const float* x)
{
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("Index2Layer::add: adding %ld:%ld / %ld\n",
i0, i1, n);
}
add (i1 - i0, x + i0 * d);
}
return;
}
std::vector<idx_t> codes1 (n);
q1.quantizer->assign (n, x, codes1.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, codes1[i]);
}
std::vector<uint8_t> codes2 (n * code_size_2);
pq.compute_codes (residuals.data(), codes2.data(), n);
codes.resize ((ntotal + n) * code_size);
uint8_t *wp = &codes[ntotal * code_size];
{
int i = 0x11223344;
const char *ip = (char*)&i;
FAISS_THROW_IF_NOT_MSG (ip[0] == 0x44,
"works only on a little-endian CPU");
}
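    // (the per-vector memcpy below keeps only the low code_size_1 bytes of
    // each 64-bit list id, which equals its value only on little-endian CPUs)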
// copy to output table
for (idx_t i = 0; i < n; i++) {
memcpy (wp, &codes1[i], code_size_1);
wp += code_size_1;
memcpy (wp, &codes2[i * code_size_2], code_size_2);
wp += code_size_2;
}
ntotal += n;
}
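// Worked layout example (illustrative): with nlist = 1024 and an 8-subvector,
// 8-bit PQ, the constructor picks code_size_1 = 2 (smallest nbyte with
// 256^nbyte >= nlist) and code_size_2 = 8, so each vector occupies 10 bytes
// in `codes`:
//
//   [ 2-byte little-endian list id | 8 PQ code bytes ]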
void Index2Layer::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const
{
FAISS_THROW_MSG ("not implemented");
}
void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const
{
float recons1[d];
FAISS_THROW_IF_NOT (i0 >= 0 && i0 + ni <= ntotal);
const uint8_t *rp = &codes[i0 * code_size];
for (idx_t i = 0; i < ni; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
q1.quantizer->reconstruct (key, recons1);
rp += code_size_1;
pq.decode (rp, recons);
for (idx_t j = 0; j < d; j++) {
recons[j] += recons1[j];
}
rp += code_size_2;
recons += d;
}
}
void Index2Layer::transfer_to_IVFPQ (IndexIVFPQ & other) const
{
FAISS_THROW_IF_NOT (other.nlist == q1.nlist);
FAISS_THROW_IF_NOT (other.code_size == code_size_2);
FAISS_THROW_IF_NOT (other.ntotal == 0);
const uint8_t *rp = codes.data();
for (idx_t i = 0; i < ntotal; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
other.ids[key].push_back (i);
rp += code_size_1;
std::vector<uint8_t> & list = other.codes[key];
size_t len = list.size();
list.resize(len + code_size_2);
memcpy (&list[len], rp, code_size_2);
rp += code_size_2;
}
other.ntotal = ntotal;
}
void Index2Layer::reconstruct(idx_t key, float* recons) const
{
reconstruct_n (key, 1, recons);
}
void Index2Layer::reset()
{
ntotal = 0;
codes.clear ();
}
/***************************************** /*****************************************
* IndexIVFPQCompact implementation * IndexIVFPQCompact implementation
******************************************/ ******************************************/
......
...@@ -36,7 +36,6 @@ struct IndexIVFPQ: IndexIVF { ...@@ -36,7 +36,6 @@ struct IndexIVFPQ: IndexIVF {
// search-time parameters // search-time parameters
size_t scan_table_threshold; ///< use table computation or on-the-fly? size_t scan_table_threshold; ///< use table computation or on-the-fly?
size_t max_codes; ///< max nb of codes to visit to do a query
int polysemous_ht; ///< Hamming thresh for polysemous filtering int polysemous_ht; ///< Hamming thresh for polysemous filtering
...@@ -64,16 +63,8 @@ struct IndexIVFPQ: IndexIVF { ...@@ -64,16 +63,8 @@ struct IndexIVFPQ: IndexIVF {
/// same as train_residual, also output 2nd level residuals /// same as train_residual, also output 2nd level residuals
void train_residual_o (idx_t n, const float *x, float *residuals_2); void train_residual_o (idx_t n, const float *x, float *residuals_2);
void reconstruct_from_offset (long list_no, long offset,
/** Reconstruct a subset of the indexed vectors float* recons) const override;
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d
*/
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
/** Find exact duplicates in the dataset. /** Find exact duplicates in the dataset.
* *
...@@ -114,16 +105,6 @@ struct IndexIVFPQ: IndexIVF { ...@@ -114,16 +105,6 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels, float *distances, idx_t *labels,
bool store_pairs) const override; bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table /// build precomputed table
void precompute_table (); void precompute_table ();
...@@ -190,17 +171,17 @@ struct IndexIVFPQR: IndexIVFPQ { ...@@ -190,17 +171,17 @@ struct IndexIVFPQR: IndexIVFPQ {
void add_core (idx_t n, const float *x, const long *xids, void add_core (idx_t n, const float *x, const long *xids,
const long *precomputed_idx = nullptr); const long *precomputed_idx = nullptr);
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override; void reconstruct_from_offset (long list_no, long offset,
float* recons) const override;
void merge_from (IndexIVF &other, idx_t add_id) override; void merge_from (IndexIVF &other, idx_t add_id) override;
void search( void search_preassigned (idx_t n, const float *x, idx_t k,
idx_t n, const idx_t *assign,
const float* x, const float *centroid_dis,
idx_t k, float *distances, idx_t *labels,
float* distances, bool store_pairs) const override;
idx_t* labels) const override;
IndexIVFPQR(); IndexIVFPQR();
}; };
...@@ -250,6 +231,60 @@ struct IndexIVFPQCompact: IndexIVFPQ { ...@@ -250,6 +231,60 @@ struct IndexIVFPQCompact: IndexIVFPQ {
}; };
/** Same as an IndexIVFPQ without the inverted lists: codes are stored sequentially
*
 * The class is mainly intended to store encoded vectors that can be
 * accessed randomly; the search function is not implemented.
*/
struct Index2Layer: Index {
/// first level quantizer
Level1Quantizer q1;
/// second level quantizer is always a PQ
ProductQuantizer pq;
/// Codes. Size ntotal * code_size.
std::vector<uint8_t> codes;
    /// size of the code for the first level (ceil(log256(q1.nlist)) bytes)
size_t code_size_1;
/// size of the code for the second level
size_t code_size_2;
/// code_size_1 + code_size_2
size_t code_size;
Index2Layer (Index * quantizer, size_t nlist,
int M, MetricType metric = METRIC_L2);
Index2Layer ();
~Index2Layer ();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
/// not implemented
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset() override;
/// transfer the flat codes to an IVFPQ index
void transfer_to_IVFPQ(IndexIVFPQ & other) const;
};
} // namespace faiss } // namespace faiss
......
...@@ -803,6 +803,7 @@ MultiIndexQuantizer::MultiIndexQuantizer (int d, ...@@ -803,6 +803,7 @@ MultiIndexQuantizer::MultiIndexQuantizer (int d,
void MultiIndexQuantizer::train(idx_t n, const float *x) void MultiIndexQuantizer::train(idx_t n, const float *x)
{ {
pq.verbose = verbose;
pq.train (n, x); pq.train (n, x);
is_trained = true; is_trained = true;
// count virtual elements in index // count virtual elements in index
......
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#include <cstdio> #include <cstdio>
#include <algorithm> #include <algorithm>
#include <malloc.h>
#include <omp.h> #include <omp.h>
#include <immintrin.h> #include <immintrin.h>
...@@ -46,6 +48,7 @@ namespace { ...@@ -46,6 +48,7 @@ namespace {
typedef Index::idx_t idx_t; typedef Index::idx_t idx_t;
typedef ScalarQuantizer::QuantizerType QuantizerType; typedef ScalarQuantizer::QuantizerType QuantizerType;
typedef ScalarQuantizer::RangeStat RangeStat; typedef ScalarQuantizer::RangeStat RangeStat;
using DistanceComputer = ScalarQuantizer::DistanceComputer;
/******************************************************************* /*******************************************************************
...@@ -117,145 +120,179 @@ struct Codec4bit { ...@@ -117,145 +120,179 @@ struct Codec4bit {
}; };
/******************************************************************* /*******************************************************************
* Similarity: gets vector components and computes a similarity wrt. a * Quantizer: normalizes scalar vector components, then passes them
* query vector stored in the object * through a codec
*/ */
struct SimilarityL2 {
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/ struct Quantizer {
virtual void encode_vector(const float *x, uint8_t *code) const = 0;
virtual void decode_vector(const uint8_t *code, float *x) const = 0;
float accu;
void begin () { virtual ~Quantizer() {}
accu = 0; };
yi = y;
template<class Codec>
struct QuantizerUniform: Quantizer {
const size_t d;
const float vmin, vdiff;
QuantizerUniform(size_t d, const std::vector<float> &trained):
d(d), vmin(trained[0]), vdiff(trained[1])
{
} }
void add_component (float x) { void encode_vector(const float* x, uint8_t* code) const override {
float tmp = *yi++ - x; for (size_t i = 0; i < d; i++) {
accu += tmp * tmp; float xi = (x[i] - vmin) / vdiff;
if (xi < 0)
xi = 0;
if (xi > 1.0)
xi = 1.0;
Codec::encode_component(xi, code, i);
}
} }
float result () { void decode_vector(const uint8_t* code, float* x) const override {
return accu; for (size_t i = 0; i < d; i++) {
float xi = Codec::decode_component(code, i);
x[i] = vmin + xi * vdiff;
}
} }
float reconstruct_component (const uint8_t * code, int i) const
{
float xi = Codec::decode_component (code, i);
return vmin + xi * vdiff;
}
};
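/* Worked example (illustrative): with trained values vmin = -1 and
 * vdiff = 2, a component x = 0.5 is normalized to
 * xi = (0.5 - (-1)) / 2 = 0.75, clipped to [0, 1] and handed to the codec;
 * decoding returns vmin + xi' * vdiff for the codec's dequantized xi', i.e.
 * a value close to 0.5 up to the codec's resolution (1/256 for Codec8bit,
 * 1/16 for Codec4bit).
 */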
#ifdef USE_AVX #ifdef USE_AVX
/******* AVX accumulator *******/
__m256 accu8; template<class Codec>
struct QuantizerUniform8: QuantizerUniform<Codec> {
void begin_8 () { QuantizerUniform8 (size_t d, const std::vector<float> &trained):
accu8 = _mm256_setzero_ps(); QuantizerUniform<Codec> (d, trained) {}
yi = y;
}
void add_8_components (__m256 x) { __m256 reconstruct_8_components (const uint8_t * code, int i) const
__m256 yiv = _mm256_loadu_ps (yi); {
yi += 8; __m256 xi = Codec::decode_8_components (code, i);
__m256 tmp = yiv - x; return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
accu8 += tmp * tmp;
} }
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
#endif
}; };
struct SimilarityIP { #endif
const float *y, *yi;
const float accu0;
/******* scalar accumulator *******/
float accu;
SimilarityIP (const float * y, float accu0): template<class Codec>
y (y), accu0 (accu0) {} struct QuantizerNonUniform: Quantizer {
const size_t d;
const float *vmin, *vdiff;
void begin () { QuantizerNonUniform(size_t d, const std::vector<float> &trained):
accu = accu0; d(d), vmin(trained.data()), vdiff(trained.data() + d) {}
yi = y;
void encode_vector(const float* x, uint8_t* code) const override {
for (size_t i = 0; i < d; i++) {
float xi = (x[i] - vmin[i]) / vdiff[i];
if (xi < 0)
xi = 0;
if (xi > 1.0)
xi = 1.0;
Codec::encode_component(xi, code, i);
}
} }
void add_component (float x) { void decode_vector(const uint8_t* code, float* x) const override {
accu += *yi++ * x; for (size_t i = 0; i < d; i++) {
float xi = Codec::decode_component(code, i);
x[i] = vmin[i] + xi * vdiff[i];
}
} }
float result () { float reconstruct_component (const uint8_t * code, int i) const
return accu; {
float xi = Codec::decode_component (code, i);
return vmin[i] + xi * vdiff[i];
} }
#ifdef USE_AVX };
/******* AVX accumulator *******/
__m256 accu8;
void begin_8 () { #ifdef USE_AVX
accu8 = _mm256_setzero_ps();
yi = y;
}
void add_8_components (__m256 x) { template<class Codec>
__m256 yiv = _mm256_loadu_ps (yi); struct QuantizerNonUniform8: QuantizerNonUniform<Codec> {
yi += 8;
accu8 += yiv * x;
}
float result_8 () { QuantizerNonUniform8 (size_t d, const std::vector<float> &trained):
__m256 sum = _mm256_hadd_ps(accu8, accu8); QuantizerNonUniform<Codec> (d, trained) {}
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component __m256 reconstruct_8_components (const uint8_t * code, int i) const
return {
accu0 + __m256 xi = Codec::decode_8_components (code, i);
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
} }
#endif
};
/******************************************************************* };
* templatized distance functions
*/
#endif
template<class Quantizer, class Similarity> Quantizer *select_quantizer (
float compute_distance(const Quantizer & quant, Similarity & sim, QuantizerType qtype,
const uint8_t *code) size_t d, const std::vector<float> & trained)
{ {
sim.begin(); #ifdef USE_AVX
for (size_t i = 0; i < quant.d; i++) { if (d % 8 == 0) {
float xi = quant.reconstruct_component (code, i); switch(qtype) {
sim.add_component (xi); case ScalarQuantizer::QT_8bit:
return new QuantizerNonUniform8<Codec8bit>(d, trained);
case ScalarQuantizer::QT_4bit:
return new QuantizerNonUniform8<Codec4bit>(d, trained);
case ScalarQuantizer::QT_8bit_uniform:
return new QuantizerUniform8<Codec8bit>(d, trained);
case ScalarQuantizer::QT_4bit_uniform:
return new QuantizerUniform8<Codec4bit>(d, trained);
}
} else
#endif
{
switch(qtype) {
case ScalarQuantizer::QT_8bit:
return new QuantizerNonUniform<Codec8bit>(d, trained);
case ScalarQuantizer::QT_4bit:
return new QuantizerNonUniform<Codec4bit>(d, trained);
case ScalarQuantizer::QT_8bit_uniform:
return new QuantizerUniform<Codec8bit>(d, trained);
case ScalarQuantizer::QT_4bit_uniform:
return new QuantizerUniform<Codec4bit>(d, trained);
}
} }
return sim.result(); FAISS_THROW_MSG ("unknown qtype");
return nullptr;
} }
#ifdef USE_AVX
template<class Quantizer, class Similarity> Quantizer *select_quantizer (const ScalarQuantizer &sq)
float compute_distance_8(const Quantizer & quant, Similarity & sim,
const uint8_t *code)
{ {
sim.begin_8(); return select_quantizer (sq.qtype, sq.d, sq.trained);
for (size_t i = 0; i < quant.d; i += 8) {
__m256 xi = quant.reconstruct_8_components (code, i);
sim.add_8_components (xi);
}
return sim.result_8();
} }
#endif
/******************************************************************* /*******************************************************************
...@@ -412,215 +449,261 @@ void train_NonUniform(RangeStat rs, float rs_arg, ...@@ -412,215 +449,261 @@ void train_NonUniform(RangeStat rs, float rs_arg,
} }
/******************************************************************* /*******************************************************************
* Quantizer: normalizes scalar vector components, then passes them * Similarity: gets vector components and computes a similarity wrt. a
* through a codec * query vector stored in the object. The data fields just encapsulate
* an accumulator.
*/ */
struct SimilarityL2 {
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
struct Quantizer { /******* scalar accumulator *******/
virtual void encode_vector(const float *x, uint8_t *code) const = 0;
virtual void decode_vector(const uint8_t *code, float *x) const = 0;
virtual float compute_distance_L2 (SimilarityL2 &sim, float accu;
const uint8_t * codes) const = 0;
virtual float compute_distance_IP (SimilarityIP &sim,
const uint8_t * codes) const = 0;
virtual ~Quantizer() {} void begin () {
}; accu = 0;
yi = y;
}
void add_component (float x) {
float tmp = *yi++ - x;
accu += tmp * tmp;
}
void add_component_2 (float x1, float x2) {
float tmp = x1 - x2;
accu += tmp * tmp;
}
float result () {
return accu;
}
template<class Codec> #ifdef USE_AVX
struct QuantizerUniform: Quantizer { __m256 accu8;
const size_t d;
const float vmin, vdiff;
QuantizerUniform(size_t d, const std::vector<float> &trained): void begin_8 () {
d(d), vmin(trained[0]), vdiff(trained[1]) { accu8 = _mm256_setzero_ps();
yi = y;
} }
void encode_vector(const float* x, uint8_t* code) const override { void add_8_components (__m256 x) {
for (size_t i = 0; i < d; i++) { __m256 yiv = _mm256_loadu_ps (yi);
float xi = (x[i] - vmin) / vdiff; yi += 8;
if (xi < 0) __m256 tmp = yiv - x;
xi = 0; accu8 += tmp * tmp;
if (xi > 1.0)
xi = 1.0;
Codec::encode_component(xi, code, i);
}
} }
void decode_vector(const uint8_t* code, float* x) const override { void add_8_components_2 (__m256 x, __m256 y) {
for (size_t i = 0; i < d; i++) { __m256 tmp = y - x;
float xi = Codec::decode_component(code, i); accu8 += tmp * tmp;
x[i] = vmin + xi * vdiff;
}
} }
float reconstruct_component (const uint8_t * code, int i) const float result_8 () {
{ __m256 sum = _mm256_hadd_ps(accu8, accu8);
float xi = Codec::decode_component (code, i); __m256 sum2 = _mm256_hadd_ps(sum, sum);
return vmin + xi * vdiff; // now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
} }
#endif
#ifdef USE_AVX };
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m256 xi = Codec::decode_8_components (code, i); struct SimilarityIP {
return _mm256_set1_ps(vmin) + xi * _mm256_set1_ps (vdiff); const float *y, *yi;
/******* scalar accumulator *******/
float accu;
explicit SimilarityIP (const float * y):
y (y) {}
void begin () {
accu = 0;
yi = y;
} }
#endif
float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes) void add_component (float x) {
const override { accu += *yi++ * x;
return compute_distance(*this, sim, codes);
} }
float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes) void add_component_2 (float x1, float x2) {
const override { accu += x1 * x2;
return compute_distance(*this, sim, codes); }
float result () {
return accu;
} }
};
#ifdef USE_AVX #ifdef USE_AVX
template<class Codec>
struct QuantizerUniform8: QuantizerUniform<Codec> {
QuantizerUniform8 (size_t d, const std::vector<float> &trained): __m256 accu8;
QuantizerUniform<Codec> (d, trained) {}
float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes) void begin_8 () {
const override { accu8 = _mm256_setzero_ps();
return compute_distance_8(*this, sim, codes); yi = y;
} }
float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes) void add_8_components (__m256 x) {
const override { __m256 yiv = _mm256_loadu_ps (yi);
return compute_distance_8(*this, sim, codes); yi += 8;
accu8 += yiv * x;
} }
};
#endif
void add_8_components_2 (__m256 x1, __m256 x2) {
accu8 += x1 * x2;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
#endif
};
/*******************************************************************
* DistanceComputer: combines a similarity and a quantizer to do
* code-to-vector or code-to-code comparisons
*/
template<class Codec>
struct QuantizerNonUniform: Quantizer {
const size_t d;
const float *vmin, *vdiff;
QuantizerNonUniform(size_t d, const std::vector<float> &trained): template<class Quantizer, class Similarity>
d(d), vmin(trained.data()), vdiff(trained.data() + d) {} struct DCTemplate : ScalarQuantizer::DistanceComputer {
void encode_vector(const float* x, uint8_t* code) const override { Quantizer quant;
for (size_t i = 0; i < d; i++) {
float xi = (x[i] - vmin[i]) / vdiff[i];
if (xi < 0)
xi = 0;
if (xi > 1.0)
xi = 1.0;
Codec::encode_component(xi, code, i);
}
}
    DCTemplate(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance (const float *x,
                            const uint8_t *code) override
    {
        Similarity sim(x);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float xi = quant.reconstruct_component (code, i);
            sim.add_component (xi);
        }
        return sim.result();
    }

    float compute_code_distance (const uint8_t *code1,
                                 const uint8_t *code2) override
    {
        Similarity sim(nullptr);
        sim.begin ();
        for (size_t i = 0; i < quant.d; i++) {
            float x1 = quant.reconstruct_component (code1, i);
            float x2 = quant.reconstruct_component (code2, i);
            sim.add_component_2 (x1, x2);
        }
        return sim.result ();
    }

};

#ifdef USE_AVX

template<class Quantizer, class Similarity>
struct DCTemplate_8 : ScalarQuantizer::DistanceComputer {

    Quantizer quant;

    DCTemplate_8(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance (const float *x,
                            const uint8_t *code) override
    {
        Similarity sim(x);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 xi = quant.reconstruct_8_components (code, i);
            sim.add_8_components (xi);
        }
        return sim.result_8();
    }

    float compute_code_distance (const uint8_t *code1,
                                 const uint8_t *code2) override
    {
        Similarity sim(nullptr);
        sim.begin_8 ();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 x1 = quant.reconstruct_8_components (code1, i);
            __m256 x2 = quant.reconstruct_8_components (code2, i);
            sim.add_8_components_2 (x1, x2);
        }
        return sim.result_8 ();
    }

};

#endif

template<class Sim>
DistanceComputer *select_distance_computer (
        QuantizerType qtype,
        size_t d, const std::vector<float> & trained)
{
#ifdef USE_AVX
    if (d % 8 == 0) {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new DCTemplate_8<QuantizerNonUniform8
                                    <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new DCTemplate_8<QuantizerNonUniform8
                                    <Codec4bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new DCTemplate_8<QuantizerUniform8
                                    <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new DCTemplate_8<QuantizerUniform8
                                    <Codec4bit>, Sim>(d, trained);
        }
    } else
#endif
    {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new DCTemplate<QuantizerNonUniform
                                  <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new DCTemplate<QuantizerNonUniform
                                  <Codec4bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new DCTemplate<QuantizerUniform
                                  <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new DCTemplate<QuantizerUniform
                                  <Codec4bit>, Sim>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}

Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
    return select_quantizer (sq.qtype, sq.d, sq.trained);
}

} // anonymous namespace
...@@ -691,6 +774,19 @@ void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
        squant->decode_vector (codes + i * code_size, x + i * d);
}
ScalarQuantizer::DistanceComputer *ScalarQuantizer::get_distance_computer (
MetricType metric)
const
{
if (metric == METRIC_L2) {
return select_distance_computer<SimilarityL2>(qtype, d, trained);
} else {
return select_distance_computer<SimilarityIP>(qtype, d, trained);
}
}
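A minimal usage sketch of the new API (not part of the commit; it assumes a trained ScalarQuantizer sq and n encoded vectors stored contiguously, sq.code_size bytes each):

#include "IndexScalarQuantizer.h"
#include <cmath>

float closest_code (const faiss::ScalarQuantizer & sq,
                    const float *query, const uint8_t *codes, size_t n)
{
    // the caller owns the returned object; it is not thread-safe, so
    // typical usage is one DistanceComputer per thread
    faiss::ScalarQuantizer::DistanceComputer *dc =
        sq.get_distance_computer (faiss::METRIC_L2);
    float best = HUGE_VALF;
    for (size_t i = 0; i < n; i++) {
        float dis = dc->compute_distance (query, codes + i * sq.code_size);
        if (dis < best) best = dis;   // track the smallest L2 distance
    }
    delete dc;
    return best;
}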
/*******************************************************************
 * IndexScalarQuantizer implementation
 ********************************************************************/
...@@ -724,61 +820,66 @@ void IndexScalarQuantizer::add(idx_t n, const float* x)
    ntotal += n;
}
namespace {

template<class C>
void search_flat_scalar_quantizer(
        const IndexScalarQuantizer & index,
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        idx_t* labels)
{
    size_t code_size = index.code_size;
    size_t d = index.d;

#pragma omp parallel
    {
        DistanceComputer *dc =
            index.sq.get_distance_computer(index.metric_type);
        ScopeDeleter1<DistanceComputer> del(dc);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            idx_t *idxi = labels + i * k;
            float *simi = distances + i * k;
            heap_heapify<C> (k, simi, idxi);

            const float *xi = x + i * d;
            const uint8_t *ci = index.codes.data ();

            for (size_t j = 0; j < index.ntotal; j++) {
                float accu = dc->compute_distance(xi, ci);
                if (C::cmp (simi [0], accu)) {
                    heap_pop<C> (k, simi, idxi);
                    heap_push<C> (k, simi, idxi, accu, j);
                }
                ci += code_size;
            }
            heap_reorder<C> (k, simi, idxi);
        }
    }
}

} // anonymous namespace

void IndexScalarQuantizer::search(
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        idx_t* labels) const
{
    FAISS_THROW_IF_NOT (is_trained);
    if (metric_type == METRIC_L2) {
        search_flat_scalar_quantizer<CMax<float, idx_t> > (
            *this, n, x, k, distances, labels);
    } else {
        search_flat_scalar_quantizer<CMin<float, idx_t> > (
            *this, n, x, k, distances, labels);
    }
}
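The C template parameter selects the heap ordering per metric. A sketch of the comparator convention assumed above (CMaxSketch is an illustrative stand-in, not the faiss definition):

// For METRIC_L2, C = CMax keeps the k smallest distances: the heap
// root simi[0] is the worst (largest) of the current top-k, and
// C::cmp(simi[0], accu) accepts a candidate that improves on it.
template <typename T>
struct CMaxSketch {
    static bool cmp (T heap_root, T candidate) {
        return heap_root > candidate;   // smaller distance wins
    }
};
// CMin is the mirror image used for inner product, where larger
// scores are better.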
void IndexScalarQuantizer::reset()
...@@ -883,18 +984,20 @@ void IndexIVFScalarQuantizer::add_with_ids
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
                            const idx_t *cent_ids, const float *cent_dis,
                            DistanceComputer & dc,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> decoded(d);
    minheap_heapify (k, simi, idxi);
    size_t nscan = 0;
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;
...@@ -903,11 +1006,11 @@ void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
        const std::vector<idx_t> & ids = index.ids[list_no];
        const uint8_t* codes = index.codes[list_no].data();

        for (size_t j = 0; j < ids.size(); j++) {
            float accu = accu0 + dc.compute_distance(x, codes);
            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
...@@ -916,7 +1019,9 @@ void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
            }
            codes += code_size;
        }
        nscan += ids.size();
        if (index.max_codes && nscan > index.max_codes)
            break;
    }
    minheap_reorder (k, simi, idxi);
}
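Both probe loops now honour IndexIVF::max_codes: inverted lists are visited in the order returned by the coarse quantizer and scanning stops once nscan exceeds the budget; max_codes == 0 keeps the previous unbounded behaviour.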
...@@ -925,15 +1030,16 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
                            const float *x_in,
                            const idx_t *cent_ids,
                            const Index *quantizer,
                            DistanceComputer & dc,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> x(d);
    maxheap_heapify (k, simi, idxi);
    size_t nscan = 0;
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;
...@@ -944,11 +1050,9 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
        // shift of x_in wrt centroid
        quantizer->compute_residual (x_in, x.data(), list_no);

        for (size_t j = 0; j < ids.size(); j++) {
            float dis = dc.compute_distance (x.data(), codes);
            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
...@@ -957,6 +1061,9 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
            }
            codes += code_size;
        }
        nscan += ids.size();
        if (index.max_codes && nscan > index.max_codes)
            break;
    }
    maxheap_reorder (k, simi, idxi);
}
...@@ -972,28 +1079,49 @@ void IndexIVFScalarQuantizer::search_preassigned (
{
    FAISS_THROW_IF_NOT (is_trained);

    if (metric_type == METRIC_INNER_PRODUCT) {
#pragma omp parallel
        {
            DistanceComputer *dc = sq.get_distance_computer (metric_type);
            ScopeDeleter1<DistanceComputer> del(dc);
#pragma omp for
            for (size_t i = 0; i < n; i++) {
                search_with_probes_ip (*this, x + i * d,
                                       idx + i * nprobe, dis + i * nprobe, *dc,
                                       k, distances + i * k, labels + i * k,
                                       store_pairs);
            }
        }
    } else {
#pragma omp parallel
        {
            DistanceComputer *dc = sq.get_distance_computer (metric_type);
            ScopeDeleter1<DistanceComputer> del(dc);
#pragma omp for
            for (size_t i = 0; i < n; i++) {
                search_with_probes_L2 (*this, x + i * d,
                                       idx + i * nprobe, quantizer, *dc,
                                       k, distances + i * k, labels + i * k,
                                       store_pairs);
            }
        }
    }
}
void IndexIVFScalarQuantizer::reconstruct_from_offset (long list_no,
long offset,
float* recons) const
{
std::vector<float> centroid(d);
quantizer->reconstruct (list_no, centroid.data());
const uint8_t* code = &(codes[list_no][offset * code_size]);
sq.decode (code, recons, 1);
for (int i = 0; i < d; ++i) {
recons[i] += centroid[i];
}
}
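In effect the reconstruction is recons = centroid(list_no) + decode(code): the coarse quantizer supplies the centroid of the list the vector was assigned to, and the scalar quantizer decodes the residual stored in that list.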
}
...@@ -74,7 +74,24 @@ struct ScalarQuantizer {
                size_t n) const;

    /// decode a vector from a given code (or n vectors if third argument)
    void decode (const uint8_t *code, float *x, size_t n) const;
// fast, non thread-safe way of computing vector-to-code and
// code-to-code distances.
struct DistanceComputer {
/// vector-to-code distance computation
virtual float compute_distance (const float *x,
const uint8_t *code) = 0;
/// code-to-code distance computation
virtual float compute_code_distance (const uint8_t *code1,
const uint8_t *code2) = 0;
virtual ~DistanceComputer () {}
};
DistanceComputer *get_distance_computer (MetricType metric = METRIC_L2)
const;
};
...@@ -126,7 +143,7 @@ struct IndexScalarQuantizer: Index {
 * distances are computed.
 */
struct IndexIVFScalarQuantizer: IndexIVF {
    ScalarQuantizer sq;

    IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
...@@ -145,6 +162,9 @@ struct IndexIVFScalarQuantizer: IndexIVF {
                    float *distances, idx_t *labels,
                    bool store_pairs) const override;
void reconstruct_from_offset (long list_no, long offset,
float* recons) const override;
};
...
...@@ -29,7 +29,7 @@ LIBOBJ=hamming.o utils.o \
        Clustering.o Heap.o VectorTransform.o index_io.o \
        PolysemousTraining.o MetaIndexes.o Index.o \
        ProductQuantizer.o AutoTune.o AuxIndexStructures.o \
        IndexScalarQuantizer.o FaissException.o IndexHNSW.o

$(LIBNAME).a: $(LIBOBJ)
...@@ -44,6 +44,7 @@ $(LIBNAME).$(SHAREDEXT): $(LIBOBJ)
utils.o: EXTRAFLAGS=$(BLASCFLAGS)
VectorTransform.o: EXTRAFLAGS=$(BLASCFLAGS)
ProductQuantizer.o: EXTRAFLAGS=$(BLASCFLAGS)
IndexHNSW.o: EXTRAFLAGS=$(BLASCFLAGS)

# for MKL, the flags when generating a dynamic lib are different from
# the ones when making an executable, but by default they are the same
...@@ -121,7 +122,7 @@ VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
        IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
        ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
        IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h IndexHNSW.h utils.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
        ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
        FaissAssert.h FaissException.h
...@@ -134,12 +135,16 @@ ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
        FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
        IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
        IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h IndexHNSW.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
        IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
        FaissException.h
FaissException.o: FaissException.cpp FaissException.h
IndexHNSW.o: IndexHNSW.cpp IndexHNSW.h IndexFlat.h Index.h IndexPQ.h \
        ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h \
        IndexScalarQuantizer.h IndexIVF.h utils.h FaissAssert.h FaissException.h \
        IndexIVFPQ.h

clean:
...
...@@ -25,7 +25,7 @@ namespace faiss {
struct SimulatedAnnealingParameters {

    // optimization parameters
    double init_temperature;  // init probability of accepting a bad swap
    double temperature_decay; // at each iteration the temp is multiplied by this
    int n_iter;               // nb of iterations
    int n_redo;               // nb of runs of the simulation
...
...@@ -355,7 +355,7 @@ void ProductQuantizer::decode (const uint8_t *code, float *x) const
void ProductQuantizer::decode (const uint8_t *code, float *x, size_t n) const
{
    for (size_t i = 0; i < n; i++) {
        this->decode (code + code_size * i, x + d * i);
    }
}
...
...@@ -95,11 +95,12 @@ void VectorTransform::reverse_transform (
/*********************************************
 * LinearTransform
 *********************************************/

/// both d_in > d_out and d_out > d_in are supported
LinearTransform::LinearTransform (int d_in, int d_out,
                                  bool have_bias):
    VectorTransform (d_in, d_out), have_bias (have_bias),
    is_orthonormal (false), verbose (false)
{}

void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
...@@ -156,6 +157,56 @@ void LinearTransform::transform_transpose (idx_t n, const float * y,
    if (have_bias) delete [] y;
}
void LinearTransform::set_is_orthonormal ()
{
if (d_out > d_in) {
// not clear what we should do in this case
is_orthonormal = false;
return;
}
if (d_out == 0) { // borderline case, unnormalized matrix
is_orthonormal = true;
return;
}
double eps = 4e-5;
FAISS_ASSERT(A.size() >= d_out * d_in);
{
std::vector<float> ATA(d_out * d_out);
FINTEGER dii = d_in, doi = d_out;
float one = 1.0, zero = 0.0;
sgemm_ ("Transposed", "Not", &doi, &doi, &dii,
&one, A.data (), &dii,
A.data(), &dii,
&zero, ATA.data(), &doi);
is_orthonormal = true;
for (long i = 0; i < d_out; i++) {
for (long j = 0; j < d_out; j++) {
float v = ATA[i + j * d_out];
if (i == j) v-= 1;
if (fabs(v) > eps) {
is_orthonormal = false;
}
}
}
}
}
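The sgemm_ call forms the d_out x d_out Gram matrix of the rows of A, so the flag is set exactly when A A^T ~= I up to eps = 4e-5 (orthonormal rows). For such a matrix the pseudo-inverse is simply A^T, which is why reverse_transform below can delegate to transform_transpose, i.e. x ~= A^T (xt - b).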
void LinearTransform::reverse_transform (idx_t n, const float * xt,
float *x) const
{
if (is_orthonormal) {
transform_transpose (n, xt, x);
} else {
FAISS_THROW_MSG ("reverse transform not implemented for non-orthonormal matrices");
}
}
/*********************************************
 * RandomRotationMatrix
...@@ -183,13 +234,7 @@ void RandomRotationMatrix::init (int seed)
        }
        A.resize(d_in * d_out);
    }
    is_orthonormal = true;
}
/*********************************************
...@@ -422,12 +467,12 @@ void PCAMatrix::copy_from (const PCAMatrix & other)
void PCAMatrix::prepare_Ab ()
{

    if (!random_rotation) {
        FAISS_THROW_IF_NOT_MSG (
            d_out * d_in <= PCAMat.size(),
            "PCA matrix was trained on too few examples "
            "to output this number of dimensions");
        A = PCAMat;
        A.resize(d_out * d_in); // strip off useless dimensions
...@@ -480,8 +525,8 @@ void PCAMatrix::prepare_Ab ()
    } else {
        FAISS_THROW_IF_NOT_MSG (balanced_bins == 0,
                                "both balancing bins and applying a random rotation "
                                "does not make sense");

        RandomRotationMatrix rr(d_out, d_out);
        rr.init(5);
...@@ -517,14 +562,8 @@ void PCAMatrix::prepare_Ab ()
            b[i] = accu;
        }
    }
    is_orthonormal = eigen_power == 0;
}
/*********************************************
...@@ -701,15 +740,7 @@ void OPQMatrix::train (Index::idx_t n, const float *x)
    }

    is_trained = true;
    is_orthonormal = true;
}
...@@ -738,6 +769,12 @@ void NormalizationTransform::apply_noalloc
    }
}
void NormalizationTransform::reverse_transform (idx_t n, const float* xt,
float* x) const
{
memcpy (x, xt, sizeof (xt[0]) * n * d_in);
}
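This is a best-effort inverse: xt = x / ||x|| discards the norm of x, so only the direction is recoverable and the transform is reverted by copying xt unchanged.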
/*********************************************
 * IndexPreTransform
 *********************************************/
...@@ -810,6 +847,7 @@ void IndexPreTransform::train (idx_t n, const float *x)
    }
    for (int i = 0; i <= last_untrained; i++) {
        if (i < chain.size()) {
            VectorTransform *ltrans = chain [i];
            if (!ltrans->is_trained) {
...@@ -835,6 +873,7 @@ void IndexPreTransform::train (idx_t n, const float *x)
        }
        float * xt = chain[i]->apply (n, prev_x);
        if (prev_x != x) delete [] prev_x;
        prev_x = xt;
        del.set(xt);
...@@ -859,6 +898,20 @@ const float *IndexPreTransform::apply_chain (idx_t n, const float *x) const
    return prev_x;
}
void IndexPreTransform::reverse_chain (idx_t n, const float* xt, float* x) const
{
const float* next_x = xt;
ScopeDeleter<float> del;
for (int i = chain.size() - 1; i >= 0; i--) {
float* prev_x = (i == 0) ? x : new float [n * chain[i]->d_in];
ScopeDeleter<float> del2 ((prev_x == x) ? nullptr : prev_x);
chain [i]->reverse_transform (n, next_x, prev_x);
del2.swap (del);
next_x = prev_x;
}
}
void IndexPreTransform::add (idx_t n, const float *x)
{
    FAISS_THROW_IF_NOT (is_trained);
...@@ -903,24 +956,47 @@ long IndexPreTransform::remove_ids (const IDSelector & sel) {
}
void IndexPreTransform::reconstruct (idx_t key, float * recons) const
{
float *x = chain.empty() ? recons : new float [index->d];
ScopeDeleter<float> del (recons == x ? nullptr : x);
// Initial reconstruction
index->reconstruct (key, x);
// Revert transformations from last to first
reverse_chain (1, x, recons);
}
void IndexPreTransform::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
{
    float *x = chain.empty() ? recons : new float [ni * index->d];
    ScopeDeleter<float> del (recons == x ? nullptr : x);
    // Initial reconstruction
    index->reconstruct_n (i0, ni, x);

    // Revert transformations from last to first
    reverse_chain (ni, x, recons);
}
void IndexPreTransform::search_and_reconstruct (
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels, float* recons) const
{
FAISS_THROW_IF_NOT (is_trained);
const float* xt = apply_chain (n, x);
ScopeDeleter<float> del ((xt == x) ? nullptr : xt);
float* recons_temp = chain.empty() ? recons : new float [n * k * index->d];
ScopeDeleter<float> del2 ((recons_temp == recons) ? nullptr : recons_temp);
index->search_and_reconstruct (n, xt, k, distances, labels, recons_temp);
// Revert transformations from last to first
reverse_chain (n * k, recons_temp, recons);
}
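An illustrative call pattern (a sketch, not from the commit; it assumes a trained IndexPreTransform *index and nq queries in xq):

#include "VectorTransform.h"
#include <vector>

void demo_search_and_reconstruct (faiss::IndexPreTransform *index,
                                  const float *xq, long nq)
{
    long k = 10;
    std::vector<float> D (nq * k);
    std::vector<faiss::Index::idx_t> I (nq * k);
    std::vector<float> R (nq * k * index->d);
    index->search_and_reconstruct (nq, xq, k, D.data(), I.data(), R.data());
    // R holds one reconstructed vector of dimension index->d per
    // (query, rank) pair, already mapped back through reverse_chain
    // to the original input space.
}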
/*********************************************
 * RemapDimensionsTransform
...
...@@ -37,7 +37,7 @@ struct VectorTransform {
    {}

    /// set if the VectorTransform does not require training, or if
    /// training is done already
    bool is_trained;
...@@ -78,6 +78,9 @@ struct LinearTransform: VectorTransform {
    bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
    /// Transformation matrix, size d_out * d_in
    std::vector<float> A;
...@@ -96,6 +99,13 @@ struct LinearTransform: VectorTransform {
    void transform_transpose (idx_t n, const float * y,
                              float *x) const;
/// works only if is_orthonormal
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal ();
    bool verbose;

    ~LinearTransform() override {}
...@@ -113,8 +123,6 @@ struct RandomRotationMatrix: LinearTransform {
    /// must be called before the transform is used
    void init(int seed);

    RandomRotationMatrix () {}
};
...@@ -157,8 +165,6 @@ struct PCAMatrix: LinearTransform {
    /// will be completed with 0s
    void train(Index::idx_t n, const float* x) override;

    /// copy pre-trained PCA matrix
    void copy_from (const PCAMatrix & other);
...@@ -192,8 +198,6 @@ struct OPQMatrix: LinearTransform {
    explicit OPQMatrix (int d = 0, int M = 1, int d2 = -1);

    void train(Index::idx_t n, const float* x) override;
};
...@@ -230,6 +234,9 @@ struct NormalizationTransform: VectorTransform {
    NormalizationTransform ();

    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// Identity transform since norm is not revertible
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
...@@ -271,13 +278,23 @@ struct IndexPreTransform: Index {
                float* distances,
                idx_t* labels) const override;
void reconstruct (idx_t key, float * recons) const override;
    void reconstruct_n (idx_t i0, idx_t ni, float *recons)
        const override;
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const override;
    /// apply the transforms in the chain. The returned float * may be
    /// equal to x, otherwise it should be deallocated.
    const float * apply_chain (idx_t n, const float *x) const;
/// Reverse the transforms in the chain. May not be implemented for
/// all transforms in the chain or may return approximate results.
void reverse_chain (idx_t n, const float* xt, float* x) const;
    ~IndexPreTransform() override;
};
...
#!/usr/bin/env python2

# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.
import time
import sys
import numpy as np
import faiss
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()

def fvecs_read(fname):
    return ivecs_read(fname).view('float32')
#################################################################
# Main program
#################################################################
print "load data"
xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")
nq, d = xq.shape
print "load GT"
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
todo = sys.argv[1:]
if todo == []:
    todo = 'hnsw hnsw_sq ivf ivf_hnsw_quantizer kmeans kmeans_hnsw'.split()
def evaluate(index):
    # for timing with a single core
    # faiss.omp_set_num_threads(1)
    t0 = time.time()
    D, I = index.search(xq, 1)
    t1 = time.time()

    recall_at_1 = (I == gt[:, :1]).sum() / float(nq)
    print "\t %7.3f ms per query, R@1 %.4f" % (
        (t1 - t0) * 1000.0 / nq, recall_at_1)
if 'hnsw' in todo:

    print "Testing HNSW Flat"

    index = faiss.IndexHNSWFlat(d, 32)

    # training is not needed

    # this is the default, higher is more accurate and slower to
    # construct
    index.hnsw.efConstruction = 40

    print "add"
    # to see progress
    index.verbose = True
    index.add(xb)

    print "search"
    for efSearch in 16, 32, 64, 128, 256:
        print "efSearch", efSearch,
        index.hnsw.efSearch = efSearch
        evaluate(index)
if 'hnsw_sq' in todo:

    print "Testing HNSW with a scalar quantizer"
    # also set M so that the vectors and links both use 128 bytes per
    # entry (total 256 bytes)
    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 16)

    print "training"
    # training for the scalar quantizer
    index.train(xt)

    # this is the default, higher is more accurate and slower to
    # construct
    index.hnsw.efConstruction = 40

    print "add"
    # to see progress
    index.verbose = True
    index.add(xb)

    print "search"
    for efSearch in 16, 32, 64, 128, 256:
        print "efSearch", efSearch,
        index.hnsw.efSearch = efSearch
        evaluate(index)
if 'ivf' in todo:

    print "Testing IVF Flat (baseline)"
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, 16384)
    index.cp.min_points_per_centroid = 5   # quiet warning

    # to see progress
    index.verbose = True

    print "training"
    index.train(xt)

    print "add"
    index.add(xb)

    print "search"
    for nprobe in 1, 4, 16, 64, 256:
        print "nprobe", nprobe,
        index.nprobe = nprobe
        evaluate(index)
if 'ivf_hnsw_quantizer' in todo:

    print "Testing IVF Flat with HNSW quantizer"
    quantizer = faiss.IndexHNSWFlat(d, 32)
    index = faiss.IndexIVFFlat(quantizer, d, 16384)
    index.cp.min_points_per_centroid = 5   # quiet warning
    index.quantizer_trains_alone = 2

    # to see progress
    index.verbose = True

    print "training"
    index.train(xt)

    print "add"
    index.add(xb)

    print "search"
    quantizer.hnsw.efSearch = 64
    for nprobe in 1, 4, 16, 64, 256:
        print "nprobe", nprobe,
        index.nprobe = nprobe
        evaluate(index)
# Bonus: 2 kmeans tests

if 'kmeans' in todo:
    print "Performing kmeans on sift1M database vectors (baseline)"
    clus = faiss.Clustering(d, 16384)
    clus.verbose = True
    clus.niter = 10
    index = faiss.IndexFlatL2(d)
    clus.train(xb, index)

if 'kmeans_hnsw' in todo:
    print "Performing kmeans on sift1M using HNSW assignment"
    clus = faiss.Clustering(d, 16384)
    clus.verbose = True
    clus.niter = 10
    index = faiss.IndexHNSWFlat(d, 32)
    # increase the default efSearch, otherwise the number of empty
    # clusters is too high.
    index.hnsw.efSearch = 128
    clus.train(xb, index)
...@@ -427,7 +427,7 @@ def replacement_map_add(self, keys, vals):
def replacement_map_search_multiple(self, keys):
    n, = keys.shape
    vals = np.empty(n, dtype='int64')
    self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
    return vals
...
...@@ -150,6 +150,8 @@ dep:
        ../VectorTransform.h ../MetaIndexes.h GpuIndexFlat.h GpuIndexIVFFlat.h \
        GpuIndexIVF.h ../Clustering.h GpuIndexIVFPQ.h IndexProxy.h \
        utils/WorkerThread.h
./GpuClonerOptions.o: GpuClonerOptions.cpp GpuClonerOptions.h \
        GpuIndicesOptions.h
impl/RemapIndices.o: impl/RemapIndices.cpp impl/RemapIndices.h \
        impl/../../FaissAssert.h impl/../../FaissException.h
utils/DeviceMemory.o: utils/DeviceMemory.cpp utils/DeviceMemory.h \
...
...@@ -111,4 +111,4 @@ class EvalIVFPQAccuracy(unittest.TestCase):
        index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3)
...@@ -25,6 +25,7 @@
#include "IndexIVFPQ.h"
#include "MetaIndexes.h"
#include "IndexScalarQuantizer.h"
#include "IndexHNSW.h"

/*************************************************************
 * The I/O format is the content of the class. For objects that are
...@@ -153,8 +154,6 @@ static void write_index_header (const Index *idx, FILE *f) {
    WRITE1 (idx->metric_type);
}

void write_VectorTransform (const VectorTransform *vt, FILE *f) {
    if (const LinearTransform * lt =
           dynamic_cast < const LinearTransform *> (vt)) {
...@@ -221,6 +220,21 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
    write_ProductQuantizer (pq, f);
}
static void write_HNSW (const HNSW *hnsw, FILE *f) {
WRITEVECTOR (hnsw->assign_probas);
WRITEVECTOR (hnsw->cum_nneighbor_per_level);
WRITEVECTOR (hnsw->levels);
WRITEVECTOR (hnsw->offsets);
WRITEVECTOR (hnsw->neighbors);
WRITE1 (hnsw->entry_point);
WRITE1 (hnsw->max_level);
WRITE1 (hnsw->efConstruction);
WRITE1 (hnsw->efSearch);
WRITE1 (hnsw->upper_beam);
}
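read_HNSW further down mirrors this field order exactly; the format is order-sensitive, so the two functions have to be kept in sync whenever fields are added to HNSW.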
static void write_ivf_header (const IndexIVF * ivf, FILE *f,
                              bool include_ids = true) {
...@@ -265,6 +279,19 @@ void write_index (const Index *idx, FILE *f) {
        WRITE1 (idxp->search_type);
        WRITE1 (idxp->encode_signs);
        WRITE1 (idxp->polysemous_ht);
} else if(const Index2Layer * idxp =
dynamic_cast<const Index2Layer *> (idx)) {
uint32_t h = fourcc ("Ix2L");
WRITE1 (h);
write_index_header (idx, f);
write_index (idxp->q1.quantizer, f);
WRITE1 (idxp->q1.nlist);
WRITE1 (idxp->q1.quantizer_trains_alone);
write_ProductQuantizer (&idxp->pq, f);
WRITE1 (idxp->code_size_1);
WRITE1 (idxp->code_size_2);
WRITE1 (idxp->code_size);
WRITEVECTOR (idxp->codes);
    } else if(const IndexScalarQuantizer * idxs =
              dynamic_cast<const IndexScalarQuantizer *> (idx)) {
        uint32_t h = fourcc ("IxSQ");
...@@ -348,6 +375,19 @@ void write_index (const Index *idx, FILE *f) {
        write_index_header (idxmap, f);
        write_index (idxmap->index, f);
        WRITEVECTOR (idxmap->id_map);
} else if(const IndexHNSW * idxhnsw =
dynamic_cast<const IndexHNSW *> (idx)) {
uint32_t h =
dynamic_cast<const IndexHNSWFlat*>(idx) ? fourcc("IHNf") :
dynamic_cast<const IndexHNSWPQ*>(idx) ? fourcc("IHNp") :
dynamic_cast<const IndexHNSWSQ*>(idx) ? fourcc("IHNs") :
dynamic_cast<const IndexHNSW2Level*>(idx) ? fourcc("IHN2") :
0;
FAISS_THROW_IF_NOT (h != 0);
WRITE1 (h);
write_index_header (idxhnsw, f);
write_HNSW (&idxhnsw->hnsw, f);
write_index (idxhnsw->storage, f);
    } else {
        FAISS_THROW_MSG ("don't know how to serialize this type of index");
    }
...@@ -409,6 +449,9 @@ VectorTransform* read_VectorTransform (FILE *f) {
        READ1 (lt->have_bias);
        READVECTOR (lt->A);
        READVECTOR (lt->b);
        FAISS_THROW_IF_NOT (lt->A.size() >= lt->d_in * lt->d_out);
        FAISS_THROW_IF_NOT (!lt->have_bias || lt->b.size() >= lt->d_out);
        lt->set_is_orthonormal();
        vt = lt;
    } else if (h == fourcc ("RmDT")) {
        RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
...@@ -444,6 +487,19 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
    READVECTOR (ivsc->trained);
}
static void read_HNSW (HNSW *hnsw, FILE *f) {
READVECTOR (hnsw->assign_probas);
READVECTOR (hnsw->cum_nneighbor_per_level);
READVECTOR (hnsw->levels);
READVECTOR (hnsw->offsets);
READVECTOR (hnsw->neighbors);
READ1 (hnsw->entry_point);
READ1 (hnsw->max_level);
READ1 (hnsw->efConstruction);
READ1 (hnsw->efSearch);
READ1 (hnsw->upper_beam);
}
ProductQuantizer * read_ProductQuantizer (const char*fname) {
    FILE *f = fopen (fname, "r");
...@@ -675,6 +731,33 @@ Index *read_index (FILE * f, bool try_mmap) {
            static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
        }
        idx = idxmap;
} else if (h == fourcc ("Ix2L")) {
Index2Layer * idxp = new Index2Layer ();
read_index_header (idxp, f);
idxp->q1.quantizer = read_index (f);
READ1 (idxp->q1.nlist);
READ1 (idxp->q1.quantizer_trains_alone);
read_ProductQuantizer (&idxp->pq, f);
READ1 (idxp->code_size_1);
READ1 (idxp->code_size_2);
READ1 (idxp->code_size);
READVECTOR (idxp->codes);
idx = idxp;
} else if(h == fourcc("IHNf") || h == fourcc("IHNp") ||
h == fourcc("IHNs") || h == fourcc("IHN2")) {
IndexHNSW *idxhnsw = nullptr;
if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat ();
if (h == fourcc("IHNp")) idxhnsw = new IndexHNSWPQ ();
if (h == fourcc("IHNs")) idxhnsw = new IndexHNSWSQ ();
if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level ();
read_index_header (idxhnsw, f);
read_HNSW (&idxhnsw->hnsw, f);
idxhnsw->storage = read_index (f);
idxhnsw->own_fields = true;
if (h == fourcc("IHNp")) {
dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table ();
}
idx = idxhnsw;
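The SDC table is recomputed on load for the PQ variant, presumably because IndexHNSWPQ compares stored codes against each other (symmetric distance) during graph traversal and the table, being derived data, is not serialized.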
    } else {
        FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
        idx = nullptr;
...@@ -771,6 +854,12 @@ Index *Cloner::clone_Index (const Index *index)
        res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
        res->own_fields = true;
        return res;
} else if (const IndexIDMap *idmap =
dynamic_cast<const IndexIDMap*> (index)) {
IndexIDMap *res = new IndexIDMap (*idmap);
res->own_fields = true;
res->index = clone_Index (idmap->index);
return res;
    } else {
        FAISS_THROW_MSG( "clone not supported for this type of Index");
    }
...
...@@ -27,7 +27,8 @@
typedef unsigned long uint64_t;
typedef uint64_t size_t;
typedef int int32_t;
typedef unsigned char uint8_t;

#define __restrict
...@@ -77,6 +78,7 @@ extern "C" {
#include "IndexIVF.h" #include "IndexIVF.h"
#include "IndexIVFPQ.h" #include "IndexIVFPQ.h"
#include "IndexScalarQuantizer.h" #include "IndexScalarQuantizer.h"
#include "IndexHNSW.h"
#include "MetaIndexes.h" #include "MetaIndexes.h"
#include "FaissAssert.h" #include "FaissAssert.h"
...@@ -247,6 +249,7 @@ int get_num_gpus() ...@@ -247,6 +249,7 @@ int get_num_gpus()
%include "IndexPQ.h" %include "IndexPQ.h"
%include "IndexIVF.h" %include "IndexIVF.h"
%include "IndexScalarQuantizer.h" %include "IndexScalarQuantizer.h"
%include "IndexHNSW.h"
%ignore faiss::IndexIVFPQ::alloc_type;
%include "IndexIVFPQ.h"
...@@ -431,6 +434,11 @@ struct AsyncIndexSearchC {
DOWNCAST ( IndexLSH )
DOWNCAST ( IndexPreTransform )
DOWNCAST ( MultiIndexQuantizer )
DOWNCAST ( IndexHNSWFlat )
DOWNCAST ( IndexHNSWPQ )
DOWNCAST ( IndexHNSWSQ )
DOWNCAST ( IndexHNSW2Level )
DOWNCAST ( Index2Layer )
#ifdef GPU_WRAPPER
DOWNCAST_GPU ( IndexProxy )
DOWNCAST_GPU ( GpuIndexIVFPQ )
...@@ -537,11 +545,14 @@ PyObject *swig_ptr (PyObject *a)
    if(PyArray_TYPE(ao) == NPY_FLOAT32) {
        return SWIG_NewPointerObj(data, SWIGTYPE_p_float, 0);
    }
if(PyArray_TYPE(ao) == NPY_FLOAT64) {
return SWIG_NewPointerObj(data, SWIGTYPE_p_double, 0);
}
    if(PyArray_TYPE(ao) == NPY_INT32) {
        return SWIG_NewPointerObj(data, SWIGTYPE_p_int, 0);
    }
    if(PyArray_TYPE(ao) == NPY_UINT8) {
        return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_char, 0);
    }
    if(PyArray_TYPE(ao) == NPY_UINT64) {
        return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0);
...
...@@ -9,7 +9,6 @@
import os
import time
import numpy as np

try:
    import matplotlib
...
...@@ -92,7 +92,7 @@ class TestProductQuantizer(unittest.TestCase):
    def test_pq(self):
        d = 64
        n = 2000
        cs = 4
        np.random.seed(123)
        x = np.random.random(size=(n, d)).astype('float32')
...@@ -103,8 +103,20 @@ class TestProductQuantizer(unittest.TestCase):
        diff = ((x - x2)**2).sum()

        # print "diff=", diff
        # diff= 4418.0562
        self.assertGreater(5000, diff)
        pq10 = faiss.ProductQuantizer(d, cs, 10)
        assert pq10.code_size == cs * 2
        pq10.verbose = True
        pq10.cp.verbose = True
        pq10.train(x)
        codes = pq10.compute_codes(x)

        x10 = pq10.decode(codes)
        diff10 = ((x - x10)**2).sum()
        self.assertGreater(diff, diff10)
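With 10 bits per sub-quantizer a code no longer fits in one byte, so each of the cs sub-codes is stored on 2 bytes, hence code_size == cs * 2; the final assertion checks that the 10-bit quantizer reconstructs x more accurately than the 8-bit one.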
class TestRevSwigPtr(unittest.TestCase):
...@@ -132,7 +144,7 @@ class TestException(unittest.TestCase):
        try:
            # an unsupported operation for IndexFlat
            index.add_with_ids(a, b)
        except RuntimeError as e:
            assert 'add_with_ids not implemented' in str(e)
        else:
            assert False, 'exception did not fire???'
...@@ -141,14 +153,14 @@ class TestException(unittest.TestCase):
        try:
            faiss.index_factory(12, 'IVF256,Flat,PQ8')
        except RuntimeError as e:
            assert 'could not parse' in str(e)
        else:
            assert False, 'exception did not fire???'


class TestMapLong2Long(unittest.TestCase):

    def test_maplong2long(self):
        keys = np.array([13, 45, 67])
        vals = np.array([3, 8, 2])
...@@ -160,5 +172,46 @@ class TestMapLong2Long(unittest.TestCase):
        assert m.search(12343) == -1
class TestOrthogonalReconstruct(unittest.TestCase):

    def test_recons_orthonormal(self):
        lt = faiss.LinearTransform(20, 10, True)
        rs = np.random.RandomState(10)
        A, _ = np.linalg.qr(rs.randn(20, 20))
        A = A[:10].astype('float32')
        faiss.copy_array_to_vector(A.ravel(), lt.A)
        faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b)

        lt.set_is_orthonormal()
        assert lt.is_orthonormal

        x = rs.rand(30, 20).astype('float32')
        xt = lt.apply_py(x)
        xtt = lt.reverse_transform(xt)
        xttt = lt.apply_py(xtt)

        err = ((xt - xttt)**2).sum()

        self.assertGreater(1e-5, err)

    def test_recons_orthogonal_impossible(self):
        lt = faiss.LinearTransform(20, 10, True)
        rs = np.random.RandomState(10)
        A = rs.randn(10 * 20).astype('float32')
        faiss.copy_array_to_vector(A.ravel(), lt.A)
        faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b)

        lt.set_is_orthonormal()
        assert not lt.is_orthonormal

        x = rs.rand(30, 20).astype('float32')
        xt = lt.apply_py(x)
        try:
            lt.reverse_transform(xt)
        except Exception:
            pass
        else:
            self.fail('reverse_transform should have raised an exception')
if __name__ == '__main__':
    unittest.main()
...@@ -6,7 +6,6 @@
#! /usr/bin/env python2

import unittest
import faiss
...
...@@ -11,6 +11,8 @@
import numpy as np
import unittest
import faiss
import tempfile
import os
def get_dataset(d, nb, nt, nq):
...@@ -56,6 +58,7 @@ class EvalIVFPQAccuracy(unittest.TestCase):
        coarse_quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.train(xt)
        index.add(xb)
        index.nprobe = 4
...@@ -65,6 +68,23 @@ class EvalIVFPQAccuracy(unittest.TestCase):
        self.assertGreater(n_ok, nq * 0.66)
        # check that an Index2Layer gives the same reconstruction
        # this is a bit fragile: it assumes 2 runs of training give
        # the exact same result.
        index2 = faiss.Index2Layer(coarse_quantizer, 32, 8)

        if True:
            index2.train(xt)
        else:
            index2.pq = index.pq
            index2.is_trained = True
        index2.add(xb)
        ref_recons = index.reconstruct_n(0, nb)
        new_recons = index2.reconstruct_n(0, nb)
        self.assertTrue(np.all(ref_recons == new_recons))
class TestMultiIndexQuantizer(unittest.TestCase):
...@@ -114,6 +134,7 @@ class TestScalarQuantizer(unittest.TestCase):
        index = faiss.IndexIVFFlat(quantizer, d, ncent,
                                   faiss.METRIC_L2)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.nprobe = 4
        index.train(xt)
        index.add(xb)
...@@ -201,5 +222,175 @@ class TestRangeSearch(unittest.TestCase):
        self.assertGreaterEqual(1e-4, abs(Dline[idx] - dis))
class TestSearchAndReconstruct(unittest.TestCase):

    def run_search_and_reconstruct(self, index, xb, xq, k=10, eps=None):
        n, d = xb.shape
        assert xq.shape[1] == d
        assert index.d == d

        D_ref, I_ref = index.search(xq, k)
        R_ref = index.reconstruct_n(0, n)
        D, I, R = index.search_and_reconstruct(xq, k)

        self.assertTrue((D == D_ref).all())
        self.assertTrue((I == I_ref).all())
        self.assertEqual(R.shape[:2], I.shape)
        self.assertEqual(R.shape[2], d)

        # (n, k, ..) -> (n * k, ..)
        I_flat = I.reshape(-1)
        R_flat = R.reshape(-1, d)
        # Filter out -1s when not enough results
        R_flat = R_flat[I_flat >= 0]
        I_flat = I_flat[I_flat >= 0]

        recons_ref_err = np.mean(np.linalg.norm(R_flat - R_ref[I_flat]))
        self.assertLessEqual(recons_ref_err, 1e-6)

        def norm1(x):
            return np.sqrt((x ** 2).sum(axis=1))

        recons_err = np.mean(norm1(R_flat - xb[I_flat]))

        print('Reconstruction error = %.3f' % recons_err)
        if eps is not None:
            self.assertLessEqual(recons_err, eps)

        return D, I, R
    def test_IndexFlat(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
        index = faiss.IndexFlatL2(d)
        index.add(xb)

        self.run_search_and_reconstruct(index, xb, xq, eps=0.0)

    def test_IndexIVFFlat(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFFlat(quantizer, d, 32, faiss.METRIC_L2)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.nprobe = 4
        index.train(xt)
        index.add(xb)

        self.run_search_and_reconstruct(index, xb, xq, eps=0.0)

    def test_IndexIVFPQ(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(quantizer, d, 32, 8, 8)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.nprobe = 4
        index.train(xt)
        index.add(xb)

        self.run_search_and_reconstruct(index, xb, xq, eps=1.0)

    def test_MultiIndex(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
        index = faiss.index_factory(d, "IMI2x5,PQ8np")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4)
        index.train(xt)
        index.add(xb)

        self.run_search_and_reconstruct(index, xb, xq, eps=1.0)

    def test_IndexTransform(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
        index = faiss.index_factory(d, "L2norm,PCA8,IVF32,PQ8np")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4)
        index.train(xt)
        index.add(xb)

        self.run_search_and_reconstruct(index, xb, xq)
class TestHNSW(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        unittest.TestCase.__init__(self, *args, **kwargs)
        d = 32
        nt = 0
        nb = 1500
        nq = 500

        (_, self.xb, self.xq) = get_dataset_2(d, nb, nt, nq)

        index = faiss.IndexFlatL2(d)
        index.add(self.xb)
        Dref, Iref = index.search(self.xq, 1)
        self.Iref = Iref

    def test_hnsw(self):
        d = self.xq.shape[1]

        index = faiss.IndexHNSWFlat(d, 16)
        index.add(self.xb)
        Dhnsw, Ihnsw = index.search(self.xq, 1)

        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)

        self.io_and_retest(index, Dhnsw, Ihnsw)

    def io_and_retest(self, index, Dhnsw, Ihnsw):
        _, tmpfile = tempfile.mkstemp()
        try:
            faiss.write_index(index, tmpfile)
            index2 = faiss.read_index(tmpfile)
        finally:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)

        Dhnsw2, Ihnsw2 = index2.search(self.xq, 1)

        self.assertTrue(np.all(Dhnsw2 == Dhnsw))
        self.assertTrue(np.all(Ihnsw2 == Ihnsw))

    def test_hnsw_2level(self):
        d = self.xq.shape[1]

        quant = faiss.IndexFlatL2(d)

        index = faiss.IndexHNSW2Level(quant, 256, 8, 8)
        index.train(self.xb)
        index.add(self.xb)
        Dhnsw, Ihnsw = index.search(self.xq, 1)

        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 310)

        self.io_and_retest(index, Dhnsw, Ihnsw)
if __name__ == '__main__':
    unittest.main()
...@@ -943,12 +943,14 @@ static void knn_L2sqr_blas (const float * x,
 * KNN driver functions
 *******************************************************/
int distance_compute_blas_threshold = 20;
void knn_inner_product (const float * x,
                        const float * y,
                        size_t d, size_t nx, size_t ny,
                        float_minheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_inner_product_sse (x, y, d, nx, ny, res);
    } else {
        knn_inner_product_blas (x, y, d, nx, ny, res);
...@@ -968,7 +970,7 @@ void knn_L2sqr (const float * x,
                size_t d, size_t nx, size_t ny,
                float_maxheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_L2sqr_sse (x, y, d, nx, ny, res);
    } else {
        NopDistanceCorrection nop;
...@@ -1270,7 +1272,7 @@ void range_search_L2sqr (
        RangeSearchResult *res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<true> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<true> (x, y, d, nx, ny, radius, res);
...@@ -1285,7 +1287,7 @@ void range_search_inner_product (
        RangeSearchResult *res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<false> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<false> (x, y, d, nx, ny, radius, res);
...
...@@ -195,6 +195,8 @@ void fvec_L2sqr_by_idx (
 * KNN functions
 ***************************************************************************/
// threshold on nx above which we switch to BLAS to compute distances
extern int distance_compute_blas_threshold;
/** Return the k nearest neighbors of each of the nx vectors x among the ny
 * vectors y, w.r.t. max inner product
...
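Since the crossover is now a plain global instead of a hard-coded 20, it can be tuned at runtime; a sketch (illustrative, not part of the commit):

#include "utils.h"

// Lower the SSE/BLAS crossover so that even small query batches take
// the BLAS path; raising it instead favours the SSE loop for larger
// batches.
void prefer_blas_for_small_batches ()
{
    faiss::distance_compute_blas_threshold = 4;
}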