sync with FB version. Added:

- better selection of training sets for PQ and preprocessing
- GPU parameter object
- IndexIDMap fixed
- fixed redo bug in clustering

sync with FB version. Added:
- better selection of training sets for PQ and preprocessing
- GPU parameter object
- IndexIDMap fixed
- fixed redo bug in clustering
c5077070 · matthijs · acb93857 · c5077070 · c5077070 · c5077070
Commit c5077070 authored Mar 20, 2017 by matthijs
41 changed files
--- a/Clustering.cpp
+++ b/Clustering.cpp
@@ -104,14 +104,13 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
               int(nx), d, k, nredo, niter);
    idx_t * assign = new idx_t[nx];
    float * dis = new float[nx];
    float best_err = 1e50;
    double t_search_tot = 0;
    if (verbose) {
-        printf("  Preprocessing in %5g s\n",
+        printf("  Preprocessing in %.2f s\n",
               (getmillisecs() - t0)/1000.);
    }
    t0 = getmillisecs();
@@ -149,7 +148,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
        if (!index.is_trained)
            index.train (k, cur_centroids.data());
-        FAISS_ASSERT (index.ntotal == 0 );
+        FAISS_ASSERT (index.ntotal == 0);
        index.add (k, cur_centroids.data());
        float err = 0;
        for (int i = 0; i < niter; i++) {
@@ -183,16 +182,17 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                index.train (k, cur_centroids.data());
            assert (index.ntotal == 0);
-            index.add (k, centroids.data());
+            index.add (k, cur_centroids.data());
        }
        if (verbose) printf("\n");
        if (nredo > 1) {
            if (err < best_err) {
                if (verbose)
-                    printf ("Keep new clusters\n");
+                    printf ("Objective improved: keep new clusters\n");
-                centroids = cur_centroids;
+                centroids = buf_centroids;
                best_err = err;
            }
+            index.reset ();
        }
    }

--- a/Clustering.h
+++ b/Clustering.h
@@ -26,7 +26,6 @@ struct ClusteringParameters {
    int niter;          ///< clustering iterations
    int nredo;          ///< redo clustering this many times and keep best
    bool verbose;
    bool spherical;     ///< do we want normalized centroids?
    bool update_index;  ///< update index after each iteration?

--- a/IndexIVFPQ.cpp
+++ b/IndexIVFPQ.cpp
@@ -82,8 +82,11 @@ void IndexIVFPQ::train_residual (idx_t n, const float *x)
 void IndexIVFPQ::train_residual_o (idx_t n, const float *x, float *residuals_2)
 {
-    idx_t ntrain = pq.ksub * 64;
+    const float * x_in = x;
-    if(n > ntrain) n = ntrain;
+    x = fvecs_maybe_subsample (
+         d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
+         x, verbose, pq.cp.seed);
    const float *trainset;
    if (by_residual) {
@@ -132,6 +135,7 @@ void IndexIVFPQ::train_residual_o (idx_t n, const float *x, float *residuals_2)
        precompute_table ();
    }
+    if (x_in != x) delete [] x;
 }

--- a/Makefile
+++ b/Makefile
@@ -94,8 +94,6 @@ AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h utils.h Heap.h \
 IndexFlat.h VectorTransform.h IndexLSH.h IndexPQ.h ProductQuantizer.h \
 Clustering.h PolysemousTraining.h IndexIVF.h IndexIVFPQ.h MetaIndexes.h
 AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
-BinaryCode.o: BinaryCode.cpp BinaryCode.h VectorTransform.h Index.h \
- FaissAssert.h hamming.h Heap.h
 Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
 FaissAssert.h IndexFlat.h
 hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h
@@ -105,7 +103,7 @@ IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
 FaissAssert.h
 index_io.o: index_io.cpp index_io.h FaissAssert.h IndexFlat.h Index.h \
 VectorTransform.h IndexLSH.h IndexPQ.h ProductQuantizer.h Clustering.h \
- Heap.h PolysemousTraining.h IndexIVF.h IndexIVFPQ.h
+ Heap.h PolysemousTraining.h IndexIVF.h IndexIVFPQ.h MetaIndexes.h
 IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
 hamming.h FaissAssert.h IndexFlat.h AuxIndexStructures.h
 IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
@@ -113,12 +111,8 @@ IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
 IndexFlat.h hamming.h FaissAssert.h AuxIndexStructures.h
 IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
 Heap.h hamming.h FaissAssert.h
-IndexNested.o: IndexNested.cpp IndexNested.h IndexIVF.h Index.h \
- Clustering.h Heap.h IndexIVFPQ.h IndexPQ.h ProductQuantizer.h \
- PolysemousTraining.h IndexFlat.h FaissAssert.h
 IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
 Heap.h PolysemousTraining.h FaissAssert.h hamming.h
 MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h Heap.h
 PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
 ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
@@ -131,6 +125,7 @@ VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
 Heap.h FaissAssert.h IndexPQ.h ProductQuantizer.h Clustering.h \
 PolysemousTraining.h
 clean:
 	rm -f $(LIBNAME).a $(LIBNAME).$(SHAREDEXT)* *.o \
 	   	lua/swigfaiss.so lua/swigfaiss_wrap.cxx \

--- a/MetaIndexes.h
+++ b/MetaIndexes.h
@@ -49,7 +49,7 @@ struct IndexIDMap : Index {
    virtual void set_typename () override;
    virtual ~IndexIDMap ();
+    IndexIDMap () {own_fields=false; index=nullptr; }
 };
 /** Index that concatenates the results from several sub-indexes

--- a/VectorTransform.cpp
+++ b/VectorTransform.cpp
@@ -95,7 +95,7 @@ void VectorTransform::reverse_transform (
 LinearTransform::LinearTransform (int d_in, int d_out,
                                  bool have_bias):
    VectorTransform (d_in, d_out), have_bias (have_bias),
-    max_points_per_d (1 << 20), verbose (false)
+    verbose (false)
 {}
 void LinearTransform::apply_noalloc (Index::idx_t n, const float * x,
@@ -152,27 +152,6 @@ void LinearTransform::transform_transpose (idx_t n, const float * y,
    if (have_bias) delete [] y;
 }
-const float * LinearTransform::maybe_subsample_train_set (
-            Index::idx_t *n, const float *x)
-{
-    if (*n <= max_points_per_d * d_in) return x;
-    size_t n2 = max_points_per_d * d_in;
-    if (verbose) {
-        printf ("  Input training set too big, sampling "
-                "%ld / %ld vectors\n", n2, *n);
-    }
-    std::vector<int> subset (*n);
-    rand_perm (subset.data (), *n, 1234);
-    float *x_subset = new float[n2 * d_in];
-    for (long i = 0; i < n2; i++)
-        memcpy (&x_subset[i * d_in],
-                &x[subset[i] * size_t(d_in)],
-                sizeof (x[0]) * d_in);
-    *n = n2;
-    return x_subset;
-}
 /*********************************************
 * RandomRotationMatrix
@@ -228,7 +207,8 @@ void PCAMatrix::train (Index::idx_t n, const float *x)
 {
    const float * x_in = x;
-    x = maybe_subsample_train_set(&n, x);
+    x = fvecs_maybe_subsample (d_in, (size_t*)&n,
+                               max_points_per_d * d_in, x, verbose);
    // compute mean
    mean.clear(); mean.resize(d_in, 0.0);
@@ -461,7 +441,8 @@ OPQMatrix::OPQMatrix (int d, int M, int d2):
    verbose(false)
 {
    is_trained = false;
-    max_points_per_d = 1000;
+    // OPQ is quite expensive to train, so set this right.
+    max_train_points = 256 * 256;
 }
@@ -471,7 +452,8 @@ void OPQMatrix::train (Index::idx_t n, const float *x)
    const float * x_in = x;
-    x = maybe_subsample_train_set (&n, x);
+    x = fvecs_maybe_subsample (d_in, (size_t*)&n,
+                               max_train_points, x, verbose);
    // To support d_out > d_in, we pad input vectors with 0s to d_out
    size_t d = d_out <= d_in ? d_in : d_out;

--- a/VectorTransform.h
+++ b/VectorTransform.h
@@ -100,13 +100,8 @@ struct LinearTransform: VectorTransform {
    void transform_transpose (idx_t n, const float * y,
                              float *x) const;
-    // ratio between # training vectors and dimension
-    size_t max_points_per_d;
    bool verbose;
-    // subsamples training set if there are too many vectors
-    const float *maybe_subsample_train_set (Index::idx_t *n, const float *x);
    virtual ~LinearTransform () {}
@@ -146,6 +141,9 @@ struct PCAMatrix: LinearTransform {
    /// random rotation after PCA
    bool random_rotation;
+    /// ratio between # training vectors and dimension
+    size_t max_points_per_d;
    /// try to distribute output eigenvectors in this many bins
    int balanced_bins;
@@ -191,8 +189,9 @@ struct OPQMatrix: LinearTransform {
    int niter;      ///< Number of outer training iterations
    int niter_pq;   ///< Number of training iterations for the PQ
    int niter_pq_0; ///< same, for the first outer iteration
    /// if there are too many training points, resample
-    int max_points_per_d;
+    size_t max_train_points;
    bool verbose;
    /// if d2 != -1, output vectors of this dimension

--- a/benchs/bench_gpu_sift1m.py
+++ b/benchs/bench_gpu_sift1m.py
@@ -52,7 +52,10 @@ res = faiss.StandardGpuResources()
 print "============ Exact search"
-index = faiss.GpuIndexFlatL2(res, 0, d, False)
+flat_config = faiss.GpuIndexFlatConfig()
+flat_config.device = 0
+index = faiss.GpuIndexFlatL2(res, d, flat_config)
 print "add vectors to index"

--- a/benchs/kmeans_mnist.py
+++ b/benchs/kmeans_mnist.py
@@ -55,12 +55,17 @@ def train_kmeans(x, k, ngpu):
    res = [faiss.StandardGpuResources() for i in range(ngpu)]
-    useFloat16 = False
+    flat_config = []
+    for i in range(ngpu):
+        cfg = faiss.GpuIndexFlatConfig()
+        cfg.useFloat16 = False
+        cfg.device = i
+        flat_config.append(cfg)
    if ngpu == 1:
-        index = faiss.GpuIndexFlatL2(res[0], 0, d, useFloat16)
+        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
-        indexes = [faiss.GpuIndexFlatL2(res[i], i, d, useFloat16)
+        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(ngpu)]
        index = faiss.IndexProxy()
        for sub_index in indexes:

--- a/gpu/GpuAutoTune.cpp
+++ b/gpu/GpuAutoTune.cpp
@@ -65,6 +65,7 @@ GpuClonerOptions::GpuClonerOptions():
    useFloat16(false),
    usePrecomputed(true),
    reserveVecs(0),
+    storeTransposed(false),
    verbose(0)
 {}
@@ -79,7 +80,12 @@ struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
    Index *clone_Index(const Index *index) override {
        if(auto ifl = dynamic_cast<const IndexFlat *>(index)) {
-          return new GpuIndexFlat(resources, device, useFloat16, ifl);
+          GpuIndexFlatConfig config;
+          config.device = device;
+          config.useFloat16 = useFloat16;
+          config.storeTransposed = storeTransposed;
+          return new GpuIndexFlat(resources, ifl, config);
        } else if(auto ifl = dynamic_cast<const faiss::IndexIVFFlat *>(index)) {
          GpuIndexIVFFlat *res =
            new GpuIndexIVFFlat(resources,

--- a/gpu/GpuAutoTune.h
+++ b/gpu/GpuAutoTune.h
@@ -40,6 +40,8 @@ struct GpuClonerOptions {
  bool usePrecomputed;
  /// reserve vectors in the invfiles?
  long reserveVecs;
+  /// For GpuIndexFlat, store data in transposed layout?
+  bool storeTransposed;
  int verbose;
  GpuClonerOptions ();
 };

--- a/gpu/GpuIndexFlat.cu
+++ b/gpu/GpuIndexFlat.cu
@@ -31,31 +31,30 @@ constexpr size_t kMinPagedQuerySize = (size_t) 256 * 1024 * 1024;
 constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
 GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
-                           int device,
+                           const faiss::IndexFlat* index,
-                           bool useFloat16,
+                           GpuIndexFlatConfig config) :
-                           const faiss::IndexFlat* index) :
+    GpuIndex(resources, config.device, index->d, index->metric_type),
-    GpuIndex(resources, device, index->d, index->metric_type),
    minPagedSize_(kMinPagedQuerySize),
-    useFloat16_(useFloat16),
+    config_(config),
    data_(nullptr) {
  copyFrom(index);
 }
 GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
-                           int device,
                           int dims,
-                           bool useFloat16,
+                           faiss::MetricType metric,
-                           faiss::MetricType metric) :
+                           GpuIndexFlatConfig config) :
-    GpuIndex(resources, device, dims, metric),
+    GpuIndex(resources, config.device, dims, metric),
    minPagedSize_(kMinPagedQuerySize),
-    useFloat16_(useFloat16),
+    config_(config),
    data_(nullptr) {
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
-                        useFloat16);
+                        config_.useFloat16,
+                        config_.storeTransposed);
 }
 GpuIndexFlat::~GpuIndexFlat() {
@@ -74,7 +73,7 @@ GpuIndexFlat::getMinPagingSize() const {
 bool
 GpuIndexFlat::getUseFloat16() const {
-  return useFloat16_;
+  return config_.useFloat16;
 }
 void
@@ -93,7 +92,8 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
-                        useFloat16_);
+                        config_.useFloat16,
+                        config_.storeTransposed);
  // The index could be empty
  if (index->ntotal > 0) {
@@ -117,7 +117,7 @@ GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  auto stream = resources_->getDefaultStream(device_);
  if (this->ntotal > 0) {
-    if (useFloat16_) {
+    if (config_.useFloat16) {
      auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
      fromDevice(vecFloat32, index->xb.data(), stream);
    } else {
@@ -444,7 +444,7 @@ GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
  FAISS_ASSERT(key < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);
-  if (useFloat16_) {
+  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
@@ -463,7 +463,7 @@ GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
  FAISS_ASSERT(i0 + num - 1 < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);
-  if (useFloat16_) {
+  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
@@ -486,17 +486,15 @@ GpuIndexFlat::set_typename() {
 //
 GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
-                               int device,
+                               faiss::IndexFlatL2* index,
-                               bool useFloat16,
+                               GpuIndexFlatConfig config) :
-                               faiss::IndexFlatL2* index) :
+    GpuIndexFlat(resources, index, config) {
-    GpuIndexFlat(resources, device, useFloat16, index) {
 }
 GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
-                               int device,
                               int dims,
-                               bool useFloat16) :
+                               GpuIndexFlatConfig config) :
-    GpuIndexFlat(resources, device, dims, useFloat16, faiss::METRIC_L2) {
+    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
 }
 void
@@ -514,18 +512,15 @@ GpuIndexFlatL2::copyTo(faiss::IndexFlatL2* index) {
 //
 GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
-                               int device,
+                               faiss::IndexFlatIP* index,
-                               bool useFloat16,
+                               GpuIndexFlatConfig config) :
-                               faiss::IndexFlatIP* index) :
+    GpuIndexFlat(resources, index, config) {
-    GpuIndexFlat(resources, device, useFloat16, index) {
 }
 GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
-                               int device,
                               int dims,
-                               bool useFloat16) :
+                               GpuIndexFlatConfig config) :
-    GpuIndexFlat(resources, device, dims, useFloat16,
+    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
-                 faiss::METRIC_INNER_PRODUCT) {
 }
 void

--- a/gpu/GpuIndexFlat.h
+++ b/gpu/GpuIndexFlat.h
@@ -25,6 +25,18 @@ namespace faiss { namespace gpu {
 struct FlatIndex;
+struct GpuIndexFlatConfig {
+  inline GpuIndexFlatConfig()
+      : device(0),
+        useFloat16(false),
+        storeTransposed(false) {
+  }
+  int device;
+  bool useFloat16;
+  bool storeTransposed;
+};
 /// Wrapper around the GPU implementation that looks like
 /// faiss::IndexFlat; copies over centroid data from a given
 /// faiss::IndexFlat
@@ -33,16 +45,14 @@ class GpuIndexFlat : public GpuIndex {
  /// Construct from a pre-existing faiss::IndexFlat instance, copying
  /// data over to the given GPU
  GpuIndexFlat(GpuResources* resources,
-               int device,
+               const faiss::IndexFlat* index,
-               bool useFloat16,
+               GpuIndexFlatConfig config = GpuIndexFlatConfig());
-               const faiss::IndexFlat* index);
  /// Construct an empty instance that can be added to
  GpuIndexFlat(GpuResources* resources,
-               int device,
               int dims,
-               bool useFloat16,
+               faiss::MetricType metric,
-               faiss::MetricType metric);
+               GpuIndexFlatConfig config = GpuIndexFlatConfig());
  ~GpuIndexFlat() override;
@@ -118,8 +128,7 @@ class GpuIndexFlat : public GpuIndex {
  /// Size above which we page copies from the CPU to GPU
  size_t minPagedSize_;
-  /// Whether or not we store our vectors in float32 or float16
+  const GpuIndexFlatConfig config_;
-  const bool useFloat16_;
  /// Holds our GPU data containing the list of vectors
  FlatIndex* data_;
@@ -133,15 +142,13 @@ class GpuIndexFlatL2 : public GpuIndexFlat {
  /// Construct from a pre-existing faiss::IndexFlatL2 instance, copying
  /// data over to the given GPU
  GpuIndexFlatL2(GpuResources* resources,
-                 int device,
+                 faiss::IndexFlatL2* index,
-                 bool useFloat16,
+                 GpuIndexFlatConfig config = GpuIndexFlatConfig());
-                 faiss::IndexFlatL2* index);
  /// Construct an empty instance that can be added to
  GpuIndexFlatL2(GpuResources* resources,
-                 int device,
                 int dims,
-                 bool useFloat16);
+                 GpuIndexFlatConfig config = GpuIndexFlatConfig());
  /// Initialize ourselves from the given CPU index; will overwrite
  /// all data in ourselves
@@ -160,15 +167,13 @@ class GpuIndexFlatIP : public GpuIndexFlat {
  /// Construct from a pre-existing faiss::IndexFlatIP instance, copying
  /// data over to the given GPU
  GpuIndexFlatIP(GpuResources* resources,
-                 int device,
+                 faiss::IndexFlatIP* index,
-                 bool useFloat16,
+                 GpuIndexFlatConfig config = GpuIndexFlatConfig());
-                 faiss::IndexFlatIP* index);
  /// Construct an empty instance that can be added to
  GpuIndexFlatIP(GpuResources* resources,
-                 int device,
                 int dims,
-                 bool useFloat16);
+                 GpuIndexFlatConfig config = GpuIndexFlatConfig());
  /// Initialize ourselves from the given CPU index; will overwrite
  /// all data in ourselves

--- a/gpu/GpuIndexIVF.cu
+++ b/gpu/GpuIndexIVF.cu
@@ -82,14 +82,17 @@ GpuIndexIVF::init_() {
  if (!quantizer_) {
    // Construct an empty quantizer
+    GpuIndexFlatConfig config;
+    config.device = device_;
+    config.useFloat16 = useFloat16CoarseQuantizer_;
+    config.storeTransposed = false;
    if (this->metric_type == faiss::METRIC_L2) {
      // FIXME: 2 different float16 options?
-      quantizer_ = new GpuIndexFlatL2(resources_, device_, this->d,
+      quantizer_ = new GpuIndexFlatL2(resources_, this->d, config);
-                                      useFloat16CoarseQuantizer_);
    } else if (this->metric_type == faiss::METRIC_INNER_PRODUCT) {
      // FIXME: 2 different float16 options?
-      quantizer_ = new GpuIndexFlatIP(resources_, device_, this->d,
+      quantizer_ = new GpuIndexFlatIP(resources_, this->d, config);
-                                      useFloat16CoarseQuantizer_);
    } else {
      // unknown metric type
      FAISS_ASSERT(false);
@@ -131,14 +134,17 @@ GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) {
  delete quantizer_;
  quantizer_ = nullptr;
+  GpuIndexFlatConfig config;
+  config.device = device_;
+  config.useFloat16 = useFloat16CoarseQuantizer_;
+  config.storeTransposed = false;
  if (index->metric_type == faiss::METRIC_L2) {
    // FIXME: 2 different float16 options?
-    quantizer_ = new GpuIndexFlatL2(resources_, device_, this->d,
+    quantizer_ = new GpuIndexFlatL2(resources_, this->d, config);
-                                    useFloat16CoarseQuantizer_);
  } else if (index->metric_type == faiss::METRIC_INNER_PRODUCT) {
    // FIXME: 2 different float16 options?
-    quantizer_ = new GpuIndexFlatIP(resources_, device_, this->d,
+    quantizer_ = new GpuIndexFlatIP(resources_, this->d, config);
-                                    useFloat16CoarseQuantizer_);
  } else {
    // unknown metric type
    FAISS_ASSERT(false);

--- a/gpu/IndexProxy.cpp
+++ b/gpu/IndexProxy.cpp
@@ -92,8 +92,8 @@ IndexProxy::runOnIndex(std::function<void(faiss::Index*)> f) {
  }
  // Blocking wait for completion
-  for (auto& f : v) {
+  for (auto& func : v) {
-    f.get();
+    func.get();
  }
 }
@@ -183,7 +183,8 @@ IndexProxy::set_typename() {
 float kmeans_clustering_gpu (int ngpu, size_t d, size_t n, size_t k,
                             const float *x,
                             float *centroids,
-                             bool useFloat16)
+                             bool useFloat16,
+                             bool storeTransposed)
 {
    Clustering clus (d, k);
    // display logs if > 16Gflop per iteration
@@ -194,8 +195,15 @@ float kmeans_clustering_gpu (int ngpu, size_t d, size_t n, size_t k,
    std::vector<std::unique_ptr<GpuIndexFlatL2> > sub_indices;
    for(int dev_no = 0; dev_no < ngpu; dev_no++) {
        res.emplace_back(new StandardGpuResources());
+        GpuIndexFlatConfig config;
+        config.device = dev_no;
+        config.useFloat16 = useFloat16;
+        config.storeTransposed = storeTransposed;
        sub_indices.emplace_back(
-          new GpuIndexFlatL2(res.back().get(), dev_no, d, useFloat16));
+          new GpuIndexFlatL2(res.back().get(), d, config));
    }
    IndexProxy proxy;

--- a/gpu/IndexProxy.h
+++ b/gpu/IndexProxy.h
@@ -98,7 +98,8 @@ class IndexProxy : public faiss::Index {
 float kmeans_clustering_gpu (int ngpu, size_t d, size_t n, size_t k,
                             const float *x,
                             float *centroids,
-                             bool useFloat16);
+                             bool useFloat16,
+                             bool storeTransposed);

--- a/gpu/Makefile
+++ b/gpu/Makefile
--- a/gpu/StandardGpuResources.cpp
+++ b/gpu/StandardGpuResources.cpp
@@ -119,7 +119,8 @@ StandardGpuResources::initializeForDevice(int device) {
  auto& prop = getDeviceProperties(device);
  // Also check to make sure we meet our minimum compute capability (3.5)
-  FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5));
+  FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5) ||
+               !"Device not supported, need 3.5+ compute capability");
  // Create streams
  cudaStream_t defaultStream = 0;

--- a/gpu/impl/Distance.cu
+++ b/gpu/impl/Distance.cu
@@ -33,6 +33,7 @@ constexpr int kDefaultTileSize = 256;
 template <typename T>
 void runL2Distance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
+                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 1, true>* centroidNorms,
                   Tensor<T, 2, true>& queries,
                   int k,
@@ -132,7 +133,8 @@ void runL2Distance(GpuResources* resources,
    // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
    runMatrixMult(distanceBufView, false,
                  queryView, false,
-                  centroids, true,
+                  centroidsTransposed ? *centroidsTransposed : centroids,
+                  centroidsTransposed ? false : true,
                  -2.0f, 0.0f,
                  resources->getBlasHandleCurrentDevice(),
                  streams[curStream]);
@@ -164,6 +166,7 @@ void runL2Distance(GpuResources* resources,
 template <typename T>
 void runIPDistance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
+                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 2, true>& queries,
                   int k,
                   Tensor<T, 2, true>& outDistances,
@@ -236,7 +239,9 @@ void runIPDistance(GpuResources* resources,
    // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
    runMatrixMult(distanceBufView, false,
-                  queryView, false, centroids, true,
+                  queryView, false,
+                  centroidsTransposed ? *centroidsTransposed : centroids,
+                  centroidsTransposed ? false : true,
                  1.0f, 0.0f,
                  resources->getBlasHandleCurrentDevice(),
                  streams[curStream]);
@@ -261,6 +266,7 @@ void runIPDistance(GpuResources* resources,
 void
 runIPDistance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
+              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 2, true>& queries,
              int k,
              Tensor<float, 2, true>& outDistances,
@@ -268,6 +274,7 @@ runIPDistance(GpuResources* resources,
              int tileSize) {
  runIPDistance<float>(resources,
                       vectors,
+                       vectorsTransposed,
                       queries,
                       k,
                       outDistances,
@@ -279,6 +286,7 @@ runIPDistance(GpuResources* resources,
 void
 runIPDistance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
+              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 2, true>& queries,
              int k,
              Tensor<half, 2, true>& outDistances,
@@ -286,6 +294,7 @@ runIPDistance(GpuResources* resources,
              int tileSize) {
  runIPDistance<half>(resources,
                      vectors,
+                      vectorsTransposed,
                      queries,
                      k,
                      outDistances,
@@ -297,6 +306,7 @@ runIPDistance(GpuResources* resources,
 void
 runL2Distance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
+              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 1, true>* vectorNorms,
              Tensor<float, 2, true>& queries,
              int k,
@@ -306,6 +316,7 @@ runL2Distance(GpuResources* resources,
              int tileSize) {
  runL2Distance<float>(resources,
                       vectors,
+                       vectorsTransposed,
                       vectorNorms,
                       queries,
                       k,
@@ -319,6 +330,7 @@ runL2Distance(GpuResources* resources,
 void
 runL2Distance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
+              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 1, true>* vectorNorms,
              Tensor<half, 2, true>& queries,
              int k,
@@ -328,6 +340,7 @@ runL2Distance(GpuResources* resources,
              int tileSize) {
  runL2Distance<half>(resources,
                      vectors,
+                      vectorsTransposed,
                      vectorNorms,
                      queries,
                      k,

--- a/gpu/impl/Distance.cuh
+++ b/gpu/impl/Distance.cuh
@@ -22,6 +22,7 @@ class GpuResources;
 /// `queries`, returning the k closest results seen
 void runL2Distance(GpuResources* resources,
                   Tensor<float, 2, true>& vectors,
+                   Tensor<float, 2, true>* vectorsTransposed,
                   // can be optionally pre-computed; nullptr if we
                   // have to compute it upon the call
                   Tensor<float, 1, true>* vectorNorms,
@@ -41,6 +42,7 @@ void runL2Distance(GpuResources* resources,
 /// and `queries`, returning the k closest results seen
 void runIPDistance(GpuResources* resources,
                   Tensor<float, 2, true>& vectors,
+                   Tensor<float, 2, true>* vectorsTransposed,
                   Tensor<float, 2, true>& queries,
                   int k,
                   Tensor<float, 2, true>& outDistances,
@@ -53,6 +55,7 @@ void runIPDistance(GpuResources* resources,
 #ifdef FAISS_USE_FLOAT16
 void runIPDistance(GpuResources* resources,
                   Tensor<half, 2, true>& vectors,
+                   Tensor<half, 2, true>* vectorsTransposed,
                   Tensor<half, 2, true>& queries,
                   int k,
                   Tensor<half, 2, true>& outDistances,
@@ -61,6 +64,7 @@ void runIPDistance(GpuResources* resources,
 void runL2Distance(GpuResources* resources,
                   Tensor<half, 2, true>& vectors,
+                   Tensor<half, 2, true>* vectorsTransposed,
                   Tensor<half, 1, true>* vectorNorms,
                   Tensor<half, 2, true>& queries,
                   int k,

--- a/gpu/impl/FlatIndex.cu
+++ b/gpu/impl/FlatIndex.cu
@@ -14,16 +14,19 @@
 #include "L2Norm.cuh"
 #include "../utils/CopyUtils.cuh"
 #include "../utils/DeviceUtils.h"
+#include "../utils/Transpose.cuh"
 namespace faiss { namespace gpu {
 FlatIndex::FlatIndex(GpuResources* res,
                     int dim,
                     bool l2Distance,
-                     bool useFloat16) :
+                     bool useFloat16,
+                     bool storeTransposed) :
    resources_(res),
    dim_(dim),
    useFloat16_(useFloat16),
+    storeTransposed_(storeTransposed),
    l2Distance_(l2Distance),
    num_(0) {
 #ifndef FAISS_USE_FLOAT16
@@ -92,7 +95,7 @@ FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
 }
 void
-FlatIndex::query(Tensor<float, 2, true>& vecs,
+FlatIndex::query(Tensor<float, 2, true>& input,
                 int k,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
@@ -104,12 +107,12 @@ FlatIndex::query(Tensor<float, 2, true>& vecs,
  if (useFloat16_) {
    // We need to convert to float16
 #ifdef FAISS_USE_FLOAT16
-    auto vecsHalf = toHalf<2>(resources_, stream, vecs);
+    auto inputHalf = toHalf<2>(resources_, stream, input);
    DeviceTensor<half, 2, true> outDistancesHalf(
      mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
-    query(vecsHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
+    query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
    if (exactDistance) {
      // Convert outDistances back
@@ -120,8 +123,9 @@ FlatIndex::query(Tensor<float, 2, true>& vecs,
    if (l2Distance_) {
      runL2Distance(resources_,
                    vectors_,
+                    storeTransposed_ ? &vectorsTransposed_ : nullptr,
                    &norms_,
-                    vecs,
+                    input,
                    k,
                    outDistances,
                    outIndices,
@@ -131,7 +135,8 @@ FlatIndex::query(Tensor<float, 2, true>& vecs,
    } else {
      runIPDistance(resources_,
                    vectors_,
-                    vecs,
+                    storeTransposed_ ? &vectorsTransposed_ : nullptr,
+                    input,
                    k,
                    outDistances,
                    outIndices,
@@ -142,7 +147,7 @@ FlatIndex::query(Tensor<float, 2, true>& vecs,
 #ifdef FAISS_USE_FLOAT16
 void
-FlatIndex::query(Tensor<half, 2, true>& vecs,
+FlatIndex::query(Tensor<half, 2, true>& input,
                 int k,
                 Tensor<half, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
@@ -153,8 +158,9 @@ FlatIndex::query(Tensor<half, 2, true>& vecs,
  if (l2Distance_) {
    runL2Distance(resources_,
                  vectorsHalf_,
+                  storeTransposed_ ? &vectorsHalfTransposed_ : nullptr,
                  &normsHalf_,
-                  vecs,
+                  input,
                  k,
                  outDistances,
                  outIndices,
@@ -164,7 +170,8 @@ FlatIndex::query(Tensor<half, 2, true>& vecs,
  } else {
    runIPDistance(resources_,
                  vectorsHalf_,
-                  vecs,
+                  storeTransposed_ ? &vectorsHalfTransposed_ : nullptr,
+                  input,
                  k,
                  outDistances,
                  outIndices,
@@ -215,6 +222,20 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
    vectors_ = std::move(vectors);
  }
+  if (storeTransposed_) {
+    if (useFloat16_) {
+#ifdef FAISS_USE_FLOAT16
+      vectorsHalfTransposed_ =
+        std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}));
+      runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
+#endif
+    } else {
+      vectorsTransposed_ =
+        std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}));
+      runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
+    }
+  }
  if (l2Distance_) {
    // Precompute L2 norms of our database
    if (useFloat16_) {

--- a/gpu/impl/FlatIndex.cuh
+++ b/gpu/impl/FlatIndex.cuh
@@ -25,7 +25,8 @@ class FlatIndex {
  FlatIndex(GpuResources* res,
            int dim,
            bool l2Distance,
-            bool useFloat16);
+            bool useFloat16,
+            bool storeTransposed);
  bool getUseFloat16() const;
@@ -84,6 +85,10 @@ class FlatIndex {
  /// Float16 data format
  const bool useFloat16_;
+  /// Store vectors in transposed layout for speed; makes addition to
+  /// the index slower
+  const bool storeTransposed_;
  /// L2 or inner product distance?
  bool l2Distance_;
@@ -95,10 +100,12 @@ class FlatIndex {
  /// Vectors currently in rawData_
  DeviceTensor<float, 2, true> vectors_;
+  DeviceTensor<float, 2, true> vectorsTransposed_;
 #ifdef FAISS_USE_FLOAT16
  /// Vectors currently in rawData_, float16 form
  DeviceTensor<half, 2, true> vectorsHalf_;
+  DeviceTensor<half, 2, true> vectorsHalfTransposed_;
 #endif
  /// Precomputed L2 norms

--- a/gpu/impl/IVFPQ.cu
+++ b/gpu/impl/IVFPQ.cu
@@ -187,6 +187,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
+                  nullptr, // no transposed storage
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1,

--- a/gpu/perf/CompareFlat.cu
+++ b/gpu/perf/CompareFlat.cu
@@ -31,6 +31,7 @@ DEFINE_int32(dim, 128, "# of dimensions");
 DEFINE_int32(num_queries, 3, "number of query vectors");
 DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
 DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32");
+DEFINE_bool(transposed, false, "store vectors transposed");
 DEFINE_int64(seed, -1, "specify random seed");
 DEFINE_int32(num_gpus, 1, "number of gpus to use");
 DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
@@ -38,7 +39,7 @@ DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
 using namespace faiss::gpu;
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  cudaProfilerStop();
@@ -59,6 +60,7 @@ int main(int argc, char** argv) {
  printf("L2 lookup: %d queries, total k %d\n",
         numQueries, FLAGS_k);
  printf("float16 encoding %s\n", FLAGS_use_float16 ? "enabled" : "disabled");
+  printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled");
  // Convert to GPU index
  printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
@@ -68,8 +70,13 @@ int main(int argc, char** argv) {
    ((faiss::gpu::StandardGpuResources*) res)->setPinnedMemory(
      FLAGS_pinned_mem);
+    GpuIndexFlatConfig config;
+    config.device = dev;
+    config.useFloat16 = FLAGS_use_float16;
+    config.storeTransposed = FLAGS_transposed;
    auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
-      new faiss::gpu::GpuIndexFlatL2(res, dev, FLAGS_use_float16, index.get()));
+      new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
    return p;
  };

--- a/gpu/perf/CompareIVFFlat.cu
+++ b/gpu/perf/CompareIVFFlat.cu
@@ -41,7 +41,7 @@ DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
 using namespace faiss::gpu;
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  cudaProfilerStop();

--- a/gpu/perf/CompareIVFPQ.cu
+++ b/gpu/perf/CompareIVFPQ.cu
@@ -41,7 +41,7 @@ DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
 using namespace faiss::gpu;
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  CUDA_VERIFY(cudaProfilerStop());

--- a/gpu/perf/CompareIVFPQGrid.cu
+++ b/gpu/perf/CompareIVFPQGrid.cu
@@ -34,7 +34,7 @@ DEFINE_int32(index, 2, "0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
 using namespace faiss::gpu;
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  auto seed = time(nullptr);
  auto k = FLAGS_k;

--- a/gpu/perf/PerfClustering.cpp
+++ b/gpu/perf/PerfClustering.cpp
@@ -28,6 +28,7 @@ DEFINE_int32(dim, 128, "# of dimensions");
 DEFINE_int32(niter, 10, "# of iterations");
 DEFINE_bool(L2_metric, true, "If true, use L2 metric. If false, use IP metric");
 DEFINE_bool(use_float16, false, "use float16 vectors and math");
+DEFINE_bool(transposed, false, "transposed vector storage");
 DEFINE_bool(verbose, false, "turn on clustering logging");
 DEFINE_int64(seed, -1, "specify random seed");
 DEFINE_int32(num_gpus, 1, "number of gpus to use");
@@ -38,7 +39,7 @@ DEFINE_int32(max_points, -1, "max points per centroid");
 using namespace faiss::gpu;
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  cudaProfilerStop();
@@ -52,6 +53,7 @@ int main(int argc, char** argv) {
         FLAGS_L2_metric ? "L2" : "IP",
         FLAGS_dim, FLAGS_k, FLAGS_num, FLAGS_niter);
  printf("float16 math %s\n", FLAGS_use_float16 ? "enabled" : "disabled");
+  printf("transposed storage %s\n", FLAGS_transposed ? "enabled" : "disabled");
  printf("verbose %s\n", FLAGS_verbose ? "enabled" : "disabled");
  auto initFn = [](faiss::gpu::GpuResources* res, int dev) ->
@@ -61,12 +63,17 @@ int main(int argc, char** argv) {
        FLAGS_pinned_mem);
    }
+    GpuIndexFlatConfig config;
+    config.device = dev;
+    config.useFloat16 = FLAGS_use_float16;
+    config.storeTransposed = FLAGS_transposed;
    auto p = std::unique_ptr<faiss::gpu::GpuIndexFlat>(
      FLAGS_L2_metric ?
      (faiss::gpu::GpuIndexFlat*)
-      new faiss::gpu::GpuIndexFlatL2(res, dev, FLAGS_dim, FLAGS_use_float16) :
+      new faiss::gpu::GpuIndexFlatL2(res, FLAGS_dim, config) :
      (faiss::gpu::GpuIndexFlat*)
-      new faiss::gpu::GpuIndexFlatIP(res, dev, FLAGS_dim, FLAGS_use_float16));
+      new faiss::gpu::GpuIndexFlatIP(res, FLAGS_dim, config));
    if (FLAGS_min_paging_size >= 0) {
      p->setMinPagingSize(FLAGS_min_paging_size);

--- a/gpu/perf/PerfIVFPQAdd.cpp
+++ b/gpu/perf/PerfIVFPQAdd.cpp
@@ -35,7 +35,7 @@ DEFINE_bool(per_batch_time, false, "print per-batch times");
 DEFINE_bool(reserve_memory, false, "whether or not to pre-reserve memory");
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  cudaProfilerStop();

--- a/gpu/perf/WriteIndex.cpp
+++ b/gpu/perf/WriteIndex.cpp
@@ -52,7 +52,7 @@ void fillAndSave(T& index, int numTrain, int num, int dim) {
 }
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // Either ivfpq or ivfflat must be set
  if ((FLAGS_ivfpq && FLAGS_ivfflat) ||

--- a/gpu/test/TestGpuIndexFlat.cpp
+++ b/gpu/test/TestGpuIndexFlat.cpp
@@ -22,7 +22,10 @@
 constexpr float kF16MaxRelErr = 0.07f;
 constexpr float kF32MaxRelErr = 6e-3f;
-void testFlat(bool useL2, bool useFloat16, int kOverride = -1) {
+void testFlat(bool useL2,
+              bool useFloat16,
+              bool useTransposed,
+              int kOverride = -1) {
  int numVecs = faiss::gpu::randVal(1000, 20000);
  int dim = faiss::gpu::randVal(50, 800);
  int numQuery = faiss::gpu::randVal(1, 512);
@@ -49,8 +52,15 @@ void testFlat(bool useL2, bool useFloat16, int kOverride = -1) {
  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();
-  faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, device, dim, useFloat16);
-  faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, device, dim, useFloat16);
+  faiss::gpu::GpuIndexFlatConfig config;
+  config.device = device;
+  config.useFloat16 = useFloat16;
+  config.storeTransposed = useTransposed;
+  faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
+  faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
  faiss::gpu::GpuIndexFlat* gpuIndex =
    useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
@@ -64,6 +74,7 @@ void testFlat(bool useL2, bool useFloat16, int kOverride = -1) {
  str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs
      << " dim " << dim
      << " useFloat16 " << useFloat16
+      << " transposed " << useTransposed
      << " numQuery " << numQuery
      << " k " << k;
@@ -79,16 +90,18 @@ void testFlat(bool useL2, bool useFloat16, int kOverride = -1) {
 }
 TEST(TestGpuIndexFlat, IP_Float32) {
-  for (int tries = 0; tries < 10; ++tries) {
+  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();
-    testFlat(false, false);
+    testFlat(false, false, false);
+    testFlat(false, false, true);
  }
 }
 TEST(TestGpuIndexFlat, L2_Float32) {
-  for (int tries = 0; tries < 10; ++tries) {
+  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();
-    testFlat(true, false);
+    testFlat(true, false, false);
+    testFlat(true, false, true);
  }
 }
@@ -96,21 +109,24 @@ TEST(TestGpuIndexFlat, L2_Float32) {
 TEST(TestGpuIndexFlat, L2_Float32_K1) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();
-    testFlat(true, false, 1);
+    testFlat(true, false, false, 1);
+    testFlat(true, false, true, 1);
  }
 }
 TEST(TestGpuIndexFlat, IP_Float16) {
-  for (int tries = 0; tries < 10; ++tries) {
+  for (int tries = 0; tries < 5; ++tries) {
     faiss::gpu::newTestSeed();
-    testFlat(false, true);
+    // Inner product, float16: exercise both the default and the
+    // transposed storage layout (third argument is useTransposed)
+    testFlat(false, true, false);
+    testFlat(false, true, true);
   }
 }
 TEST(TestGpuIndexFlat, L2_Float16) {
-  for (int tries = 0; tries < 10; ++tries) {
+  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();
-    testFlat(true, true);
+    testFlat(true, true, false);
+    testFlat(true, true, true);
  }
 }
@@ -118,7 +134,8 @@ TEST(TestGpuIndexFlat, L2_Float16) {
 TEST(TestGpuIndexFlat, L2_Float16_K1) {
  for (int tries = 0; tries < 5; ++tries) {
    faiss::gpu::newTestSeed();
-    testFlat(true, true, 1);
+    testFlat(true, true, false, 1);
+    testFlat(true, true, true, 1);
  }
 }
@@ -126,8 +143,13 @@ TEST(TestGpuIndexFlat, QueryEmpty) {
  faiss::gpu::StandardGpuResources res;
  res.noTempMemory();
+  faiss::gpu::GpuIndexFlatConfig config;
+  config.device = 0;
+  config.useFloat16 = false;
+  config.storeTransposed = false;
  int dim = 128;
-  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 0, dim, false);
+  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
  // Querying an empty index should not blow up, and just return
  // (FLT_MAX, -1)
@@ -165,7 +187,13 @@ TEST(TestGpuIndexFlat, CopyFrom) {
  // Fill with garbage values
  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
-  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, device, 2000, false);
+  faiss::gpu::GpuIndexFlatConfig config;
+  // Keep using the randomly chosen device, as the removed constructor
+  // call did
+  config.device = device;
+  config.useFloat16 = false;
+  config.storeTransposed = false;
+  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config);
  gpuIndex.copyFrom(&cpuIndex);
  EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal);
@@ -195,7 +223,13 @@ TEST(TestGpuIndexFlat, CopyTo) {
  int dim = faiss::gpu::randVal(1, 1000);
  int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
-  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, device, dim, false);
+  faiss::gpu::GpuIndexFlatConfig config;
+  config.device = device;
+  config.useFloat16 = false;
+  config.storeTransposed = false;
+  faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
  std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
  gpuIndex.add(numVecs, vecs.data());

--- a/gpu/test/test_gpu_index.py
+++ b/gpu/test/test_gpu_index.py
@@ -45,7 +45,10 @@ class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase):
        res = faiss.StandardGpuResources()
-        gt_index = faiss.GpuIndexFlatL2(res, dev_no, d, False)
+        flat_config = faiss.GpuIndexFlatConfig()
+        flat_config.device = dev_no
+        gt_index = faiss.GpuIndexFlatL2(res, d, flat_config)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

--- a/gpu/utils/Tensor-inl.cuh
+++ b/gpu/utils/Tensor-inl.cuh
@@ -11,6 +11,7 @@
 #include "../../FaissAssert.h"
 #include "DeviceUtils.h"
+#include <limits>
 namespace faiss { namespace gpu {
@@ -279,6 +280,58 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
  return true;
 }
+/// Returns a Tensor over the same data pointer whose size and stride
+/// entries are converted element-wise to NewIndexT.
+/// When NewIndexT is narrower than IndexT, an assert first checks
+/// canCastIndexType<NewIndexT>().
+/// NOTE(review): a same-width conversion (e.g. unsigned to signed) skips
+/// that check -- confirm this is intended.
+template <typename T, int Dim, bool Contig,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename NewIndexT>
+__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits>
+Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const {
+  if (sizeof(NewIndexT) < sizeof(IndexT)) {
+    assert(this->canCastIndexType<NewIndexT>());
+  }
+  // Convert every size/stride entry to the new index type
+  NewIndexT newSize[Dim];
+  NewIndexT newStride[Dim];
+  for (int i = 0; i < Dim; ++i) {
+    newSize[i] = (NewIndexT) size_[i];
+    newStride[i] = (NewIndexT) stride_[i];
+  }
+  return Tensor<T, Dim, Contig, NewIndexT, PtrTraits>(
+    data_, newSize, newStride);
+}
+/// Returns true if the largest size * stride product of this tensor
+/// does not exceed std::numeric_limits<NewIndexT>::max().
+/// For Contig tensors only dimension 0 is examined; otherwise the
+/// maximum over all dimensions is used.
+template <typename T, int Dim, bool Contig,
+          typename IndexT, template <typename U> class PtrTraits>
+template <typename NewIndexT>
+__host__ bool
+Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
+  // Offsets are accumulated in size_t, so neither index type may be
+  // wider than size_t
+  static_assert(sizeof(size_t) >= sizeof(IndexT),
+                "index size too large");
+  static_assert(sizeof(size_t) >= sizeof(NewIndexT),
+                "new index size too large");
+  // Find maximum offset that can be calculated
+  // FIXME: maybe also consider offset in bytes? multiply by sizeof(T)?
+  size_t maxOffset = 0;
+  if (Contig) {
+    maxOffset = (size_t) size_[0] * (size_t) stride_[0];
+  } else {
+    for (int i = 0; i < Dim; ++i) {
+      size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i];
+      if (curMaxOffset > maxOffset) {
+        maxOffset = curMaxOffset;
+      }
+    }
+  }
+  if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) {
+    return false;
+  }
+  return true;
+}
 template <typename T, int Dim, bool Contig,
          typename IndexT, template <typename U> class PtrTraits>
 __host__ __device__ IndexT

--- a/gpu/utils/Tensor.cuh
+++ b/gpu/utils/Tensor.cuh
@@ -158,6 +158,18 @@ class Tensor {
  template <typename U>
  __host__ __device__ bool canCastResize() const;
+  /// Attempts to cast this tensor to a tensor of a different IndexT.
+  /// Fails if size or stride entries are not representable in the new
+  /// IndexT.
+  template <typename NewIndexT>
+  __host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits>
+  castIndexType() const;
+  /// Returns true if we can castIndexType() this tensor to the new
+  /// index type
+  template <typename NewIndexT>
+  __host__ bool canCastIndexType() const;
  /// Returns a raw pointer to the start of our data.
  __host__ __device__ inline DataPtrType data() {
    return data_;
@@ -337,6 +349,27 @@ class Tensor {
  IndexT size_[Dim];
 };
+// Utilities for checking a collection of tensors
+namespace detail {
+/// Base case: an empty collection of tensors is always castable.
+template <typename IndexType>
+bool canCastIndexType() {
+  return true;
+}
+/// Recursive case: true if `arg` and every tensor in `args` report that
+/// they can be cast to IndexType.
+template <typename IndexType, typename T, typename... U>
+bool canCastIndexType(const T& arg, const U&... args) {
+  // `template` is needed because `arg` has a dependent type. IndexType
+  // cannot be deduced from the arguments, so it is forwarded explicitly.
+  return arg.template canCastIndexType<IndexType>() &&
+    detail::canCastIndexType<IndexType>(args...);
+}
+} // namespace detail
+/// Returns true if every given tensor reports, via its
+/// canCastIndexType<IndexType>() member, that it can be cast to IndexType.
+template <typename IndexType, typename... T>
+bool canCastIndexType(const T&... args) {
+  return detail::canCastIndexType<IndexType>(args...);
+}
 namespace detail {
 /// Specialization for a view of a single value (0-dimensional)

--- a/index_io.cpp
+++ b/index_io.cpp
@@ -24,16 +24,25 @@
 #include "IndexPQ.h"
 #include "IndexIVF.h"
 #include "IndexIVFPQ.h"
+#include "MetaIndexes.h"
 /*************************************************************
 * The I/O format is the content of the class. For objects that are
- * inherited, like Index, a 4-character-code indicates which child
+ * inherited, like Index, a 4-character-code (fourcc) indicates which
- * class this is an instance of.
+ * child class this is an instance of.
 *
 * In this case, the fields of the parent class are written first,
 * then the ones for the child classes. Note that this requires
 * classes to be serialized to have a constructor without parameters,
- * so that the fields can be filled in later.
+ * so that the fields can be filled in later. The default constructor
+ * should set reasonable defaults for all fields.
+ *
+ * The fourccs are assigned arbitrarily. When a class changes (fields
+ * are added or deprecated), its fourcc can be replaced. New code should
+ * still be able to read the old fourcc and fill in the new class.
+ *
+ * TODO: serialization to strings for use in Python pickle or Torch
+ * serialization.
 **************************************************************/
@@ -294,6 +303,13 @@ void write_index (const Index *idx, FILE *f) {
        write_index (idxrf->base_index, f);
        write_index (&idxrf->refine_index, f);
        WRITE1 (idxrf->k_factor);
+    } else if(const IndexIDMap * idxmap =
+              dynamic_cast<const IndexIDMap *> (idx)) {
+        uint32_t h = fourcc ("IxMp");
+        WRITE1 (h);
+        write_index_header (idxmap, f);
+        write_index (idxmap->index, f);
+        WRITEVECTOR (idxmap->id_map);
    } else {
        FAISS_ASSERT (!"don't know how to serialize this type of index");
    }
@@ -572,6 +588,13 @@ Index *read_index (FILE * f, bool try_mmap) {
        delete rf;
        READ1 (idxrf->k_factor);
        idx = idxrf;
+    } else if(h == fourcc ("IxMp")) {
+        IndexIDMap * idxmap = new IndexIDMap ();
+        read_index_header (idxmap, f);
+        idxmap->index = read_index (f);
+        idxmap->own_fields = true;
+        READVECTOR (idxmap->id_map);
+        idx = idxmap;
    } else {
        fprintf (stderr, "Index type 0x%08x not supported\n", h);
        abort ();

--- a/python/swigfaiss.py
+++ b/python/swigfaiss.py
@@ -724,6 +724,10 @@ bincode_hist = _swigfaiss.bincode_hist
 def ivec_checksum(*args):
  return _swigfaiss.ivec_checksum(*args)
 ivec_checksum = _swigfaiss.ivec_checksum
+def fvecs_maybe_subsample(*args):
+  return _swigfaiss.fvecs_maybe_subsample(*args)
+fvecs_maybe_subsample = _swigfaiss.fvecs_maybe_subsample
 METRIC_INNER_PRODUCT = _swigfaiss.METRIC_INNER_PRODUCT
 METRIC_L2 = _swigfaiss.METRIC_L2
 class Index(_object):
@@ -963,13 +967,9 @@ class LinearTransform(VectorTransform):
        except: self.this = this
    def apply_noalloc(self, *args): return _swigfaiss.LinearTransform_apply_noalloc(self, *args)
    def transform_transpose(self, *args): return _swigfaiss.LinearTransform_transform_transpose(self, *args)
-    __swig_setmethods__["max_points_per_d"] = _swigfaiss.LinearTransform_max_points_per_d_set
-    __swig_getmethods__["max_points_per_d"] = _swigfaiss.LinearTransform_max_points_per_d_get
-    if _newclass:max_points_per_d = _swig_property(_swigfaiss.LinearTransform_max_points_per_d_get, _swigfaiss.LinearTransform_max_points_per_d_set)
    __swig_setmethods__["verbose"] = _swigfaiss.LinearTransform_verbose_set
    __swig_getmethods__["verbose"] = _swigfaiss.LinearTransform_verbose_get
    if _newclass:verbose = _swig_property(_swigfaiss.LinearTransform_verbose_get, _swigfaiss.LinearTransform_verbose_set)
-    def maybe_subsample_train_set(self, *args): return _swigfaiss.LinearTransform_maybe_subsample_train_set(self, *args)
    __swig_destroy__ = _swigfaiss.delete_LinearTransform
    __del__ = lambda self : None;
 LinearTransform_swigregister = _swigfaiss.LinearTransform_swigregister
@@ -1008,6 +1008,9 @@ class PCAMatrix(LinearTransform):
    __swig_setmethods__["random_rotation"] = _swigfaiss.PCAMatrix_random_rotation_set
    __swig_getmethods__["random_rotation"] = _swigfaiss.PCAMatrix_random_rotation_get
    if _newclass:random_rotation = _swig_property(_swigfaiss.PCAMatrix_random_rotation_get, _swigfaiss.PCAMatrix_random_rotation_set)
+    __swig_setmethods__["max_points_per_d"] = _swigfaiss.PCAMatrix_max_points_per_d_set
+    __swig_getmethods__["max_points_per_d"] = _swigfaiss.PCAMatrix_max_points_per_d_get
+    if _newclass:max_points_per_d = _swig_property(_swigfaiss.PCAMatrix_max_points_per_d_get, _swigfaiss.PCAMatrix_max_points_per_d_set)
    __swig_setmethods__["balanced_bins"] = _swigfaiss.PCAMatrix_balanced_bins_set
    __swig_getmethods__["balanced_bins"] = _swigfaiss.PCAMatrix_balanced_bins_get
    if _newclass:balanced_bins = _swig_property(_swigfaiss.PCAMatrix_balanced_bins_get, _swigfaiss.PCAMatrix_balanced_bins_set)
@@ -1053,9 +1056,9 @@ class OPQMatrix(LinearTransform):
    __swig_setmethods__["niter_pq_0"] = _swigfaiss.OPQMatrix_niter_pq_0_set
    __swig_getmethods__["niter_pq_0"] = _swigfaiss.OPQMatrix_niter_pq_0_get
    if _newclass:niter_pq_0 = _swig_property(_swigfaiss.OPQMatrix_niter_pq_0_get, _swigfaiss.OPQMatrix_niter_pq_0_set)
-    __swig_setmethods__["max_points_per_d"] = _swigfaiss.OPQMatrix_max_points_per_d_set
+    __swig_setmethods__["max_train_points"] = _swigfaiss.OPQMatrix_max_train_points_set
-    __swig_getmethods__["max_points_per_d"] = _swigfaiss.OPQMatrix_max_points_per_d_get
+    __swig_getmethods__["max_train_points"] = _swigfaiss.OPQMatrix_max_train_points_get
-    if _newclass:max_points_per_d = _swig_property(_swigfaiss.OPQMatrix_max_points_per_d_get, _swigfaiss.OPQMatrix_max_points_per_d_set)
+    if _newclass:max_train_points = _swig_property(_swigfaiss.OPQMatrix_max_train_points_get, _swigfaiss.OPQMatrix_max_train_points_set)
    __swig_setmethods__["verbose"] = _swigfaiss.OPQMatrix_verbose_set
    __swig_getmethods__["verbose"] = _swigfaiss.OPQMatrix_verbose_get
    if _newclass:verbose = _swig_property(_swigfaiss.OPQMatrix_verbose_get, _swigfaiss.OPQMatrix_verbose_set)
@@ -1885,10 +1888,6 @@ class IndexIDMap(Index):
    __swig_setmethods__["id_map"] = _swigfaiss.IndexIDMap_id_map_set
    __swig_getmethods__["id_map"] = _swigfaiss.IndexIDMap_id_map_get
    if _newclass:id_map = _swig_property(_swigfaiss.IndexIDMap_id_map_get, _swigfaiss.IndexIDMap_id_map_set)
-    def __init__(self, *args): 
-        this = _swigfaiss.new_IndexIDMap(*args)
-        try: self.this.append(this)
-        except: self.this = this
    def add_with_ids(self, *args): return _swigfaiss.IndexIDMap_add_with_ids(self, *args)
    def add(self, *args): return _swigfaiss.IndexIDMap_add(self, *args)
    def search(self, *args): return _swigfaiss.IndexIDMap_search(self, *args)
@@ -1897,6 +1896,10 @@ class IndexIDMap(Index):
    def set_typename(self): return _swigfaiss.IndexIDMap_set_typename(self)
    __swig_destroy__ = _swigfaiss.delete_IndexIDMap
    __del__ = lambda self : None;
+    def __init__(self, *args): 
+        this = _swigfaiss.new_IndexIDMap(*args)
+        try: self.this.append(this)
+        except: self.this = this
 IndexIDMap_swigregister = _swigfaiss.IndexIDMap_swigregister
 IndexIDMap_swigregister(IndexIDMap)

--- a/python/swigfaiss_gpu.py
+++ b/python/swigfaiss_gpu.py
@@ -793,6 +793,10 @@ bincode_hist = _swigfaiss_gpu.bincode_hist
 def ivec_checksum(*args):
  return _swigfaiss_gpu.ivec_checksum(*args)
 ivec_checksum = _swigfaiss_gpu.ivec_checksum
+def fvecs_maybe_subsample(*args):
+  return _swigfaiss_gpu.fvecs_maybe_subsample(*args)
+fvecs_maybe_subsample = _swigfaiss_gpu.fvecs_maybe_subsample
 METRIC_INNER_PRODUCT = _swigfaiss_gpu.METRIC_INNER_PRODUCT
 METRIC_L2 = _swigfaiss_gpu.METRIC_L2
 class Index(_object):
@@ -1032,13 +1036,9 @@ class LinearTransform(VectorTransform):
        except: self.this = this
    def apply_noalloc(self, *args): return _swigfaiss_gpu.LinearTransform_apply_noalloc(self, *args)
    def transform_transpose(self, *args): return _swigfaiss_gpu.LinearTransform_transform_transpose(self, *args)
-    __swig_setmethods__["max_points_per_d"] = _swigfaiss_gpu.LinearTransform_max_points_per_d_set
-    __swig_getmethods__["max_points_per_d"] = _swigfaiss_gpu.LinearTransform_max_points_per_d_get
-    if _newclass:max_points_per_d = _swig_property(_swigfaiss_gpu.LinearTransform_max_points_per_d_get, _swigfaiss_gpu.LinearTransform_max_points_per_d_set)
    __swig_setmethods__["verbose"] = _swigfaiss_gpu.LinearTransform_verbose_set
    __swig_getmethods__["verbose"] = _swigfaiss_gpu.LinearTransform_verbose_get
    if _newclass:verbose = _swig_property(_swigfaiss_gpu.LinearTransform_verbose_get, _swigfaiss_gpu.LinearTransform_verbose_set)
-    def maybe_subsample_train_set(self, *args): return _swigfaiss_gpu.LinearTransform_maybe_subsample_train_set(self, *args)
    __swig_destroy__ = _swigfaiss_gpu.delete_LinearTransform
    __del__ = lambda self : None;
 LinearTransform_swigregister = _swigfaiss_gpu.LinearTransform_swigregister
@@ -1077,6 +1077,9 @@ class PCAMatrix(LinearTransform):
    __swig_setmethods__["random_rotation"] = _swigfaiss_gpu.PCAMatrix_random_rotation_set
    __swig_getmethods__["random_rotation"] = _swigfaiss_gpu.PCAMatrix_random_rotation_get
    if _newclass:random_rotation = _swig_property(_swigfaiss_gpu.PCAMatrix_random_rotation_get, _swigfaiss_gpu.PCAMatrix_random_rotation_set)
+    __swig_setmethods__["max_points_per_d"] = _swigfaiss_gpu.PCAMatrix_max_points_per_d_set
+    __swig_getmethods__["max_points_per_d"] = _swigfaiss_gpu.PCAMatrix_max_points_per_d_get
+    if _newclass:max_points_per_d = _swig_property(_swigfaiss_gpu.PCAMatrix_max_points_per_d_get, _swigfaiss_gpu.PCAMatrix_max_points_per_d_set)
    __swig_setmethods__["balanced_bins"] = _swigfaiss_gpu.PCAMatrix_balanced_bins_set
    __swig_getmethods__["balanced_bins"] = _swigfaiss_gpu.PCAMatrix_balanced_bins_get
    if _newclass:balanced_bins = _swig_property(_swigfaiss_gpu.PCAMatrix_balanced_bins_get, _swigfaiss_gpu.PCAMatrix_balanced_bins_set)
@@ -1122,9 +1125,9 @@ class OPQMatrix(LinearTransform):
    __swig_setmethods__["niter_pq_0"] = _swigfaiss_gpu.OPQMatrix_niter_pq_0_set
    __swig_getmethods__["niter_pq_0"] = _swigfaiss_gpu.OPQMatrix_niter_pq_0_get
    if _newclass:niter_pq_0 = _swig_property(_swigfaiss_gpu.OPQMatrix_niter_pq_0_get, _swigfaiss_gpu.OPQMatrix_niter_pq_0_set)
-    __swig_setmethods__["max_points_per_d"] = _swigfaiss_gpu.OPQMatrix_max_points_per_d_set
+    __swig_setmethods__["max_train_points"] = _swigfaiss_gpu.OPQMatrix_max_train_points_set
-    __swig_getmethods__["max_points_per_d"] = _swigfaiss_gpu.OPQMatrix_max_points_per_d_get
+    __swig_getmethods__["max_train_points"] = _swigfaiss_gpu.OPQMatrix_max_train_points_get
-    if _newclass:max_points_per_d = _swig_property(_swigfaiss_gpu.OPQMatrix_max_points_per_d_get, _swigfaiss_gpu.OPQMatrix_max_points_per_d_set)
+    if _newclass:max_train_points = _swig_property(_swigfaiss_gpu.OPQMatrix_max_train_points_get, _swigfaiss_gpu.OPQMatrix_max_train_points_set)
    __swig_setmethods__["verbose"] = _swigfaiss_gpu.OPQMatrix_verbose_set
    __swig_getmethods__["verbose"] = _swigfaiss_gpu.OPQMatrix_verbose_get
    if _newclass:verbose = _swig_property(_swigfaiss_gpu.OPQMatrix_verbose_get, _swigfaiss_gpu.OPQMatrix_verbose_set)
@@ -1954,10 +1957,6 @@ class IndexIDMap(Index):
    __swig_setmethods__["id_map"] = _swigfaiss_gpu.IndexIDMap_id_map_set
    __swig_getmethods__["id_map"] = _swigfaiss_gpu.IndexIDMap_id_map_get
    if _newclass:id_map = _swig_property(_swigfaiss_gpu.IndexIDMap_id_map_get, _swigfaiss_gpu.IndexIDMap_id_map_set)
-    def __init__(self, *args): 
-        this = _swigfaiss_gpu.new_IndexIDMap(*args)
-        try: self.this.append(this)
-        except: self.this = this
    def add_with_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_add_with_ids(self, *args)
    def add(self, *args): return _swigfaiss_gpu.IndexIDMap_add(self, *args)
    def search(self, *args): return _swigfaiss_gpu.IndexIDMap_search(self, *args)
@@ -1966,6 +1965,10 @@ class IndexIDMap(Index):
    def set_typename(self): return _swigfaiss_gpu.IndexIDMap_set_typename(self)
    __swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap
    __del__ = lambda self : None;
+    def __init__(self, *args): 
+        this = _swigfaiss_gpu.new_IndexIDMap(*args)
+        try: self.this.append(this)
+        except: self.this = this
 IndexIDMap_swigregister = _swigfaiss_gpu.IndexIDMap_swigregister
 IndexIDMap_swigregister(IndexIDMap)
@@ -2064,6 +2067,30 @@ class GpuIndex(Index):
 GpuIndex_swigregister = _swigfaiss_gpu.GpuIndex_swigregister
 GpuIndex_swigregister(GpuIndex)
+class GpuIndexFlatConfig(_object):
+    __swig_setmethods__ = {}
+    __setattr__ = lambda self, name, value: _swig_setattr(self, GpuIndexFlatConfig, name, value)
+    __swig_getmethods__ = {}
+    __getattr__ = lambda self, name: _swig_getattr(self, GpuIndexFlatConfig, name)
+    __repr__ = _swig_repr
+    def __init__(self): 
+        this = _swigfaiss_gpu.new_GpuIndexFlatConfig()
+        try: self.this.append(this)
+        except: self.this = this
+    __swig_setmethods__["device"] = _swigfaiss_gpu.GpuIndexFlatConfig_device_set
+    __swig_getmethods__["device"] = _swigfaiss_gpu.GpuIndexFlatConfig_device_get
+    if _newclass:device = _swig_property(_swigfaiss_gpu.GpuIndexFlatConfig_device_get, _swigfaiss_gpu.GpuIndexFlatConfig_device_set)
+    __swig_setmethods__["useFloat16"] = _swigfaiss_gpu.GpuIndexFlatConfig_useFloat16_set
+    __swig_getmethods__["useFloat16"] = _swigfaiss_gpu.GpuIndexFlatConfig_useFloat16_get
+    if _newclass:useFloat16 = _swig_property(_swigfaiss_gpu.GpuIndexFlatConfig_useFloat16_get, _swigfaiss_gpu.GpuIndexFlatConfig_useFloat16_set)
+    __swig_setmethods__["storeTransposed"] = _swigfaiss_gpu.GpuIndexFlatConfig_storeTransposed_set
+    __swig_getmethods__["storeTransposed"] = _swigfaiss_gpu.GpuIndexFlatConfig_storeTransposed_get
+    if _newclass:storeTransposed = _swig_property(_swigfaiss_gpu.GpuIndexFlatConfig_storeTransposed_get, _swigfaiss_gpu.GpuIndexFlatConfig_storeTransposed_set)
+    __swig_destroy__ = _swigfaiss_gpu.delete_GpuIndexFlatConfig
+    __del__ = lambda self : None;
+GpuIndexFlatConfig_swigregister = _swigfaiss_gpu.GpuIndexFlatConfig_swigregister
+GpuIndexFlatConfig_swigregister(GpuIndexFlatConfig)
 class GpuIndexFlat(GpuIndex):
    __swig_setmethods__ = {}
    for _s in [GpuIndex]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
@@ -2521,6 +2548,9 @@ class GpuClonerOptions(_object):
    __swig_setmethods__["reserveVecs"] = _swigfaiss_gpu.GpuClonerOptions_reserveVecs_set
    __swig_getmethods__["reserveVecs"] = _swigfaiss_gpu.GpuClonerOptions_reserveVecs_get
    if _newclass:reserveVecs = _swig_property(_swigfaiss_gpu.GpuClonerOptions_reserveVecs_get, _swigfaiss_gpu.GpuClonerOptions_reserveVecs_set)
+    __swig_setmethods__["storeTransposed"] = _swigfaiss_gpu.GpuClonerOptions_storeTransposed_set
+    __swig_getmethods__["storeTransposed"] = _swigfaiss_gpu.GpuClonerOptions_storeTransposed_get
+    if _newclass:storeTransposed = _swig_property(_swigfaiss_gpu.GpuClonerOptions_storeTransposed_get, _swigfaiss_gpu.GpuClonerOptions_storeTransposed_set)
    __swig_setmethods__["verbose"] = _swigfaiss_gpu.GpuClonerOptions_verbose_set
    __swig_getmethods__["verbose"] = _swigfaiss_gpu.GpuClonerOptions_verbose_get
    if _newclass:verbose = _swig_property(_swigfaiss_gpu.GpuClonerOptions_verbose_get, _swigfaiss_gpu.GpuClonerOptions_verbose_set)

--- a/python/swigfaiss_gpu_wrap.cxx
+++ b/python/swigfaiss_gpu_wrap.cxx
--- a/python/swigfaiss_wrap.cxx
+++ b/python/swigfaiss_wrap.cxx
--- a/utils.cpp
+++ b/utils.cpp
@@ -1790,6 +1790,28 @@ int fvec_madd_and_argmin (size_t n, const float *a,
+// Subsamples a training set that holds more than nmax vectors.
+//
+// d        dimension of each vector
+// n        in: number of vectors in x; out: set to nmax when
+//          subsampling happens, left unchanged otherwise
+// nmax     maximum number of vectors to keep
+// x        input vectors, read as *n rows of d floats
+// verbose  print a message when subsampling happens
+// seed     seed passed to rand_perm
+//
+// Returns x itself if *n <= nmax. Otherwise returns a buffer allocated
+// with new[] that holds nmax vectors of dimension d; the caller owns it
+// and must release it with delete[].
+const float *fvecs_maybe_subsample (
+          size_t d, size_t *n, size_t nmax, const float *x,
+          bool verbose, long seed)
+{
+    if (*n <= nmax) return x; // nothing to do
+    size_t n2 = nmax;
+    if (verbose) {
+        // %zu is the printf conversion that matches size_t
+        printf ("  Input training set too big (max size is %zu), sampling "
+                "%zu / %zu vectors\n", nmax, n2, *n);
+    }
+    // NOTE(review): indices are stored as int, so this presumably
+    // assumes *n fits in an int -- confirm against rand_perm
+    std::vector<int> subset (*n);
+    rand_perm (subset.data (), *n, seed);
+    float *x_subset = new float[n2 * d];
+    // Copy the first n2 entries of subset, one d-float row at a time
+    for (size_t i = 0; i < n2; i++)
+        memcpy (&x_subset[i * d],
+                &x[subset[i] * size_t(d)],
+                sizeof (x[0]) * d);
+    *n = n2;
+    return x_subset;
+}
 } // namespace faiss
--- a/utils.h
+++ b/utils.h