Commit 00c288c0 authored by matthijs

added benchmarking scripts

parent a67190c6
......@@ -166,8 +166,8 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
assign, d, k, nx);
if (verbose) {
printf (" Iteration %d (%5g s, search %5g s): "
"objective=%g imbalance=%g nsplit=%d \r",
printf (" Iteration %d (%.2f s, search %.2f s): "
"objective=%g imbalance=%.3f nsplit=%d \r",
i, (getmillisecs() - t0) / 1000.0,
t_search_tot / 1000,
err, imbalance_factor (nx, k, assign),
......
......@@ -7,15 +7,19 @@
.SUFFIXES: .cpp .o
MAKEFILE_INC=makefile.inc
-include $(MAKEFILE_INC)
LIBNAME=libfaiss
all: .env_ok $(LIBNAME).a tests/demo_ivfpq_indexing
py: _swigfaiss.so
#############################
# Various
......
......@@ -211,6 +211,7 @@ void translate_labels (long n, idx_t *labels, long translation)
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class C>
void merge_tables (long n, long k, long nshard,
float *distances, idx_t *labels,
......@@ -218,22 +219,55 @@ void merge_tables (long n, long k, long nshard,
idx_t *all_labels,
const long *translations)
{
long shard_stride = n * k;
#pragma omp parallel for
for (long i = 0; i < n; i++) {
float *D = distances + i * k;
idx_t *I = labels + i * k;
const float *Ds = all_distances + i * k;
idx_t *Is = all_labels + i * k;
translate_labels (k, Is, translations[0]);
heap_heapify<C>(k, D, I, Ds, Is, k);
for (int s = 1; s < nshard; s++) {
Ds += shard_stride;
Is += shard_stride;
translate_labels (k, Is, translations[s]);
heap_addn<C> (k, D, I, Ds, Is, k);
if(k == 0) {
return;
}
long stride = n * k;
#pragma omp parallel
{
std::vector<int> buf (2 * nshard);
int * pointer = buf.data();
int * shard_ids = pointer + nshard;
std::vector<float> buf2 (nshard);
float * heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const float *D_in = all_distances + i * k;
const idx_t *I_in = all_labels + i * k;
int heap_size = 0;
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s], s);
}
float *D = distances + i * k;
idx_t *I = labels + i * k;
for (int j = 0; j < k; j++) {
if (heap_size == 0) {
I[j] = -1;
D[j] = C::neutral();
} else {
// pop best element
int s = shard_ids[0];
int & p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
heap_pop<C> (heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s + p], s);
}
}
}
heap_reorder<C>(k, D, I);
}
}
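
The rewritten merge_tables is a per-query k-way merge: a small auxiliary heap holds one pending candidate per shard (that shard's next-best result), the best candidate is popped into the output, and the shard's cursor advances and re-inserts its next entry. A minimal sketch of the same idea in Python, assuming each shard's result list is already sorted best-first and uses -1 as a missing-id sentinel; the names below are illustrative, not the Faiss heap API:

import heapq

def merge_shard_results(shard_dists, shard_ids, k, translations):
    """k-way merge of per-shard (distance, id) result lists for one query.
    Lower distance = better (the L2 / CMin case); each input is sorted."""
    heap = []                          # (distance, shard) pairs, one per shard
    cursor = [0] * len(shard_dists)
    for s, (dists, ids) in enumerate(zip(shard_dists, shard_ids)):
        if ids and ids[0] >= 0:
            heapq.heappush(heap, (dists[0], s))

    out_d, out_i = [], []
    while len(out_d) < k:
        if not heap:                   # ran out of candidates: pad with sentinels
            out_d.append(float('inf'))
            out_i.append(-1)
            continue
        d, s = heapq.heappop(heap)
        p = cursor[s]
        out_d.append(d)
        out_i.append(shard_ids[s][p] + translations[s])   # apply label translation
        cursor[s] = p + 1
        if p + 1 < len(shard_ids[s]) and shard_ids[s][p + 1] >= 0:
            heapq.heappush(heap, (shard_dists[s][p + 1], s))
    return out_d, out_i

# Two shards, k = 3; shard 1's ids are offset by its translation (100).
print(merge_shard_results(
    [[0.1, 0.4, 0.9], [0.2, 0.3, -1.0]],
    [[5, 7, 2],       [1, 4, -1]],
    k=3, translations=[0, 100]))
# -> ([0.1, 0.2, 0.3], [5, 101, 104])
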
......@@ -360,6 +394,9 @@ void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
}
void IndexShards::reset ()
{
for (int i = 0; i < shard_indexes.size(); i++) {
......@@ -433,15 +470,14 @@ void IndexShards::search (
}
if (metric_type == METRIC_L2) {
merge_tables< CMax<float, idx_t> > (
merge_tables< CMin<float, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
} else {
merge_tables< CMin<float, idx_t> > (
merge_tables< CMax<float, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
}
delete [] all_distances;
delete [] all_labels;
}
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the CC-by-NC license found in the
# LICENSE file in the root directory of this source tree.
# -*- makefile -*-
# Tested on macOS Sierra (10.12.2) with llvm installed using Homebrew (https://brew.sh)
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the CC-by-NC license found in the
# LICENSE file in the root directory of this source tree.
# -*- makefile -*-
# tested on Mac OS X 10.12.2 Sierra with additional software installed via macports
......
......@@ -15,6 +15,9 @@ import types
import sys
import pdb
# we import * so that the symbol X can be accessed as faiss.X
try:
from swigfaiss_gpu import *
except ImportError as e:
......@@ -29,6 +32,9 @@ except ImportError as e:
##################################################################
# The functions below add or replace some methods for classes
# This makes it possible to pass in numpy arrays directly.
# The original C++ versions of the methods are kept, suffixed with _c.
##################################################################
def replace_method(the_class, name, replacement):
......@@ -49,6 +55,32 @@ def handle_Clustering():
handle_Clustering()
def handle_ProductQuantizer():
def replacement_train(self, x):
n, d = x.shape
assert d == self.d
self.train_c(n, swig_ptr(x))
def replacement_compute_codes(self, x):
n, d = x.shape
assert d == self.d
codes = np.empty((n, self.code_size), dtype='uint8')
self.compute_codes_c(swig_ptr(x), swig_ptr(codes), n)
return codes
def replacement_decode(self, codes):
n, cs = codes.shape
assert cs == self.code_size
x = np.empty((n, self.d), dtype='float32')
self.decode_c(swig_ptr(codes), swig_ptr(x), n)
return x
replace_method(ProductQuantizer, 'train', replacement_train)
replace_method(ProductQuantizer, 'compute_codes', replacement_compute_codes)
replace_method(ProductQuantizer, 'decode', replacement_decode)
handle_ProductQuantizer()
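
With these wrappers installed, a ProductQuantizer can be driven directly from numpy arrays, mirroring the unit test added further down in this commit. A minimal usage sketch (the dimension, sub-quantizer count and data are illustrative):

import numpy as np
import faiss

d, M, nbits = 64, 4, 8             # 4 sub-quantizers of 8 bits each
pq = faiss.ProductQuantizer(d, M, nbits)

x = np.random.rand(1000, d).astype('float32')
pq.train(x)                        # wrapped: accepts an (n, d) float32 array
codes = pq.compute_codes(x)        # (n, code_size) uint8 codes
x_rec = pq.decode(codes)           # approximate reconstruction, (n, d) float32

print(codes.shape, ((x - x_rec) ** 2).sum())
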
def handle_Index(the_class):
......@@ -153,6 +185,8 @@ for symbol in dir(this_module):
handle_ParameterSpace(the_class)
def vector_float_to_array(v):
a = np.empty(v.size(), dtype='float32')
memcpy(swig_ptr(a), v.data(), 4 * v.size())
......@@ -241,7 +275,8 @@ def eval_intersection(I1, I2):
k1, k2 = I1.shape[1], I2.shape[1]
ninter = 0
for i in range(n):
ninter += ranklist_intersection_size(k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
ninter += ranklist_intersection_size(
k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
return ninter
......
......@@ -77,6 +77,7 @@ IVFBase::reset() {
deviceListDataPointers_.clear();
deviceListIndexPointers_.clear();
deviceListLengths_.clear();
listOffsetToUserIndex_.clear();
for (size_t i = 0; i < numLists_; ++i) {
deviceListData_.emplace_back(
......
......@@ -85,6 +85,7 @@ IVFPQ::isSupportedPQCodeLength(int size) {
case 48:
case 56: // only supported with float16
case 64: // only supported with float16
case 96: // only supported with float16
return true;
default:
return false;
......
......@@ -485,7 +485,7 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
CODE_DISTANCE(10);
break;
case 12:
CODE_DISTANCE(10);
CODE_DISTANCE(12);
break;
case 16:
CODE_DISTANCE(16);
......
......@@ -174,20 +174,14 @@ struct LoadCode32<24> {
unsigned char* p,
int offset) {
p += offset * 24;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
}
};
......@@ -239,28 +233,18 @@ struct LoadCode32<40> {
unsigned char* p,
int offset) {
p += offset * 40;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
"=r"(code32[8]), "=r"(code32[9]) : "l"(p));
}
};
......@@ -270,32 +254,17 @@ struct LoadCode32<48> {
unsigned char* p,
int offset) {
p += offset * 48;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
"=r"(code32[10]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
"=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(code32[0]), "=r"(code32[1]),
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
"=r"(code32[4]), "=r"(code32[5]),
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
"=r"(code32[8]), "=r"(code32[9]),
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
}
};
......@@ -305,36 +274,22 @@ struct LoadCode32<56> {
unsigned char* p,
int offset) {
p += offset * 56;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
"=r"(code32[10]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
"=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 48];" :
"=r"(code32[12]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 52];" :
"=r"(code32[13]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
"=r"(code32[8]), "=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 40];" :
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 48];" :
"=r"(code32[12]), "=r"(code32[13]) : "l"(p));
}
};
......@@ -361,5 +316,34 @@ struct LoadCode32<64> {
}
};
template<>
struct LoadCode32<96> {
static inline __device__ void load(unsigned int code32[24],
unsigned char* p,
int offset) {
p += offset * 96;
// FIXME: this is a non-coalesced load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(code32[0]), "=r"(code32[1]),
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
"=r"(code32[4]), "=r"(code32[5]),
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
"=r"(code32[8]), "=r"(code32[9]),
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
"=r"(code32[12]), "=r"(code32[13]),
"=r"(code32[14]), "=r"(code32[15]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 64];" :
"=r"(code32[16]), "=r"(code32[17]),
"=r"(code32[18]), "=r"(code32[19]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 80];" :
"=r"(code32[20]), "=r"(code32[21]),
"=r"(code32[22]), "=r"(code32[23]) : "l"(p));
}
};
} } // namespace
......@@ -358,6 +358,9 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
......
......@@ -343,6 +343,9 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
......
......@@ -22,7 +22,11 @@
void pickEncoding(int& codes, int& dim) {
std::vector<int> codeSizes{3, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64};
std::vector<int> codeSizes{
3, 4, 8, 12, 16, 20, 24,
28, 32, 40, 48, 56, 64, 96
};
std::vector<int> dimSizes{4, 8, 16, 32};
codes = codeSizes[faiss::gpu::randVal(0, codeSizes.size() - 1)];
......@@ -55,8 +59,8 @@ struct Options {
faiss::gpu::INDICES_CPU,
faiss::gpu::INDICES_32_BIT,
faiss::gpu::INDICES_64_BIT});
if (codes == 64) {
// 64x8 lookup can only fit using float16
if (codes > 48) {
// large codes can only fit using float16
useFloat16 = true;
} else {
useFloat16 = faiss::gpu::randBool();
......
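
The switch from codes == 64 to codes > 48 reflects that the larger code sizes (56, 64 and now 96) are only supported with float16 lookup tables. A hypothetical back-of-envelope, assuming the per-query distance lookup table (one 256-entry row per sub-quantizer) must fit in roughly 48 KB of shared memory; the actual limits depend on the kernel and GPU architecture:

# Hypothetical sizing sketch: 256-entry distance table per sub-quantizer.
def lookup_table_kb(num_subquantizers, bytes_per_entry):
    return num_subquantizers * 256 * bytes_per_entry / 1024.0

for codes in (48, 64, 96):
    print(codes,
          "fp32:", lookup_table_kb(codes, 4), "KB",
          "fp16:", lookup_table_kb(codes, 2), "KB")
# 48 fp32: 48.0 KB  fp16: 24.0 KB
# 64 fp32: 64.0 KB  fp16: 32.0 KB   -> needs float16
# 96 fp32: 96.0 KB  fp16: 48.0 KB   -> needs float16
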
......@@ -10,6 +10,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "Float16.cuh"
#include "nvidia/fp16_emu.cuh"
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
......@@ -48,67 +49,10 @@ void runConvertToFloat32(float* out,
in, in + num, out, HalfToFloat());
}
// FIXME: replace
/*
Copyright (c) 2015, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
half hostFloat2Half(float a) {
uint32_t ia;
uint16_t ir;
memcpy(&ia, &a, sizeof(float));
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
} else {
ir = 0x7fff; /* canonical NaN */
}
} else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
} else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
} else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
half ret;
memcpy(&ret, &ir, sizeof(half));
return ret;
half h;
h.x = cpu_float2half_rn(a).x;
return h;
}
} } // namespace
......
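
The hand-written float-to-half conversion (derived from Norbert Juffa's code) is replaced by cpu_float2half_rn from the bundled fp16_emu helpers, which performs the same IEEE-754 round-to-nearest-even conversion on the host. As an illustration only (not the code path used here), numpy's float32-to-float16 cast applies the same rounding and can serve as a reference:

import numpy as np

def float32_to_half_bits(values):
    """Round-to-nearest-even float32 -> float16, returned as raw 16-bit patterns.
    Illustrative stand-in for a host-side float2half conversion."""
    halves = np.asarray(values, dtype=np.float32).astype(np.float16)
    return halves.view(np.uint16)

bits = float32_to_half_bits([1.0, 65504.0, 1e-8, float('inf')])
print([hex(int(b)) for b in bits])   # ['0x3c00', '0x7bff', '0x0', '0x7c00']
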
......@@ -58,3 +58,22 @@ class TestPCA(testutil.BaseFacebookTestCase):
for o in column_norm2:
self.assertGreater(prev, o)
prev = o
class TestProductQuantizer(testutil.BaseFacebookTestCase):
def test_pq(self):
d = 64
n = 1000
cs = 4
np.random.seed(123)
x = np.random.random(size=(n, d)).astype('float32')
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(x)
codes = pq.compute_codes(x)
x2 = pq.decode(codes)
diff = ((x - x2)**2).sum()
# print "diff=", diff
# diff= 1807.98
self.assertGreater(2500, diff)
......@@ -16,7 +16,6 @@
#include <cassert>
#include <cstring>
#include <cmath>
#include <cmath>
#include <smmintrin.h>
#include <mmintrin.h>
......@@ -99,8 +98,8 @@ size_t get_mem_usage_kb ()
size_t get_mem_usage_kb ()
{
fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
return 0;
fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
return 0;
}
#endif
......@@ -173,13 +172,13 @@ int rand_r(unsigned *seed);
RandomGenerator::RandomGenerator (long seed)
{
rand_state = seed;
rand_state = seed;
}
RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
rand_state = other.rand_state;
rand_state = other.rand_state;
}
......@@ -208,7 +207,7 @@ int RandomGenerator::rand_int (int max)
float RandomGenerator::rand_float ()
{
return rand_int() / float(1 << 31);
return rand_int() / float(1L << 31);
}
double RandomGenerator::rand_double ()
......@@ -233,7 +232,7 @@ void float_rand (float * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -255,7 +254,7 @@ void float_randn (float * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -292,7 +291,7 @@ void long_rand (long * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -329,7 +328,7 @@ void byte_rand (uint8_t * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -1278,7 +1277,9 @@ void pairwise_L2sqr (long d,
* Kmeans subroutine
***************************************************************************/
#define EPS 1e-7
// a bit above machine epsilon for float16
#define EPS (1 / 1024.)
/* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
......@@ -1347,12 +1348,12 @@ int km_update_centroids (const float * x,
/* small symmetric pertubation. Much better than */
for (size_t j = 0; j < d; j++)
if (j % 2 == 0) {
centroids[ci * d + j] += EPS;
centroids[cj * d + j] -= EPS;
centroids[ci * d + j] *= 1 + EPS;
centroids[cj * d + j] *= 1 - EPS;
}
else {
centroids[ci * d + j] -= EPS;
centroids[cj * d + j] += EPS;
centroids[ci * d + j] *= 1 + EPS;
centroids[cj * d + j] *= 1 - EPS;
}
/* assume even split of the cluster */
......
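
The split perturbation changes from an absolute offset (1e-7) to a relative one (1/1024, roughly the spacing of float16 values near 1), so the two copies of a split centroid remain distinct even at reduced precision. A small sketch of why the absolute offset can be lost, assuming the concern is float16 round-off as the new comment suggests:

import numpy as np

c = np.float16(1.0)

# An absolute perturbation of 1e-7 is far below float16 resolution near 1.0,
# so the two "split" copies collapse back onto the same value.
print(np.float16(c + np.float32(1e-7)) == c)        # True: perturbation lost

# A relative perturbation of 1/1024 is about one ulp at 1.0 and survives.
eps = np.float32(1 / 1024.)
print(np.float16(np.float32(c) * (1 + eps)) == c)   # False: copies stay distinct
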