Commit 00c288c0 authored by matthijs

added benchmarking scripts

parent a67190c6
......@@ -166,8 +166,8 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
assign, d, k, nx);
if (verbose) {
printf (" Iteration %d (%5g s, search %5g s): "
"objective=%g imbalance=%g nsplit=%d \r",
printf (" Iteration %d (%.2f s, search %.2f s): "
"objective=%g imbalance=%.3f nsplit=%d \r",
i, (getmillisecs() - t0) / 1000.0,
t_search_tot / 1000,
err, imbalance_factor (nx, k, assign),
......
......@@ -7,15 +7,19 @@
.SUFFIXES: .cpp .o
MAKEFILE_INC=makefile.inc
-include $(MAKEFILE_INC)
LIBNAME=libfaiss
all: .env_ok $(LIBNAME).a tests/demo_ivfpq_indexing
py: _swigfaiss.so
#############################
# Various
......
......@@ -211,6 +211,7 @@ void translate_labels (long n, idx_t *labels, long translation)
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class C>
void merge_tables (long n, long k, long nshard,
float *distances, idx_t *labels,
......@@ -218,22 +219,55 @@ void merge_tables (long n, long k, long nshard,
idx_t *all_labels,
const long *translations)
{
long shard_stride = n * k;
#pragma omp parallel for
for (long i = 0; i < n; i++) {
float *D = distances + i * k;
idx_t *I = labels + i * k;
const float *Ds = all_distances + i * k;
idx_t *Is = all_labels + i * k;
translate_labels (k, Is, translations[0]);
heap_heapify<C>(k, D, I, Ds, Is, k);
for (int s = 1; s < nshard; s++) {
Ds += shard_stride;
Is += shard_stride;
translate_labels (k, Is, translations[s]);
heap_addn<C> (k, D, I, Ds, Is, k);
if(k == 0) {
return;
}
long stride = n * k;
#pragma omp parallel
{
std::vector<int> buf (2 * nshard);
int * pointer = buf.data();
int * shard_ids = pointer + nshard;
std::vector<float> buf2 (nshard);
float * heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const float *D_in = all_distances + i * k;
const idx_t *I_in = all_labels + i * k;
int heap_size = 0;
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s], s);
}
float *D = distances + i * k;
idx_t *I = labels + i * k;
for (int j = 0; j < k; j++) {
if (heap_size == 0) {
I[j] = -1;
D[j] = C::neutral();
} else {
// pop best element
int s = shard_ids[0];
int & p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
heap_pop<C> (heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0)
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s + p], s);
}
}
}
heap_reorder<C>(k, D, I);
}
}
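
The rewritten merge_tables is a per-query k-way merge: a small auxiliary heap holds one pending candidate per shard (that shard's next-best result), the best candidate is popped into the output, and the shard's cursor advances and re-inserts its next entry. A minimal sketch of the same idea in Python, assuming each shard's result list is already sorted best-first and uses -1 as a missing-id sentinel; the names below are illustrative, not the Faiss heap API:

import heapq

def merge_shard_results(shard_dists, shard_ids, k, translations):
    """k-way merge of per-shard (distance, id) result lists for one query.
    Lower distance = better (the L2 / CMin case); each input is sorted."""
    heap = []                          # (distance, shard) pairs, one per shard
    cursor = [0] * len(shard_dists)
    for s, (dists, ids) in enumerate(zip(shard_dists, shard_ids)):
        if ids and ids[0] >= 0:
            heapq.heappush(heap, (dists[0], s))

    out_d, out_i = [], []
    while len(out_d) < k:
        if not heap:                   # ran out of candidates: pad with sentinels
            out_d.append(float('inf'))
            out_i.append(-1)
            continue
        d, s = heapq.heappop(heap)
        p = cursor[s]
        out_d.append(d)
        out_i.append(shard_ids[s][p] + translations[s])   # apply label translation
        cursor[s] = p + 1
        if p + 1 < len(shard_ids[s]) and shard_ids[s][p + 1] >= 0:
            heapq.heappush(heap, (shard_dists[s][p + 1], s))
    return out_d, out_i

# Two shards, k = 3; shard 1's ids are offset by its translation (100).
print(merge_shard_results(
    [[0.1, 0.4, 0.9], [0.2, 0.3, -1.0]],
    [[5, 7, 2],       [1, 4, -1]],
    k=3, translations=[0, 100]))
# -> ([0.1, 0.2, 0.3], [5, 101, 104])
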
......@@ -360,6 +394,9 @@ void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
}
void IndexShards::reset ()
{
for (int i = 0; i < shard_indexes.size(); i++) {
......@@ -433,15 +470,14 @@ void IndexShards::search (
}
if (metric_type == METRIC_L2) {
merge_tables< CMax<float, idx_t> > (
merge_tables< CMin<float, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
} else {
merge_tables< CMin<float, idx_t> > (
merge_tables< CMax<float, int> > (
n, k, nshard, distances, labels,
all_distances, all_labels, translations.data ());
}
delete [] all_distances;
delete [] all_labels;
}
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the CC-by-NC license found in the
# LICENSE file in the root directory of this source tree.
# -*- makefile -*-
# Tested on macOS Sierra (10.12.2) with llvm installed using Homebrew (https://brew.sh)
......
# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the CC-by-NC license found in the
# LICENSE file in the root directory of this source tree.
# -*- makefile -*-
# tested on Mac OS X 10.12.2 Sierra with additional software installed via macports
......
......@@ -15,6 +15,9 @@ import types
import sys
import pdb
# we import * so that the symbol X can be accessed as faiss.X
try:
from swigfaiss_gpu import *
except ImportError as e:
......@@ -29,6 +32,9 @@ except ImportError as e:
##################################################################
# The functions below add or replace some methods for classes
# This makes it possible to pass in numpy arrays directly.
# The original C++ versions of the methods are kept, suffixed with _c.
##################################################################
def replace_method(the_class, name, replacement):
......@@ -49,6 +55,32 @@ def handle_Clustering():
handle_Clustering()
def handle_ProductQuantizer():
def replacement_train(self, x):
n, d = x.shape
assert d == self.d
self.train_c(n, swig_ptr(x))
def replacement_compute_codes(self, x):
n, d = x.shape
assert d == self.d
codes = np.empty((n, self.code_size), dtype='uint8')
self.compute_codes_c(swig_ptr(x), swig_ptr(codes), n)
return codes
def replacement_decode(self, codes):
n, cs = codes.shape
assert cs == self.code_size
x = np.empty((n, self.d), dtype='float32')
self.decode_c(swig_ptr(codes), swig_ptr(x), n)
return x
replace_method(ProductQuantizer, 'train', replacement_train)
replace_method(ProductQuantizer, 'compute_codes', replacement_compute_codes)
replace_method(ProductQuantizer, 'decode', replacement_decode)
handle_ProductQuantizer()
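
With these wrappers installed, a ProductQuantizer can be driven directly from numpy arrays, mirroring the unit test added further down in this commit. A minimal usage sketch (the dimension, sub-quantizer count and data are illustrative):

import numpy as np
import faiss

d, M, nbits = 64, 4, 8             # 4 sub-quantizers of 8 bits each
pq = faiss.ProductQuantizer(d, M, nbits)

x = np.random.rand(1000, d).astype('float32')
pq.train(x)                        # wrapped: accepts an (n, d) float32 array
codes = pq.compute_codes(x)        # (n, code_size) uint8 codes
x_rec = pq.decode(codes)           # approximate reconstruction, (n, d) float32

print(codes.shape, ((x - x_rec) ** 2).sum())
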
def handle_Index(the_class):
......@@ -153,6 +185,8 @@ for symbol in dir(this_module):
handle_ParameterSpace(the_class)
def vector_float_to_array(v):
a = np.empty(v.size(), dtype='float32')
memcpy(swig_ptr(a), v.data(), 4 * v.size())
......@@ -241,7 +275,8 @@ def eval_intersection(I1, I2):
k1, k2 = I1.shape[1], I2.shape[1]
ninter = 0
for i in range(n):
ninter += ranklist_intersection_size(k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
ninter += ranklist_intersection_size(
k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
return ninter
......
......@@ -77,6 +77,7 @@ IVFBase::reset() {
deviceListDataPointers_.clear();
deviceListIndexPointers_.clear();
deviceListLengths_.clear();
listOffsetToUserIndex_.clear();
for (size_t i = 0; i < numLists_; ++i) {
deviceListData_.emplace_back(
......
......@@ -85,6 +85,7 @@ IVFPQ::isSupportedPQCodeLength(int size) {
case 48:
case 56: // only supported with float16
case 64: // only supported with float16
case 96: // only supported with float16
return true;
default:
return false;
......
......@@ -485,7 +485,7 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
CODE_DISTANCE(10);
break;
case 12:
CODE_DISTANCE(10);
CODE_DISTANCE(12);
break;
case 16:
CODE_DISTANCE(16);
......
......@@ -174,20 +174,14 @@ struct LoadCode32<24> {
unsigned char* p,
int offset) {
p += offset * 24;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
}
};
......@@ -239,28 +233,18 @@ struct LoadCode32<40> {
unsigned char* p,
int offset) {
p += offset * 40;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
"=r"(code32[8]), "=r"(code32[9]) : "l"(p));
}
};
......@@ -270,32 +254,17 @@ struct LoadCode32<48> {
unsigned char* p,
int offset) {
p += offset * 48;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
"=r"(code32[10]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
"=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(code32[0]), "=r"(code32[1]),
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
"=r"(code32[4]), "=r"(code32[5]),
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
"=r"(code32[8]), "=r"(code32[9]),
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
}
};
......@@ -305,36 +274,22 @@ struct LoadCode32<56> {
unsigned char* p,
int offset) {
p += offset * 56;
// FIXME: this is a non-coalesced, unaligned, non-vectorized load
// FIXME: this is a non-coalesced, unaligned, 2-vectorized load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.u32 {%0}, [%1 + 0];" :
"=r"(code32[0]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 4];" :
"=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 8];" :
"=r"(code32[2]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 12];" :
"=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 16];" :
"=r"(code32[4]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 20];" :
"=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 24];" :
"=r"(code32[6]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 28];" :
"=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 32];" :
"=r"(code32[8]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 36];" :
"=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 40];" :
"=r"(code32[10]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 44];" :
"=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 48];" :
"=r"(code32[12]) : "l"(p));
asm("ld.global.cs.nc.u32 {%0}, [%1 + 52];" :
"=r"(code32[13]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 0];" :
"=r"(code32[0]), "=r"(code32[1]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 8];" :
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 16];" :
"=r"(code32[4]), "=r"(code32[5]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 24];" :
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 32];" :
"=r"(code32[8]), "=r"(code32[9]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 40];" :
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v2.u32 {%0, %1}, [%2 + 48];" :
"=r"(code32[12]), "=r"(code32[13]) : "l"(p));
}
};
......@@ -361,5 +316,34 @@ struct LoadCode32<64> {
}
};
template<>
struct LoadCode32<96> {
static inline __device__ void load(unsigned int code32[24],
unsigned char* p,
int offset) {
p += offset * 96;
// FIXME: this is a non-coalesced load
// unfortunately need to reorganize memory layout by warp
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4];" :
"=r"(code32[0]), "=r"(code32[1]),
"=r"(code32[2]), "=r"(code32[3]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 16];" :
"=r"(code32[4]), "=r"(code32[5]),
"=r"(code32[6]), "=r"(code32[7]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 32];" :
"=r"(code32[8]), "=r"(code32[9]),
"=r"(code32[10]), "=r"(code32[11]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 48];" :
"=r"(code32[12]), "=r"(code32[13]),
"=r"(code32[14]), "=r"(code32[15]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 64];" :
"=r"(code32[16]), "=r"(code32[17]),
"=r"(code32[18]), "=r"(code32[19]) : "l"(p));
asm("ld.global.cs.nc.v4.u32 {%0, %1, %2, %3}, [%4 + 80];" :
"=r"(code32[20]), "=r"(code32[21]),
"=r"(code32[22]), "=r"(code32[23]) : "l"(p));
}
};
} } // namespace
......@@ -358,6 +358,9 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
......
......@@ -343,6 +343,9 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
......
......@@ -22,7 +22,11 @@
void pickEncoding(int& codes, int& dim) {
std::vector<int> codeSizes{3, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64};
std::vector<int> codeSizes{
3, 4, 8, 12, 16, 20, 24,
28, 32, 40, 48, 56, 64, 96
};
std::vector<int> dimSizes{4, 8, 16, 32};
codes = codeSizes[faiss::gpu::randVal(0, codeSizes.size() - 1)];
......@@ -55,8 +59,8 @@ struct Options {
faiss::gpu::INDICES_CPU,
faiss::gpu::INDICES_32_BIT,
faiss::gpu::INDICES_64_BIT});
if (codes == 64) {
// 64x8 lookup can only fit using float16
if (codes > 48) {
// large codes can only fit using float16
useFloat16 = true;
} else {
useFloat16 = faiss::gpu::randBool();
......
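
The switch from codes == 64 to codes > 48 reflects that the larger code sizes (56, 64 and now 96) are only supported with float16 lookup tables. A hypothetical back-of-envelope, assuming the per-query distance lookup table (one 256-entry row per sub-quantizer) must fit in roughly 48 KB of shared memory; the actual limits depend on the kernel and GPU architecture:

# Hypothetical sizing sketch: 256-entry distance table per sub-quantizer.
def lookup_table_kb(num_subquantizers, bytes_per_entry):
    return num_subquantizers * 256 * bytes_per_entry / 1024.0

for codes in (48, 64, 96):
    print(codes,
          "fp32:", lookup_table_kb(codes, 4), "KB",
          "fp16:", lookup_table_kb(codes, 2), "KB")
# 48 fp32: 48.0 KB  fp16: 24.0 KB
# 64 fp32: 64.0 KB  fp16: 32.0 KB   -> needs float16
# 96 fp32: 96.0 KB  fp16: 48.0 KB   -> needs float16
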
......@@ -10,6 +10,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "Float16.cuh"
#include "nvidia/fp16_emu.cuh"
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
......@@ -48,67 +49,10 @@ void runConvertToFloat32(float* out,
in, in + num, out, HalfToFloat());
}
// FIXME: replace
/*
Copyright (c) 2015, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
half hostFloat2Half(float a) {
uint32_t ia;
uint16_t ir;
memcpy(&ia, &a, sizeof(float));
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
} else {
ir = 0x7fff; /* canonical NaN */
}
} else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
} else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
} else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
half ret;
memcpy(&ret, &ir, sizeof(half));
return ret;
half h;
h.x = cpu_float2half_rn(a).x;
return h;
}
} } // namespace
......
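
The hand-written float-to-half conversion (derived from Norbert Juffa's code) is replaced by cpu_float2half_rn from the bundled fp16_emu helpers, which performs the same IEEE-754 round-to-nearest-even conversion on the host. As an illustration only (not the code path used here), numpy's float32-to-float16 cast applies the same rounding and can serve as a reference:

import numpy as np

def float32_to_half_bits(values):
    """Round-to-nearest-even float32 -> float16, returned as raw 16-bit patterns.
    Illustrative stand-in for a host-side float2half conversion."""
    halves = np.asarray(values, dtype=np.float32).astype(np.float16)
    return halves.view(np.uint16)

bits = float32_to_half_bits([1.0, 65504.0, 1e-8, float('inf')])
print([hex(int(b)) for b in bits])   # ['0x3c00', '0x7bff', '0x0', '0x7c00']
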
......@@ -58,3 +58,22 @@ class TestPCA(testutil.BaseFacebookTestCase):
for o in column_norm2:
self.assertGreater(prev, o)
prev = o
class TestProductQuantizer(testutil.BaseFacebookTestCase):
def test_pq(self):
d = 64
n = 1000
cs = 4
np.random.seed(123)
x = np.random.random(size=(n, d)).astype('float32')
pq = faiss.ProductQuantizer(d, cs, 8)
pq.train(x)
codes = pq.compute_codes(x)
x2 = pq.decode(codes)
diff = ((x - x2)**2).sum()
# print "diff=", diff
# diff= 1807.98
self.assertGreater(2500, diff)
......@@ -16,7 +16,6 @@
#include <cassert>
#include <cstring>
#include <cmath>
#include <cmath>
#include <smmintrin.h>
#include <mmintrin.h>
......@@ -99,8 +98,8 @@ size_t get_mem_usage_kb ()
size_t get_mem_usage_kb ()
{
fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
return 0;
fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
return 0;
}
#endif
......@@ -173,13 +172,13 @@ int rand_r(unsigned *seed);
RandomGenerator::RandomGenerator (long seed)
{
rand_state = seed;
rand_state = seed;
}
RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
rand_state = other.rand_state;
rand_state = other.rand_state;
}
......@@ -208,7 +207,7 @@ int RandomGenerator::rand_int (int max)
float RandomGenerator::rand_float ()
{
return rand_int() / float(1 << 31);
return rand_int() / float(1L << 31);
}
double RandomGenerator::rand_double ()
......@@ -233,7 +232,7 @@ void float_rand (float * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -255,7 +254,7 @@ void float_randn (float * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -292,7 +291,7 @@ void long_rand (long * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -329,7 +328,7 @@ void byte_rand (uint8_t * x, size_t n, long seed)
RandomGenerator rng0 (seed);
int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
#pragma omp parallel for
#pragma omp parallel for
for (size_t j = 0; j < nblock; j++) {
RandomGenerator rng (a0 + j * b0);
......@@ -1278,7 +1277,9 @@ void pairwise_L2sqr (long d,
* Kmeans subroutine
***************************************************************************/
#define EPS 1e-7
// a bit above machine epsilon for float16
#define EPS (1 / 1024.)
/* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
......@@ -1347,12 +1348,12 @@ int km_update_centroids (const float * x,
/* small symmetric pertubation. Much better than */
for (size_t j = 0; j < d; j++)
if (j % 2 == 0) {
centroids[ci * d + j] += EPS;
centroids[cj * d + j] -= EPS;
centroids[ci * d + j] *= 1 + EPS;
centroids[cj * d + j] *= 1 - EPS;
}
else {
centroids[ci * d + j] -= EPS;
centroids[cj * d + j] += EPS;
centroids[ci * d + j] *= 1 + EPS;
centroids[cj * d + j] *= 1 - EPS;
}
/* assume even split of the cluster */
......
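
The split perturbation changes from an absolute offset (1e-7) to a relative one (1/1024, roughly the spacing of float16 values near 1), so the two copies of a split centroid remain distinct even at reduced precision. A small sketch of why the absolute offset can be lost, assuming the concern is float16 round-off as the new comment suggests:

import numpy as np

c = np.float16(1.0)

# An absolute perturbation of 1e-7 is far below float16 resolution near 1.0,
# so the two "split" copies collapse back onto the same value.
print(np.float16(c + np.float32(1e-7)) == c)        # True: perturbation lost

# A relative perturbation of 1/1024 is about one ulp at 1.0 and survives.
eps = np.float32(1 / 1024.)
print(np.float16(np.float32(c) * (1 + eps)) == c)   # False: copies stay distinct
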