Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bug fixes from GitHub issues
k-means with support for frozen centroids
GPU: better tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
......@@ -345,6 +345,7 @@ void ParameterSpace::initialize (const Index * index)
}
if (DC (IndexIVF)) {
{
ParameterRange & pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
......@@ -352,6 +353,7 @@ void ParameterSpace::initialize (const Index * index)
pr.values.push_back (nprobe);
}
}
}
if (DC (IndexPQ)) {
ParameterRange & pr = add_range("ht");
init_pq_ParameterRange (ix->pq, pr);
......@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
}
}
if (DC (IndexIVFPQR)) {
assert (ix);
ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i);
......@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
if (name == "verbose") {
index->verbose = int(val);
// and fall through to also enable it on sub-indexes
}
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexShards)) {
// call on all sub-indexes
for (auto & shard_index : ix->shard_indexes) {
set_index_parameter (shard_index, name, val);
}
return;
}
if (name == "verbose") {
index->verbose = int(val);
// in case it was an IndexPreTransform
}
if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") {
......@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
return; // last verbose that we could find
}
if (name == "nprobe") {
DC(IndexIVF);
if ( DC(IndexIVF)) {
ix->nprobe = int(val);
} else if (name == "ht") {
return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ;
......@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val);
}
return;
} else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0;
} else {
ix->polysemous_ht = int(val);
}
return;
}
}
} else if (name == "k_factor") {
DC (IndexIVFPQR);
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
} else if (name == "max_codes") {
DC (IndexIVFPQ);
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVFPQ)) {
ix->max_codes = finite(val) ? size_t(val) : 0;
} else {
FAISS_THROW_FMT (
"ParameterSpace::set_index_parameter:"
return;
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
}
void ParameterSpace::display () const
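The restructured set_index_parameter above dispatches each named parameter (nprobe, ht, k_factor, max_codes, verbose) to the matching index class and recurses into pre-transforms and shards. A minimal usage sketch, assuming `index` is a trained faiss::Index* built elsewhere:

    faiss::ParameterSpace ps;
    ps.set_index_parameter(index, "nprobe", 32);   // routed to the IndexIVF layer
    ps.set_index_parameter(index, "verbose", 1);   // also forwarded to sub-indexes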
......@@ -634,6 +654,15 @@ struct VTChain {
}
};
/// what kind of training does this coarse quantizer require?
char get_trains_alone(const Index *coarse_quantizer) {
return
dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
0;
}
}
Index *index_factory (int d, const char *description_in, MetricType metric)
......@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok;
tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok);
// to avoid mem leaks with exceptions:
......@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
} else if (stok == "L2norm") {
vt_1 = new NormalizationTransform (d, 2.0);
// coarse quantizers
} else if (!coarse_quantizer &&
sscanf (tok, "IVF%d", &ncentroids) == 1) {
if (metric == METRIC_L2) {
......@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVF *index_ivf = new IndexIVFFlat (
coarse_quantizer, d, ncentroids, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
......@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
new IndexIVFScalarQuantizer (
coarse_quantizer, d, ncentroids, qt, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
......@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVFPQR *index_ivf = new IndexIVFPQR (
coarse_quantizer, d, ncentroids, M, 8, M2, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
} else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
std::string soption = option;
// np to disable polysemous training
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->metric_type = metric;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_ivf->do_polysemous_training = true;
index_ivf->do_polysemous_training = soption != "np";
index_1 = index_ivf;
} else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = true;
index_pq->do_polysemous_training = soption != "np";
index_1 = index_pq;
}
} else if (stok == "RFlat") {
make_IndexRefineFlat = true;
} else {
......
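With the new "PQ%d%10s" parsing above, the factory string can carry an "np" suffix on the PQ component to disable polysemous training. A hedged sketch of the resulting usage (the dimension 128 is a placeholder):

    // "np" on the PQ component sets do_polysemous_training = false
    faiss::Index *index = faiss::index_factory(128, "IVF1024,PQ32np");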
......@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a srcipting language like Lua or Python. */
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
......
......@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
nredo(1),
verbose(false), spherical(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234)
......@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
float * dis = new float[nx];
ScopeDeleter<float> del2(dis);
// for redo
float best_err = 1e50;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n",
......@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int redo = 0; redo < nredo; redo++) {
std::vector<float> buf_centroids;
std::vector<float> &cur_centroids =
nredo == 1 ? centroids : buf_centroids;
if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
if (cur_centroids.size() == 0) {
// initialize centroids with random points from the dataset
cur_centroids.resize (d * k);
// initialize remaining centroids with random points from the dataset
centroids.resize (d * k);
std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
#pragma omp parallel for
for (int i = 0; i < k ; i++)
memcpy (&cur_centroids[i * d], x + perm[i] * d,
for (int i = n_input_centroids; i < k ; i++)
memcpy (&centroids[i * d], x + perm[i] * d,
d * sizeof (float));
} else { // assume user provides some meaningful initialization
FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
FAISS_THROW_IF_NOT_MSG (nredo == 1,
"will redo with same initialization");
}
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
if (!index.is_trained)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
FAISS_THROW_IF_NOT (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
float err = 0;
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
......@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
err += dis[j];
obj.push_back (err);
int nsplit = km_update_centroids (x, cur_centroids.data(),
assign, d, k, nx);
int nsplit = km_update_centroids (
x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): "
......@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
}
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
index.reset ();
if (update_index)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
assert (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
}
if (verbose) printf("\n");
if (nredo > 1) {
if (err < best_err) {
if (verbose)
printf ("Objective improved: keep new clusters\n");
centroids = buf_centroids;
best_centroids = centroids;
best_obj = obj;
best_err = err;
}
index.reset ();
}
}
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
}
}
......
......@@ -28,6 +28,7 @@ struct ClusteringParameters {
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
......
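The frozen_centroids flag above lets a caller seed Clustering::centroids with vectors that k-means keeps fixed while the remaining centroids are trained. A minimal sketch, assuming d, k, nx, x (training vectors), init and n_init (pre-computed centroids, n_init < k) are supplied by the caller:

    faiss::Clustering clus(d, k);
    clus.frozen_centroids = true;                    // do not move the seeded centroids
    clus.centroids.assign(init, init + n_init * d);  // provide n_init input centroids
    faiss::IndexFlatL2 assigner(d);
    clus.train(nx, x, assigner);                     // learns the remaining k - n_init centroids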
......@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("Can not compute reconstruct without "
"knowing how to do so");
FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
......@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
nlist (nlist),
nprobe (1),
quantizer (quantizer),
quantizer_trains_alone (false),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr),
ids (nlist),
maintain_direct_map (false)
{
......@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr),
quantizer_trains_alone (false), own_fields (false),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false)
{}
......@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone) {
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else {
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
if (verbose)
printf ("Training IVF residual\n");
......@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
"this subset type is not implemented");
FAISS_THROW_IF_NOT_FMT (
subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0;
size_t accu_a1 = 0;
......@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
other.ntotal++;
}
}
} else if (subset_type == 1) {
for (long i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
ids_out.push_back (id);
codes_out.insert (codes_out.end(),
codes_in.begin() + i * code_size,
codes_in.begin() + (i + 1) * code_size);
other.ntotal++;
}
}
} else if (subset_type == 2) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1;
accu_a1 = next_accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2;
accu_a2 = next_accu_a2;
ids_out.insert(ids_out.end(),
ids_in.begin() + i1,
ids_in.begin() + i2);
......@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
codes_in.begin() + i1 * code_size,
codes_in.begin() + i2 * code_size);
other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
}
accu_n += n;
}
......
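copy_subset_to now accepts subset_type 1 in addition to 0 and 2: type 1 copies every entry whose id satisfies id % a1 == a2, while type 2 splits each inverted list proportionally between positions a1 and a2. A hedged sketch, assuming src and dst are IndexIVF instances with the same nlist and that nshard/rank are placeholders:

    // subset_type 1 (added here): dst receives the ids with id % nshard == rank
    src.copy_subset_to(dst, 1, nshard, rank);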
......@@ -47,10 +47,17 @@ struct IndexIVF: Index {
size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
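quantizer_trains_alone is now a three-valued switch, and clustering_index lets the k-means assignment run on a different index than the coarse quantizer itself. A sketch of mode 2, with d, nlist, nt and xt as placeholders:

    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat index(&quantizer, d, nlist, faiss::METRIC_L2);
    index.quantizer_trains_alone = 2;   // k-means on a flat L2 index, then add centroids to the quantizer
    // index.clustering_index = &other_index;  // optional: run the assignment on another (e.g. GPU) index
    index.train(nt, xt);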
......@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
for (int j = 0; j < d; j++) {
r[j] += centroid[j];
}
}
else {
} else {
pq.decode (code_line + ofs * pq.code_size, r);
}
}
......@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff;
......@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ ()
......
......@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels,
bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table
void precompute_table ();
......
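The new IndexIVFPQ::search_and_reconstruct returns, alongside distances and labels, an approximate reconstruction of each result decoded from its PQ code plus the coarse centroid. A usage sketch, assuming `index` is a trained, populated IndexIVFPQ and n, k, queries are placeholders:

    std::vector<float> D(n * k);
    std::vector<faiss::Index::idx_t> I(n * k);
    std::vector<float> R(n * k * index.d);          // laid out as (n, k, d)
    index.search_and_reconstruct(n, queries, k, D.data(), I.data(), R.data());
    // slots with I[i*k + j] < 0 are unused; their reconstruction is filled with NaNs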
......@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/
......@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->encode_vector (x + i * d, codes + i * code_size);
delete squant;
}
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->decode_vector (codes + i * code_size, x + i * d);
delete squant;
}
/*******************************************************************
......@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
minheap_reorder (k, simi, idxi);
}
} else {
#pragma omp parallel for
......@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
maxheap_reorder (k, simi, idxi);
}
}
......@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
long list_no = idx [i];
......@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
ntotal += nadd;
}
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
const float *x,
......@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
maxheap_reorder (k, simi, idxi);
}
} // anonymous namespace
void IndexIVFScalarQuantizer::search_preassigned (
idx_t n, const float *x, idx_t k,
const idx_t *idx,
......
......@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
cp python/_swigfaiss.so python/swigfaiss.py .
#############################
# Dependencies
# Dependencies.
# make dep > x
# then copy/paste from x by hand below
# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
dep:
for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
cpp -MM -std=gnu++0x $$i ; \
done
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
FaissException.o: FaissException.cpp FaissException.h
hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
Heap.o: Heap.cpp Heap.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h AuxIndexStructures.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
hamming.h FaissAssert.h FaissException.h IndexFlat.h \
AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
IndexFlat.h utils.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
FaissException.o: FaissException.cpp FaissException.h
clean:
......
......@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
}
}
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace {
struct IDTranslatedSelector: IDSelector {
......@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
}
FAISS_ASSERT (j == index->ntotal);
ntotal = j;
id_map.resize(ntotal);
return nremove;
}
......
......@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
/// remove ids adapted to IndexFlat
long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
~IndexIDMap() override;
IndexIDMap () {own_fields=false; index=nullptr; }
};
......
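IndexIDMap now forwards range_search to the wrapped index and translates the returned labels through id_map. A minimal sketch, assuming idmap wraps a populated sub-index that supports range search and nq, queries, radius are placeholders:

    faiss::RangeSearchResult res(nq);
    idmap.range_search(nq, queries, radius, &res);
    for (size_t q = 0; q < res.nq; q++) {
        // results for query q are res.labels[j] / res.distances[j]
        // for j in [res.lims[q], res.lims[q + 1])
    }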
......@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
const float *prev_x = x;
ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) {
VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained)
ltrans->train(n, prev_x);
if (!ltrans->is_trained) {
if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x);
}
if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete prev_x;
if (prev_x != x) delete [] prev_x;
prev_x = xt;
del.set(xt);
}
......
......@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
......
......@@ -121,6 +121,18 @@ def handle_Index(the_class):
swig_ptr(labels))
return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
def replacement_remove_ids(self, x):
if isinstance(x, IDSelector):
sel = x
......@@ -167,6 +179,8 @@ def handle_Index(the_class):
replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'update_vectors', replacement_update_vectors,
ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class):
......@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
return index_cpu_to_gpu_multiple(vres, vdev, index, co)
def vector_float_to_array(v):
a = np.empty(v.size(), dtype='float32')
memcpy(swig_ptr(a), v.data(), 4 * v.size())
def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
if ngpu == -1:
ngpu = get_num_gpus()
res = [StandardGpuResources() for i in range(ngpu)]
index2 = index_cpu_to_gpu_multiple_py(res, index, co)
index2.dont_dealloc = res
return index2
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
class Kmeans:
def __init__(self, d, k, niter=25, verbose=False, spherical = False):
......@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):
def normalize_L2(x):
fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
......@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuAutoTune.h"
#include <typeinfo>
#include "GpuIndex.h"
#include "../FaissAssert.h"
......@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)
GpuClonerOptions::GpuClonerOptions():
indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(true),
reserveVecs(0),
storeTransposed(false),
verbose(0)
{}
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
GpuResources *resources;
int device;
......@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
return cl.clone_Index(index);
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
{}
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners;
......@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
{}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
Index *clone_Index(const Index *index) override {
long n = sub_cloners.size();
if (n == 1)
......@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
dynamic_cast<const faiss::IndexIVFPQ *>(index);
auto index_ivfflat =
dynamic_cast<const faiss::IndexIVFFlat *>(index);
FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
"IndexShards implemented only for "
"IndexIVFFlat or IndexIVFPQ");
std::vector<faiss::Index*> shards(n);
for(long i = 0; i < n; i++) {
// make a shallow copy
long i0 = i * index->ntotal / n;
long i1 = (i + 1) * index->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
if(reserveVecs)
sub_cloners[i].reserveVecs =
(reserveVecs + n - 1) / n;
......@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained;
index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
FAISS_ASSERT(idx2.ntotal == i1 - i0);
copy_ivf_shard (index_ivfpq, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) {
faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer, index->d,
index_ivfflat->nlist, index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe;
index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
idx2.nprobe = index_ivfflat->nprobe;
copy_ivf_shard (index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
}
}
faiss::IndexShards *res =
new faiss::IndexShards(index->d, true, false);
......@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const
{
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexProxy)) {
for (int i = 0; i < ix->count(); i++)
set_index_parameter (ix->at(i), name, val);
return;
}
if (DC (faiss::IndexShards)) {
for (auto sub_index : ix->shard_indexes)
set_index_parameter (sub_index, name, val);
return;
}
if (DC (GpuIndexIVF)) {
if (name == "nprobe") {
DC (GpuIndexIVF);
FAISS_ASSERT(ix);
ix->setNumProbes (int (val));
return;
}
}
if(DC (GpuIndexIVFPQ)) {
if (name == "use_precomputed_table") {
DC (GpuIndexIVFPQ);
FAISS_ASSERT(ix);
ix->setPrecomputedCodes(bool (val));
return;
}
}
FAISS_ASSERT_MSG (false, "unknown parameter");
// maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val);
}
......
......@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false) {
: shard(false),
shard_type(1)
{
}
} } // namespace
......@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
/// Whether to shard the index across GPUs, versus replication
/// across GPUs
bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
};
} } // namespace
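The new shard_type option selects which IndexIVF::copy_subset_to mode is used when sharding an IVF index across GPUs: 1 spreads ids by modulo, 2 splits the inverted lists sequentially. A hedged sketch of cloning with explicit sharding options (resources, devices and cpu_index are placeholders set up as usual for index_cpu_to_gpu_multiple):

    faiss::gpu::GpuMultipleClonerOptions co;
    co.shard = true;       // shard across GPUs instead of replicating
    co.shard_type = 1;     // 1: ids spread by modulo; 2: sequential split of the lists
    faiss::Index *gpu_index =
        faiss::gpu::index_cpu_to_gpu_multiple(resources, devices, cpu_index, &co);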
......@@ -26,7 +26,7 @@ struct GpuIndexConfig {
/// GPU device on which the index is resident
int device;
/// What memory space to use for primary storae.
/// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU.
MemorySpace memorySpace;
......
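MemorySpace::Unified, exercised further down by demo_ivfpq_indexing's --use_unified_mem flag, lets a Pascal-class (CC 6.0+) GPU oversubscribe its memory for index storage. A sketch of the flat-index configuration, with res (a StandardGpuResources) and d as placeholders:

    faiss::gpu::GpuIndexFlatConfig config;
    config.device = 0;
    config.memorySpace = faiss::gpu::MemorySpace::Unified;  // requires CC 6.0 or above
    faiss::gpu::GpuIndexFlatL2 gpu_index(&res, d, config);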
......@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
}
index->quantizer = q;
index->quantizer_trains_alone = false;
index->quantizer_trains_alone = 0;
index->own_fields = true;
index->cp = this->cp;
index->ids.clear();
......
......@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
FAISS_ASSERT(index->pq.byte_per_idx == 1);
FAISS_ASSERT(index->by_residual);
FAISS_ASSERT(index->polysemous_ht == 0);
ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;
verifySettings_();
......
......@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can
// take shortcuts.
bool ignoreOutDistances = false,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
......@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
Tensor<int, 2, true>& outIndices);
#ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources,
......@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
int tileSizeOverride = -1);
bool useHgemm);
void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors,
......@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
bool ignoreOutDistances = false,
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
#endif
} } // namespace
......@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
......@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) {
// Convert outDistances back
......@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances,
outIndices,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectors_,
......@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input,
k,
outDistances,
outIndices,
tileSize);
outIndices);
}
}
}
......@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
if (l2Distance_) {
......@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices,
useFloat16Accumulator_,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectorsHalf_,
......@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k,
outDistances,
outIndices,
useFloat16Accumulator_,
tileSize);
useFloat16Accumulator_);
}
}
#endif
......@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream);
stream,
true /* reserve exactly */);
#endif
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream);
stream,
true /* reserve exactly */);
}
num_ += numVecs;
......
......@@ -61,16 +61,14 @@ class FlatIndex {
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#endif
/// Add vectors to ourselves; the pointer passed can be on the host
......
......@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView,
closestSubQIndexView,
// We don't care about distances
true,
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
true);
}
// Now, we have the nearest sub-q centroid for each slice of the
......
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
......@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
......
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step
// and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
......@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
......
......@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2)
template <typename T, typename TVec,
template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input,
Tensor<T, 1, true> output) {
__global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize;
TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
TIndex laneId = getLaneId();
TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x;
TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize];
if (lastRowTile) {
// We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) {
for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) {
rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
......@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero();
}
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll
for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col];
......@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
}
}
template <typename T, typename TVec>
void runL2Norm(Tensor<T, 2, true>& input,
Tensor<T, 1, true>& output,
template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true, TIndex>& output,
bool normSquared,
cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice();
TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \
if (normLoop) { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} else { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} \
......@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
// Can load using the vectorized type
auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1);
auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else {
// Can't load using the vectorized type
int dim = input.getSize(1);
auto dim = input.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
}
#ifdef FAISS_USE_FLOAT16
......@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
}
#endif
......
......@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32");
DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu;
......@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config;
config.device = dev;
config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
......@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
if (FLAGS_cpu) {
float cpuTime = 0.0f;
{
CpuTimer timer;
index->search(numQueries,
cpuQuery.data(),
......@@ -101,9 +107,8 @@ int main(int argc, char** argv) {
cpuIndices.data());
cpuTime = timer.elapsedMilliseconds();
}
printf("CPU time %.3f ms\n", cpuTime);
}
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
......@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0;
}
......@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2,
bool useFloat16,
bool useTransposed,
int kOverride = -1) {
int numVecs = faiss::gpu::randVal(1000, 20000);
struct TestFlatOptions {
TestFlatOptions()
: useL2(true),
useFloat16(false),
useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512);
int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16
int k = useFloat16 ?
int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) {
k = kOverride;
if (opt.kOverride > 0) {
k = opt.kOverride;
}
faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP;
opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have
// multiple devices
......@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config;
config.device = device;
config.useFloat16 = useFloat16;
config.storeTransposed = useTransposed;
config.useFloat16 = opt.useFloat16;
config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
......@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data());
std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs
str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim
<< " useFloat16 " << useFloat16
<< " transposed " << useTransposed
<< " useFloat16 " << opt.useFloat16
<< " transposed " << opt.useTransposed
<< " numQuery " << numQuery
<< " k " << k;
// To some extent, we depend upon the relative error for the test
// for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f);
opt.useFloat16 ? 0.99f : 0.1f,
opt.useFloat16 ? 0.65f : 0.015f);
}
TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......
......@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h>
#include <glog/logging.h>
#include <sstream>
......@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false);
}
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f *= std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
//
// NaN tests
//
......
......@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v;
}
void compareIndices(faiss::Index& refIndex,
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare
std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(),
refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(),
testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(),
......@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN);
}
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j];
......
......@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity
/// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
......
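A hedged usage sketch of the query-vector overload declared above, relying on its default tolerances; the faiss::Index arguments and the GPU test headers are assumed to be available, and the wrapper name is illustrative.
#include <vector>
// Hedged sketch: compare two indices on a caller-controlled query set,
// using the default error tolerances from the declaration above.
void sanityCompare(faiss::Index& refIndex, faiss::Index& testIndex,
                   int numQuery, int dim, int k) {
  std::vector<float> queries = faiss::gpu::randVecs(numQuery, dim);
  faiss::gpu::compareIndices(queries, refIndex, testIndex,
                             numQuery, dim, k, "sanity check");
}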
......@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k)
assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I
......
......@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
}
}
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace
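The cascade above rounds the requested k up to the smallest warp-queue size for which a kernel specialization was compiled. A hypothetical helper (not part of the FAISS API) that mirrors that mapping:
// Hypothetical helper mirroring the dispatch above: map a requested k to the
// warp-queue size whose BLOCK_SELECT_PAIR_CALL specialization handles it.
inline int blockSelectQueueSizeFor(int k) {
  const int sizes[] = {1, 32, 64, 128, 256, 512, 1024};
  for (int s : sizes) {
    if (k <= s) {
      return s;
    }
  }
  return -1; // k > 1024 is rejected by the FAISS_ASSERT above
}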
......@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
}
}
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif
} } // namespace
......@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
}
}
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle last remainder fraction of a warp of elements
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif
} } // namespace
......@@ -12,37 +12,37 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
this->operator=(std::move(t));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_));
}
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner;
......@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_));
......@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
const IndexT sizes[Dim],
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
std::initializer_list<IndexT> sizes,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner),
space_(space) {
......@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation),
space_(space) {
......@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero(
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) {
if (this->data_) {
// Region must be contiguous
......
......@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor();
/// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it
/// locally
......@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream);
private:
......
......@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
}
}
cudaDeviceProp& getDeviceProperties(int device) {
const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties;
......@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second;
}
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock;
}
......
......@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device);
const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU
/// device
......
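A minimal usage sketch of getCurrentDeviceProperties (assuming the usual FAISS GPU utility header is included); the reference can be held because the properties are cached per device.
// Hedged sketch: query the cached properties of the active device.
const cudaDeviceProp& prop = faiss::gpu::getCurrentDeviceProperties();
int maxThreadsPerBlock = prop.maxThreadsPerBlock;
int smCount = prop.multiProcessorCount;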
......@@ -10,18 +10,18 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_;
......@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) {
// Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous());
......@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
}
/// Call to zero out memory
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous
FAISS_ASSERT(this->isContiguous());
......@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements();
FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0);
if (Contig) {
if (InnerContig) {
auto a = this->data();
auto b = t.data();
......
......@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream);
/// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero();
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors
__host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const;
maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal?
__host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0);
}
......
......@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <>
struct Limits<float> {
static __device__ __host__ inline float getMin() {
return -kFloatMax;
return kFloatMin;
}
static __device__ __host__ inline float getMax() {
return kFloatMax;
......@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <>
struct Limits<int> {
......
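The switch to std::numeric_limits<T>::lowest() above uses the accessor that actually means "most negative finite value": for floating-point types min() is the smallest positive normal number, while for integers min() and lowest() coincide. A small sketch of the distinction:
// Hedged sketch of the min()/lowest() distinction motivating the change above.
#include <limits>
static_assert(std::numeric_limits<float>::min() > 0.0f,
              "min() is a tiny positive value for float");
static_assert(std::numeric_limits<float>::lowest() ==
                  -std::numeric_limits<float>::max(),
              "lowest() is the most negative finite float");
static_assert(std::numeric_limits<int>::lowest() ==
                  std::numeric_limits<int>::min(),
              "for integers, min() and lowest() agree");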
......@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in
// column-major layout
T* pA = transC ? a.data() : b.data();
......@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1);
int ldb = transC ? b.getSize(1) : a.getSize(1);
int ldc = c.getSize(1);
int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2);
int ldb = transC ? b.getSize(2) : a.getSize(2);
int ldc = c.getSize(2);
int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2);
size_t bOffset = b.getSize(1) * b.getSize(2);
size_t cOffset = c.getSize(1) * c.getSize(2);
size_t aOffset = a.getStride(0);
size_t bOffset = b.getStride(0);
size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
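The lda/ldb/ldc changes above matter once tensors are only inner-contiguous: for a view narrowed out of a larger allocation, the distance between rows (getStride) can exceed the logical width (getSize), and cuBLAS needs the former as the leading dimension. A hedged sketch with illustrative names:
#include <cassert>
// Hedged sketch, illustrative names only: a row-major [rows x cols] view whose
// innermost dimension is packed may still have a row stride larger than cols.
struct RowMajorView {
  const float* data;
  int rows;
  int cols;
  int rowStride; // elements between consecutive rows; >= cols for narrowed views
};
// When such a view is handed to a column-major BLAS, the leading dimension is
// the row stride (what getStride(0) returns), not the logical number of columns.
inline int leadingDim(const RowMajorView& m) {
  assert(m.rowStride >= m.cols);
  return m.rowStride;
}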
......@@ -16,7 +16,7 @@
namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int>
template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor {
public:
NoTypeTensor()
......@@ -25,7 +25,7 @@ class NoTypeTensor {
}
template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t)
NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()),
typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) {
......@@ -87,13 +87,14 @@ class NoTypeTensor {
}
template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() {
Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_);
return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
}
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) {
NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_;
if (start > 0) {
......@@ -110,7 +111,7 @@ class NoTypeTensor {
}
}
return NoTypeTensor<Dim, Contig, IndexT>(
return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_);
}
......
......@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
}
static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) {
write_index_header (ivf, f);
......@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained);
}
ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading", fname);
......@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
}
idx = idxmap;
} else {
fprintf (stderr, "Index type 0x%08x not supported\n", h);
abort ();
FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
idx = nullptr;
}
return idx;
}
......
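With the change above an unsupported index header now raises a C++ exception rather than aborting the process. A hedged sketch of how a caller might handle it, assuming faiss::FaissException (the type thrown by FAISS_THROW_FMT) and the index_io header are included; the wrapper name is illustrative:
#include <cstdio>
// Hedged sketch: recover from an unreadable or unsupported index file.
faiss::Index* load_index_or_null(const char* fname) {
  try {
    return faiss::read_index(fname);
  } catch (const faiss::FaissException& e) {
    std::fprintf(stderr, "could not load %s: %s\n", fname, e.what());
    return nullptr;
  }
}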
......@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a,
void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
/** For k-means: update stage. Returns nb of split clusters. */
/** For k-means: update stage.
*
* @param x training vectors, size n * d
* @param centroids centroid vectors, size k * d
* @param assign nearest centroid for each training vector, size n
* @param k_frozen do not update the k_frozen first centroids
* @return nb of splitting operations to fight empty clusters
*/
int km_update_centroids (
const float * x,
float * centroids,
long * assign,
size_t d, size_t k, size_t n);
size_t d, size_t k, size_t n,
size_t k_frozen);
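A hedged usage sketch of the new k_frozen parameter; the sizes are made up and x, centroids and assign are assumed to be filled by the caller (for example by assigning each training vector to its nearest current centroid):
#include <vector>
// Hedged sketch: re-estimate centroids while keeping the first k_frozen fixed.
void update_with_frozen_centroids(const std::vector<float>& x,   // n * d
                                  std::vector<float>& centroids, // k * d
                                  std::vector<long>& assign,     // size n
                                  size_t d, size_t k, size_t n,
                                  size_t k_frozen) {
  int nsplit = faiss::km_update_centroids(
      x.data(), centroids.data(), assign.data(), d, k, n, k_frozen);
  (void) nsplit; // number of splitting operations used to fix empty clusters
}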
/** compute the Q of the QR decomposition for m > n
* @param a size n * m: input matrix and output Q
......