Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bugfixes from github issues
kmeans with some frozen centroids
GPU: better tiling for large flat datasets
default AVX for vector ops
parent 71335194
......@@ -345,6 +345,7 @@ void ParameterSpace::initialize (const Index * index)
}
if (DC (IndexIVF)) {
{
ParameterRange & pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
......@@ -352,6 +353,7 @@ void ParameterSpace::initialize (const Index * index)
pr.values.push_back (nprobe);
}
}
}
if (DC (IndexPQ)) {
ParameterRange & pr = add_range("ht");
init_pq_ParameterRange (ix->pq, pr);
......@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
}
}
if (DC (IndexIVFPQR)) {
assert (ix);
ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i);
......@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
if (name == "verbose") {
index->verbose = int(val);
// and fall through to also enable it on sub-indexes
}
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexShards)) {
// call on all sub-indexes
for (auto & shard_index : ix->shard_indexes) {
set_index_parameter (shard_index, name, val);
}
return;
}
if (name == "verbose") {
index->verbose = int(val);
// in case it was an IndexPreTransform
}
if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") {
......@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
return; // last verbose that we could find
}
if (name == "nprobe") {
DC(IndexIVF);
if ( DC(IndexIVF)) {
ix->nprobe = int(val);
} else if (name == "ht") {
return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ;
......@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val);
}
return;
} else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0;
} else {
ix->polysemous_ht = int(val);
}
return;
}
}
} else if (name == "k_factor") {
DC (IndexIVFPQR);
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
} else if (name == "max_codes") {
DC (IndexIVFPQ);
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVFPQ)) {
ix->max_codes = finite(val) ? size_t(val) : 0;
} else {
FAISS_THROW_FMT (
"ParameterSpace::set_index_parameter:"
return;
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
}
void ParameterSpace::display () const
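The restructured set_index_parameter above gives each parameter name its own guarded block and only reaches the final FAISS_THROW_FMT when nothing matched. A minimal usage sketch from the Python side (assuming "index" is a trained IVF index; parameter names as in the diff):

    import faiss
    ps = faiss.ParameterSpace()
    ps.set_index_parameter(index, "nprobe", 16)  # routed to IndexIVF::nprobe
    ps.set_index_parameter(index, "verbose", 1)  # also propagated to sub-indexes
    # an unrecognized name now falls through to the throw:
    # ps.set_index_parameter(index, "bogus", 1)  -> RuntimeError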
......@@ -634,6 +654,15 @@ struct VTChain {
}
};
/// what kind of training does this coarse quantizer require?
char get_trains_alone(const Index *coarse_quantizer) {
return
dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
0;
}
}
Index *index_factory (int d, const char *description_in, MetricType metric)
......@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok;
tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok);
// to avoid mem leaks with exceptions:
......@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
} else if (stok == "L2norm") {
vt_1 = new NormalizationTransform (d, 2.0);
// coarse quantizers
} else if (!coarse_quantizer &&
sscanf (tok, "IVF%d", &ncentroids) == 1) {
if (metric == METRIC_L2) {
......@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVF *index_ivf = new IndexIVFFlat (
coarse_quantizer, d, ncentroids, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
......@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
new IndexIVFScalarQuantizer (
coarse_quantizer, d, ncentroids, qt, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
......@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVFPQR *index_ivf = new IndexIVFPQR (
coarse_quantizer, d, ncentroids, M, 8, M2, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
} else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
std::string soption = option;
// np to disable polysemous training
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->metric_type = metric;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_ivf->do_polysemous_training = true;
index_ivf->do_polysemous_training = soption != "np";
index_1 = index_ivf;
} else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = true;
index_pq->do_polysemous_training = soption != "np";
index_1 = index_pq;
}
} else if (stok == "RFlat") {
make_IndexRefineFlat = true;
} else {
......
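The new PQ%d%10s pattern lets the factory string carry an option suffix after the number of sub-quantizers; for now only "np", which turns polysemous training off. A hedged sketch of what this enables (index_factory is the existing entry point):

    import faiss
    d = 64
    # "np" after PQ8 disables polysemous training for the PQ codes
    index = faiss.index_factory(d, "IVF256,PQ8np")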
......@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a srcipting language like Lua or Python. */
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
......
......@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
nredo(1),
verbose(false), spherical(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234)
......@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
float * dis = new float[nx];
ScopeDeleter<float> del2(dis);
// for redo
float best_err = 1e50;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n",
......@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int redo = 0; redo < nredo; redo++) {
std::vector<float> buf_centroids;
std::vector<float> &cur_centroids =
nredo == 1 ? centroids : buf_centroids;
if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
if (cur_centroids.size() == 0) {
// initialize centroids with random points from the dataset
cur_centroids.resize (d * k);
// initialize remaining centroids with random points from the dataset
centroids.resize (d * k);
std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
#pragma omp parallel for
for (int i = 0; i < k ; i++)
memcpy (&cur_centroids[i * d], x + perm[i] * d,
for (int i = n_input_centroids; i < k ; i++)
memcpy (&centroids[i * d], x + perm[i] * d,
d * sizeof (float));
} else { // assume user provides some meaningful initialization
FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
FAISS_THROW_IF_NOT_MSG (nredo == 1,
"will redo with same initialization");
}
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
if (!index.is_trained)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
FAISS_THROW_IF_NOT (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
float err = 0;
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
......@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
err += dis[j];
obj.push_back (err);
int nsplit = km_update_centroids (x, cur_centroids.data(),
assign, d, k, nx);
int nsplit = km_update_centroids (
x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): "
......@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
}
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
index.reset ();
if (update_index)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
assert (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
}
if (verbose) printf("\n");
if (nredo > 1) {
if (err < best_err) {
if (verbose)
printf ("Objective improved: keep new clusters\n");
centroids = buf_centroids;
best_centroids = centroids;
best_obj = obj;
best_err = err;
}
index.reset ();
}
}
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
}
}
......
......@@ -28,6 +28,7 @@ struct ClusteringParameters {
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
......
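frozen_centroids works together with the n_input_centroids logic added to Clustering::train above: centroids already present in the centroids vector are kept (and, when frozen, excluded from the k-means updates via the new km_update_centroids argument), and only the remaining k - n_input ones are initialized from random training points. A minimal sketch through the SWIG layer, using the copy_array_to_vector / vector_to_array helpers added later in this commit (raw SWIG train signature; the pythonic wrappers may differ):

    import numpy as np
    import faiss

    d, k, n = 32, 100, 10000
    x = np.random.rand(n, d).astype('float32')
    keep = np.random.rand(10, d).astype('float32')   # 10 centroids to freeze

    clus = faiss.Clustering(d, k)
    clus.frozen_centroids = True
    faiss.copy_array_to_vector(keep.ravel(), clus.centroids)  # input centroids

    index = faiss.IndexFlatL2(d)
    clus.train(n, faiss.swig_ptr(x), index)
    centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
    # centroids[:10] equal keep; the other 90 were trained normally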
......@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("Can not compute reconstruct without "
"knowing how to do so");
FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
......@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
nlist (nlist),
nprobe (1),
quantizer (quantizer),
quantizer_trains_alone (false),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr),
ids (nlist),
maintain_direct_map (false)
{
......@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr),
quantizer_trains_alone (false), own_fields (false),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false)
{}
......@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone) {
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else {
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? " (user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
if (verbose)
printf ("Training IVF residual\n");
......@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
"this subset type is not implemented");
FAISS_THROW_IF_NOT_FMT (
subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0;
size_t accu_a1 = 0;
......@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
other.ntotal++;
}
}
} else if (subset_type == 1) {
for (long i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
ids_out.push_back (id);
codes_out.insert (codes_out.end(),
codes_in.begin() + i * code_size,
codes_in.begin() + (i + 1) * code_size);
other.ntotal++;
}
}
} else if (subset_type == 2) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1;
accu_a1 = next_accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2;
accu_a2 = next_accu_a2;
ids_out.insert(ids_out.end(),
ids_in.begin() + i1,
ids_in.begin() + i2);
......@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
codes_in.begin() + i1 * code_size,
codes_in.begin() + i2 * code_size);
other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
}
accu_n += n;
}
......
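The three subset types now read: type 0 copies vectors whose id lies in [a1, a2); the new type 1 copies ids with id % a1 == a2; type 2 takes the slice between global positions a1 and a2 when the inverted lists are scanned in order, spreading it proportionally over the lists. A sketch of modulo sharding from Python (assuming "index" is a trained, populated IndexIVFFlat):

    import faiss
    nshard = 4
    shards = []
    for i in range(nshard):
        sub = faiss.IndexIVFFlat(index.quantizer, index.d, index.nlist,
                                 index.metric_type)
        sub.is_trained = True   # quantizer is shared and already trained
        index.copy_subset_to(sub, 1, nshard, i)  # keep ids with id % 4 == i
        shards.append(sub)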
......@@ -47,10 +47,17 @@ struct IndexIVF: Index {
size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
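quantizer_trains_alone = 2 is the new mode exercised in IndexIVF::train above: run a plain L2 k-means (on a temporary IndexFlatL2, or on clustering_index if set) and then add the resulting centroids to whatever quantizer is installed. A hedged sketch (random data only, to make it runnable):

    import numpy as np
    import faiss

    d, nlist = 64, 256
    xt = np.random.rand(10000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.quantizer_trains_alone = 2   # k-means on a flat L2 index, then
                                       # add the centroids to the quantizer
    # index.clustering_index = ...     # optional: override the k-means index
    index.train(xt)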
......@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
for (int j = 0; j < d; j++) {
r[j] += centroid[j];
}
}
else {
} else {
pq.decode (code_line + ofs * pq.code_size, r);
}
}
......@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff;
......@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ ()
......
......@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels,
bool store_pairs) const override;
/** Same as the search function, but also reconstructs the approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table
void precompute_table ();
......
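From Python this is exposed through the search_and_reconstruct wrapper added further down in this commit; it returns the usual (n, k) distance and label arrays plus an (n, k, d) array of reconstructed neighbors, with NaN filling where fewer than k results exist. Sketch, assuming a populated IndexIVFPQ named "index" and float32 queries xq:

    D, I, R = index.search_and_reconstruct(xq, 10)
    # R.shape == (len(xq), 10, index.d); rows with I < 0 are NaN-filled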
......@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/
......@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->encode_vector (x + i * d, codes + i * code_size);
delete squant;
}
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->decode_vector (codes + i * code_size, x + i * d);
delete squant;
}
/*******************************************************************
......@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
minheap_reorder (k, simi, idxi);
}
} else {
#pragma omp parallel for
......@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
maxheap_reorder (k, simi, idxi);
}
}
......@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
long list_no = idx [i];
......@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
ntotal += nadd;
}
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
const float *x,
......@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
maxheap_reorder (k, simi, idxi);
}
} // anonymous namespace
void IndexIVFScalarQuantizer::search_preassigned (
idx_t n, const float *x, idx_t k,
const idx_t *idx,
......
......@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
cp python/_swigfaiss.so python/swigfaiss.py .
#############################
# Dependencies
# Dependencies.
# make dep > x
# then copy/paste from x by hand below
# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
dep:
for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
cpp -MM -std=gnu++0x $$i ; \
done
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
FaissException.o: FaissException.cpp FaissException.h
hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
Heap.o: Heap.cpp Heap.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h AuxIndexStructures.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
hamming.h FaissAssert.h FaissException.h IndexFlat.h \
AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
IndexFlat.h utils.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
FaissException.o: FaissException.cpp FaissException.h
clean:
......
......@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
}
}
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace {
struct IDTranslatedSelector: IDSelector {
......@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
}
FAISS_ASSERT (j == index->ntotal);
ntotal = j;
id_map.resize(ntotal);
return nremove;
}
......
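With range_search now forwarded by IndexIDMap, the result labels come back translated through id_map (negative labels, i.e. empty slots, are left as-is). Through the existing Python range_search wrapper this looks like (sketch, assuming a populated IndexIDMap named "index" and float32 queries xq):

    lims, D, I = index.range_search(xq, radius)
    # I contains the user-provided ids, not positions in the wrapped index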
......@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
/// remove ids adapted to IndexFlat
long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
~IndexIDMap() override;
IndexIDMap () {own_fields=false; index=nullptr; }
};
......
......@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
const float *prev_x = x;
ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) {
VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained)
ltrans->train(n, prev_x);
if (!ltrans->is_trained) {
if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x);
}
if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete prev_x;
if (prev_x != x) delete [] prev_x;
prev_x = xt;
del.set(xt);
}
......
......@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
......
......@@ -121,6 +121,18 @@ def handle_Index(the_class):
swig_ptr(labels))
return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
def replacement_remove_ids(self, x):
if isinstance(x, IDSelector):
sel = x
......@@ -167,6 +179,8 @@ def handle_Index(the_class):
replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'update_vectors', replacement_update_vectors,
ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class):
......@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
return index_cpu_to_gpu_multiple(vres, vdev, index, co)
def vector_float_to_array(v):
a = np.empty(v.size(), dtype='float32')
memcpy(swig_ptr(a), v.data(), 4 * v.size())
def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
if ngpu == -1:
ngpu = get_num_gpus()
res = [StandardGpuResources() for i in range(ngpu)]
index2 = index_cpu_to_gpu_multiple_py(res, index, co)
index2.dont_dealloc = res
return index2
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
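Round-tripping between numpy and the SWIG-wrapped std::vector types is now dtype-checked through vector_name_map. A quick sanity check (FloatVector is the SWIG name for std::vector<float>):

    import numpy as np
    import faiss
    v = faiss.FloatVector()
    a = np.arange(5, dtype='float32')
    faiss.copy_array_to_vector(a, v)   # numpy -> std::vector<float>
    b = faiss.vector_to_array(v)       # std::vector<float> -> numpy
    assert np.all(a == b)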
class Kmeans:
def __init__(self, d, k, niter=25, verbose=False, spherical = False):
......@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):
def normalize_L2(x):
fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
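These two wrappers make the SWIG-level MapLong2Long usable directly with int64 numpy arrays (search_multiple allocates its uint64 output itself). For example:

    import numpy as np
    import faiss
    m = faiss.MapLong2Long()
    keys = np.arange(5, dtype='int64')
    m.add(keys, keys * 10)
    m.search_multiple(keys)   # -> array([0, 10, 20, 30, 40], dtype=uint64)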
......@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuAutoTune.h"
#include <typeinfo>
#include "GpuIndex.h"
#include "../FaissAssert.h"
......@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)
GpuClonerOptions::GpuClonerOptions():
indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(true),
reserveVecs(0),
storeTransposed(false),
verbose(0)
{}
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
GpuResources *resources;
int device;
......@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
return cl.clone_Index(index);
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
{}
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners;
......@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
{}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
Index *clone_Index(const Index *index) override {
long n = sub_cloners.size();
if (n == 1)
......@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
dynamic_cast<const faiss::IndexIVFPQ *>(index);
auto index_ivfflat =
dynamic_cast<const faiss::IndexIVFFlat *>(index);
FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
"IndexShards implemented only for "
"IndexIVFFlat or IndexIVFPQ");
std::vector<faiss::Index*> shards(n);
for(long i = 0; i < n; i++) {
// make a shallow copy
long i0 = i * index->ntotal / n;
long i1 = (i + 1) * index->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
if(reserveVecs)
sub_cloners[i].reserveVecs =
(reserveVecs + n - 1) / n;
......@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained;
index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
FAISS_ASSERT(idx2.ntotal == i1 - i0);
copy_ivf_shard (index_ivfpq, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) {
faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer, index->d,
index_ivfflat->nlist, index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe;
index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
idx2.nprobe = index_ivfflat->nprobe;
copy_ivf_shard (index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
}
}
faiss::IndexShards *res =
new faiss::IndexShards(index->d, true, false);
......@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const
{
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexProxy)) {
for (int i = 0; i < ix->count(); i++)
set_index_parameter (ix->at(i), name, val);
return;
}
if (DC (faiss::IndexShards)) {
for (auto sub_index : ix->shard_indexes)
set_index_parameter (sub_index, name, val);
return;
}
if (DC (GpuIndexIVF)) {
if (name == "nprobe") {
DC (GpuIndexIVF);
FAISS_ASSERT(ix);
ix->setNumProbes (int (val));
return;
}
}
if(DC (GpuIndexIVFPQ)) {
if (name == "use_precomputed_table") {
DC (GpuIndexIVFPQ);
FAISS_ASSERT(ix);
ix->setPrecomputedCodes(bool (val));
return;
}
}
FAISS_ASSERT_MSG (false, "unknown parameter");
// maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val);
}
......
......@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false) {
: shard(false),
shard_type(1)
{
}
} } // namespace
......@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
/// Whether to shard the index across GPUs, versus replication
/// across GPUs
bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
};
} } // namespace
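shard_type selects which IndexIVF::copy_subset_to split is used when sharding across GPUs: 1 (the new default) shards by id modulo, 2 by contiguous proportional slices, matching copy_ivf_shard above. Sketch, assuming a trained, populated CPU IndexIVF named "index":

    import faiss
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    co.shard_type = 1   # split ids by modulo across GPUs
    gpu_index = faiss.index_cpu_to_all_gpus(index, co)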
......@@ -26,7 +26,7 @@ struct GpuIndexConfig {
/// GPU device on which the index is resident
int device;
/// What memory space to use for primary storae.
/// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU.
MemorySpace memorySpace;
......
......@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
}
index->quantizer = q;
index->quantizer_trains_alone = false;
index->quantizer_trains_alone = 0;
index->own_fields = true;
index->cp = this->cp;
index->ids.clear();
......
......@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
FAISS_ASSERT(index->pq.byte_per_idx == 1);
FAISS_ASSERT(index->by_residual);
FAISS_ASSERT(index->polysemous_ht == 0);
ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;
verifySettings_();
......
......@@ -23,6 +23,7 @@ CPPOBJ= GpuResources.o \
IndexProxy.o \
StandardGpuResources.o \
GpuAutoTune.o \
GpuClonerOptions.o \
impl/RemapIndices.o \
utils/DeviceMemory.o \
utils/StackDeviceMemory.o \
......@@ -123,23 +124,24 @@ clean:
dep:
for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \
$(patsubst %.o,%.cu,$(CUOBJ)); do \
echo -n $${i%/*}/ ; \
echo -n $$( dirname $$i )/ ; \
cpp -MM -std=gnu++0x $$i; \
done
GpuResources.cpp/GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
./GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h
IndexProxy.cpp/IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \
./IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \
../FaissAssert.h ../FaissException.h ../Clustering.h ../Index.h \
GpuIndexFlat.h GpuIndex.h utils/MemorySpace.h utils/../../FaissAssert.h \
StandardGpuResources.h GpuResources.h utils/DeviceMemory.h \
utils/StackDeviceMemory.h utils/DeviceUtils.h
StandardGpuResources.cpp/StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \
./StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \
GpuResources.h utils/DeviceMemory.h utils/StackDeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h
GpuAutoTune.cpp/GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \
./GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \
../Index.h GpuClonerOptions.h GpuIndicesOptions.h GpuIndex.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h ../index_io.h \
......@@ -161,6 +163,8 @@ utils/DeviceUtils.o: utils/DeviceUtils.cpp utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h
utils/Timer.o: utils/Timer.cpp utils/Timer.h utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h
utils/MemorySpace.o: utils/MemorySpace.cpp utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h
utils/WorkerThread.o: utils/WorkerThread.cpp utils/WorkerThread.h \
utils/../../FaissAssert.h utils/../../FaissException.h
impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
......@@ -169,12 +173,14 @@ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/StaticUtils.h
impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/BroadcastSum.cuh impl/L2Norm.cuh \
......@@ -189,8 +195,9 @@ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/ReductionOperators.cuh
impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
......@@ -200,8 +207,9 @@ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/InvertedListAppend.o: impl/InvertedListAppend.cu \
impl/InvertedListAppend.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../../FaissAssert.h \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/../../FaissAssert.h impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
......@@ -211,7 +219,8 @@ impl/IVFBase.o: impl/IVFBase.cu impl/IVFBase.cuh impl/../GpuIndicesOptions.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/MemorySpace.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/DeviceMemory.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor-inl.cuh impl/../GpuResources.h \
impl/FlatIndex.cuh impl/../utils/Float16.cuh impl/InvertedListAppend.cuh \
impl/RemapIndices.h impl/../utils/DeviceDefs.cuh \
......@@ -222,6 +231,7 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../GpuResources.h impl/FlatIndex.cuh impl/../utils/Float16.cuh \
impl/InvertedListAppend.cuh impl/IVFFlatScan.cuh impl/RemapIndices.h \
......@@ -230,8 +240,9 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/Transpose.cuh
impl/IVFFlatScan.o: impl/IVFFlatScan.cu impl/IVFFlatScan.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
......@@ -247,6 +258,7 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/BroadcastSum.cuh impl/Distance.cuh impl/FlatIndex.cuh \
......@@ -258,42 +270,46 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/MatrixMult.cuh impl/../utils/Transpose.cuh
impl/IVFUtils.o: impl/IVFUtils.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/StaticUtils.h impl/../utils/ThrustAllocator.cuh
impl/IVFUtilsSelect1.o: impl/IVFUtilsSelect1.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/IVFUtilsSelect2.o: impl/IVFUtilsSelect2.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/ConversionOperators.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/MathOperators.cuh \
......@@ -304,8 +320,9 @@ impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/MathOperators.cuh \
impl/../utils/Pair.cuh impl/../utils/WarpShuffles.cuh \
......@@ -317,8 +334,9 @@ impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/MergeNetworkWarp.cuh
impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/BroadcastSum.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
......@@ -329,8 +347,9 @@ impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassNoPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../GpuResources.h \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/../../FaissAssert.h impl/../GpuResources.h \
impl/../utils/DeviceMemory.h impl/PQCodeDistances.cuh \
impl/../utils/NoTypeTensor.cuh impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
......@@ -342,8 +361,9 @@ impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
......@@ -352,33 +372,36 @@ impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/../utils/MathOperators.cuh impl/../utils/StaticUtils.h
impl/VectorResidual.o: impl/VectorResidual.cu impl/VectorResidual.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/Float16.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/../../FaissAssert.h impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../../FaissAssert.h \
impl/../utils/ConversionOperators.cuh impl/../utils/StaticUtils.h
GpuIndex.cu/GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
./GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h ../FaissAssert.h \
GpuResources.h utils/DeviceMemory.h utils/DeviceUtils.h
GpuIndexFlat.cu/GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
./GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../IndexFlat.h ../Index.h GpuResources.h \
utils/DeviceMemory.h impl/FlatIndex.cuh impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
utils/CopyUtils.cuh utils/HostTensor.cuh utils/HostTensor-inl.cuh
GpuIndexIVF.cu/GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
./GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../FaissAssert.h ../IndexFlat.h ../IndexIVF.h \
../Clustering.h ../Heap.h utils/DeviceUtils.h utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
utils/DeviceTensor-inl.cuh
GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
./GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVF.h ../Clustering.h \
......@@ -387,9 +410,10 @@ GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIn
impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh utils/CopyUtils.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh utils/Float16.cuh
GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
./GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVFPQ.h ../IndexIVF.h \
......@@ -399,19 +423,22 @@ GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h
impl/../utils/DeviceVector.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceUtils.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissAssert.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh utils/CopyUtils.cuh utils/HostTensor.cuh \
utils/HostTensor-inl.cuh
utils/Float16.o: utils/Float16.cu utils/Float16.cuh utils/../GpuResources.h \
utils/../utils/DeviceMemory.h utils/DeviceTensor.cuh utils/Tensor.cuh \
utils/Tensor-inl.cuh utils/../../FaissAssert.h \
utils/../../FaissException.h utils/DeviceUtils.h utils/MemorySpace.h \
utils/DeviceTensor-inl.cuh utils/nvidia/fp16_emu.cuh
utils/Tensor-inl.cuh utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
utils/../../FaissException.h utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
utils/nvidia/fp16_emu.cuh
utils/MatrixMult.o: utils/MatrixMult.cu utils/MatrixMult.cuh utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
utils/../../FaissAssert.h utils/../../FaissException.h \
utils/DeviceUtils.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
utils/../../FaissException.h utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh
utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/BlockSelectImpl.cuh \
......@@ -420,9 +447,12 @@ utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -442,9 +472,12 @@ utils/BlockSelectHalf.o: utils/BlockSelectHalf.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -464,9 +497,12 @@ utils/WarpSelectFloat.o: utils/WarpSelectFloat.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -485,9 +521,12 @@ utils/WarpSelectHalf.o: utils/WarpSelectHalf.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -507,9 +546,12 @@ utils/blockselect/BlockSelectHalf1.o: utils/blockselect/BlockSelectHalf1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -529,9 +571,12 @@ utils/blockselect/BlockSelectFloat1.o: utils/blockselect/BlockSelectFloat1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -551,9 +596,12 @@ utils/warpselect/WarpSelectHalf1.o: utils/warpselect/WarpSelectHalf1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -572,9 +620,12 @@ utils/warpselect/WarpSelectFloat1.o: utils/warpselect/WarpSelectFloat1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -593,9 +644,12 @@ utils/blockselect/BlockSelectHalf32.o: utils/blockselect/BlockSelectHalf32.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -615,9 +669,12 @@ utils/blockselect/BlockSelectFloat32.o: utils/blockselect/BlockSelectFloat32.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -637,9 +694,12 @@ utils/warpselect/WarpSelectHalf32.o: utils/warpselect/WarpSelectHalf32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -658,9 +718,12 @@ utils/warpselect/WarpSelectFloat32.o: utils/warpselect/WarpSelectFloat32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -679,9 +742,12 @@ utils/blockselect/BlockSelectHalf64.o: utils/blockselect/BlockSelectHalf64.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -701,9 +767,12 @@ utils/blockselect/BlockSelectFloat64.o: utils/blockselect/BlockSelectFloat64.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -723,9 +792,12 @@ utils/warpselect/WarpSelectHalf64.o: utils/warpselect/WarpSelectHalf64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -744,9 +816,12 @@ utils/warpselect/WarpSelectFloat64.o: utils/warpselect/WarpSelectFloat64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -765,9 +840,12 @@ utils/blockselect/BlockSelectHalf128.o: utils/blockselect/BlockSelectHalf128.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -787,9 +865,12 @@ utils/blockselect/BlockSelectFloat128.o: utils/blockselect/BlockSelectFloat128.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -809,9 +890,12 @@ utils/warpselect/WarpSelectHalf128.o: utils/warpselect/WarpSelectHalf128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -830,9 +914,12 @@ utils/warpselect/WarpSelectFloat128.o: utils/warpselect/WarpSelectFloat128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -851,9 +938,12 @@ utils/blockselect/BlockSelectHalf256.o: utils/blockselect/BlockSelectHalf256.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -873,9 +963,12 @@ utils/blockselect/BlockSelectFloat256.o: utils/blockselect/BlockSelectFloat256.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -895,9 +988,12 @@ utils/warpselect/WarpSelectHalf256.o: utils/warpselect/WarpSelectHalf256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -916,9 +1012,12 @@ utils/warpselect/WarpSelectFloat256.o: utils/warpselect/WarpSelectFloat256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -937,9 +1036,12 @@ utils/blockselect/BlockSelectHalfF512.o: utils/blockselect/BlockSelectHalfF512.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -959,9 +1061,12 @@ utils/blockselect/BlockSelectFloatF512.o: utils/blockselect/BlockSelectFloatF512
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -981,9 +1086,12 @@ utils/warpselect/WarpSelectHalfF512.o: utils/warpselect/WarpSelectHalfF512.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1002,9 +1110,12 @@ utils/warpselect/WarpSelectFloatF512.o: utils/warpselect/WarpSelectFloatF512.cu
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1023,9 +1134,12 @@ utils/blockselect/BlockSelectHalfT512.o: utils/blockselect/BlockSelectHalfT512.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1045,9 +1159,12 @@ utils/blockselect/BlockSelectFloatT512.o: utils/blockselect/BlockSelectFloatT512
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1067,9 +1184,12 @@ utils/warpselect/WarpSelectHalfT512.o: utils/warpselect/WarpSelectHalfT512.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1088,9 +1208,12 @@ utils/warpselect/WarpSelectFloatT512.o: utils/warpselect/WarpSelectFloatT512.cu
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1109,9 +1232,12 @@ utils/blockselect/BlockSelectHalfF1024.o: utils/blockselect/BlockSelectHalfF1024
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1131,9 +1257,12 @@ utils/blockselect/BlockSelectFloatF1024.o: utils/blockselect/BlockSelectFloatF10
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1153,9 +1282,12 @@ utils/warpselect/WarpSelectHalfF1024.o: utils/warpselect/WarpSelectHalfF1024.cu
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1174,9 +1306,12 @@ utils/warpselect/WarpSelectFloatF1024.o: utils/warpselect/WarpSelectFloatF1024.c
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1195,9 +1330,12 @@ utils/blockselect/BlockSelectHalfT1024.o: utils/blockselect/BlockSelectHalfT1024
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1217,9 +1355,12 @@ utils/blockselect/BlockSelectFloatT1024.o: utils/blockselect/BlockSelectFloatT10
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
......@@ -1239,9 +1380,12 @@ utils/warpselect/WarpSelectHalfT1024.o: utils/warpselect/WarpSelectHalfT1024.cu
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......@@ -1260,9 +1404,12 @@ utils/warpselect/WarpSelectFloatT1024.o: utils/warpselect/WarpSelectFloatT1024.c
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
......
......@@ -29,44 +29,107 @@ namespace faiss { namespace gpu {
namespace {
constexpr int kDefaultTileSize = 256;
template <typename T>
Tensor<T, 2, true> sliceCentroids(Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
int startCentroid,
int num) {
if (startCentroid == 0 && num == centroids.getSize(0)) {
if (centroidsTransposed) {
return *centroidsTransposed;
} else {
return centroids;
}
}
if (centroidsTransposed) {
// (dim, num)
return centroidsTransposed->narrow(1, startCentroid, num);
} else {
return centroids.narrow(0, startCentroid, num);
}
}
// For each chunk of k indices, increment the index by chunk * increment
template <typename T>
int chooseTileSize(int tileSizeOverride,
size_t numCentroids,
size_t tempMemAvailable) {
if (tileSizeOverride > 0) {
return tileSizeOverride;
__global__ void incrementIndex(Tensor<T, 2, true> indices,
int k,
int increment) {
for (int i = threadIdx.x; i < k; i += blockDim.x) {
indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment;
}
}
size_t tileSize =
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;
// Used to update result indices in distance computation where the number of
// centroids is large and the computation is tiled along them
template <typename T>
void runIncrementIndex(Tensor<T, 2, true>& indices,
int k,
int increment,
cudaStream_t stream) {
dim3 grid(indices.getSize(1) / k, indices.getSize(0));
int block = std::min(k, 512);
while (tileSize > 64) {
size_t memRequirement = 2 * tileSize * numCentroids * sizeof(T);
// should be exact
FAISS_ASSERT(grid.x * k == indices.getSize(1));
if (memRequirement <= tempMemAvailable) {
// This fits entirely into our temporary memory
return tileSize;
}
incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);
// Otherwise, halve the tile size
tileSize /= 2;
cudaDeviceSynchronize();
}
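A CPU analogue may make the fix-up clearer. This is a minimal sketch, not Faiss code: after each column tile contributes k tile-relative indices per query row, chunk c of k entries must be shifted by c * increment, where increment is the tile width in centroids.

// Minimal CPU sketch (illustrative only, not part of Faiss) of the index
// fix-up performed by incrementIndex: chunk c of k indices was produced
// against centroid tile c, so add c * increment to make the indices global.
#include <cstddef>
#include <vector>

void incrementIndexCpu(std::vector<int>& rowIndices, int k, int increment) {
  size_t numChunks = rowIndices.size() / k;
  for (size_t c = 0; c < numChunks; ++c) {
    for (int i = 0; i < k; ++i) {
      rowIndices[c * k + i] += static_cast<int>(c) * increment;
    }
  }
}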
// If the inner size (dim) of the vectors is small, we want a larger query tile
// size, like 1024
void chooseTileSize(int numQueries,
int numCentroids,
int dim,
int elementSize,
size_t tempMemAvailable,
int& tileRows,
int& tileCols) {
// The matrix multiplication should be large enough to be efficient, but if it
// is too large, we seem to lose efficiency as opposed to double-streaming.
// Each tile size here defines 1/2 of the memory use due to double streaming.
// We ignore available temporary memory, as that is adjusted independently by
// the user and can thus meet these requirements (or not).
// For <= 4 GB GPUs, prefer 512 MB of usage.
// For <= 8 GB GPUs, prefer 768 MB of usage.
// Otherwise, prefer 1 GB of usage.
auto totalMem = getCurrentDeviceProperties().totalGlobalMem;
int targetUsage = 0;
if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
targetUsage = 512 * 1024 * 1024;
} else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
targetUsage = 768 * 1024 * 1024;
} else {
targetUsage = 1024 * 1024 * 1024;
}
// We use 64 as the minimum acceptable tile size
FAISS_ASSERT(tileSize >= 64);
targetUsage /= 2 * elementSize;
// FIXME: if we're running with no available temp memory, do we try
// and go larger based on free memory available on the device?
// 512 seems to be a batch size sweet spot for float32, and works for
// float16 as well.
// If the k size (vec dim) of the matrix multiplication is small (<= 32),
// increase to 1024.
int preferredTileRows = 512;
if (dim <= 32) {
preferredTileRows = 1024;
}
return tileSize;
tileRows = std::min(preferredTileRows, numQueries);
// tileCols is the remainder size
tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
}
}
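To make the heuristic concrete, here is the arithmetic for one hypothetical configuration (a 16 GB GPU, float32 data, dim = 128, 100k queries, 1M centroids); the numbers are illustrative, not prescriptive:

// Worked example of the tile-size heuristic above. All inputs are assumed
// values for illustration only.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  size_t totalMem = 16ull * 1024 * 1024 * 1024;        // > 8 GB: prefer 1 GB
  int targetUsage =
      totalMem <= 4ull * 1024 * 1024 * 1024 ? 512 * 1024 * 1024 :
      totalMem <= 8ull * 1024 * 1024 * 1024 ? 768 * 1024 * 1024 :
                                              1024 * 1024 * 1024;
  int elementSize = 4;                                 // float32
  targetUsage /= 2 * elementSize;                      // two tiles in flight
  int dim = 128;                                       // > 32: 512 query rows
  int preferredTileRows = (dim <= 32) ? 1024 : 512;
  int numQueries = 100000, numCentroids = 1000000;
  int tileRows = std::min(preferredTileRows, numQueries);                // 512
  int tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
  std::printf("tile: %d queries x %d centroids\n", tileRows, tileCols);
  // prints "tile: 512 queries x 262144 centroids"
  return 0;
}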
template <typename T>
void runL2Distance(GpuResources* resources,
void runDistance(bool computeL2,
GpuResources* resources,
Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms,
......@@ -75,8 +138,7 @@ void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
bool ignoreOutDistances = false,
int tileSizeOverride = -1) {
bool ignoreOutDistances) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outDistances.getSize(1) == k);
......@@ -98,9 +160,9 @@ void runL2Distance(GpuResources* resources,
return;
}
// If ||c||^2 is not pre-computed, calculate it
// L2: If ||c||^2 is not pre-computed, calculate it
DeviceTensor<T, 1, true> cNorms;
if (!centroidNorms) {
if (computeL2 && !centroidNorms) {
cNorms = std::move(DeviceTensor<T, 1, true>(
mem,
{centroids.getSize(0)}, defaultStream));
......@@ -115,68 +177,111 @@ void runL2Distance(GpuResources* resources,
DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);
// ||q||^2
if (computeL2) {
runL2Norm(queries, queryNorms, true, defaultStream);
}
//
// Handle the problem in row tiles, to avoid excessive temporary
// memory requests
//
// By default, aim to use up to 512 MB of memory for the processing, with both
// the number of queries and the number of centroids being at least 512.
int tileRows = 0;
int tileCols = 0;
chooseTileSize(queries.getSize(0),
centroids.getSize(0),
queries.getSize(1),
sizeof(T),
mem.getSizeAvailable(),
tileRows,
tileCols);
int numColTiles = utils::divUp(centroids.getSize(0), tileCols);
FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
DeviceTensor<T, 2, true> outDistanceBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true> outDistanceBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true>* outDistanceBufs[2] =
{&outDistanceBuf1, &outDistanceBuf2};
DeviceTensor<int, 2, true> outIndexBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true> outIndexBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true>* outIndexBufs[2] =
{&outIndexBuf1, &outIndexBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
int numQueriesForIteration = std::min(maxQueriesPerIteration,
queries.getSize(0) - i);
// Tile over the input queries
for (int i = 0; i < queries.getSize(0); i += tileRows) {
int curQuerySize = std::min(tileRows, queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration);
outDistances.narrow(0, i, curQuerySize);
auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration);
outIndices.narrow(0, i, curQuerySize);
auto queryView =
queries.narrow(0, i, curQuerySize);
auto queryNormView =
queryNorms.narrowOutermost(i, numQueriesForIteration);
queryNorms.narrow(0, i, curQuerySize);
auto outDistanceBufRowView =
outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
auto outIndexBufRowView =
outIndexBufs[curStream]->narrow(0, 0, curQuerySize);
// Tile over the centroids
for (int j = 0; j < centroids.getSize(0); j += tileCols) {
int curCentroidSize = std::min(tileCols, centroids.getSize(0) - j);
int curColTile = j / tileCols;
// L2 distance is ||c||^2 - 2qc + ||q||^2
auto centroidsView =
sliceCentroids(centroids, centroidsTransposed, j, curCentroidSize);
// -2qc
auto distanceBufView = distanceBufs[curStream]->
narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);
auto outDistanceBufColView =
outDistanceBufRowView.narrow(1, k * curColTile, k);
auto outIndexBufColView =
outIndexBufRowView.narrow(1, k * curColTile, k);
// L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
// IP: just compute qc
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false,
queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids,
centroidsView,
centroidsTransposed ? false : true,
-2.0f, 0.0f, useHgemm,
computeL2 ? -2.0f : 1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
if (computeL2) {
// For L2 distance, we use this fused kernel that performs both
// adding ||c||^2 to -2qc and k-selection, so we only need two
// passes (one write by the gemm, one read here) over the huge
// region of output memory
//
// If we aren't tiling along the number of centroids, we can perform the
// output work directly
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runL2SelectMin(distanceBufView,
*centroidNorms,
outDistanceView,
......@@ -189,6 +294,57 @@ void runL2Distance(GpuResources* resources,
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView, outDistanceView, streams[curStream]);
}
} else {
auto centroidNormsView =
centroidNorms->narrow(0, j, curCentroidSize);
// Write into our intermediate output
runL2SelectMin(distanceBufView,
centroidNormsView,
outDistanceBufColView,
outIndexBufColView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView,
outDistanceBufColView,
streams[curStream]);
}
}
} else {
// For IP, just k-select the output for this tile
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
} else {
// Write into the intermediate output
runBlockSelect(distanceBufView,
outDistanceBufColView,
outIndexBufColView,
true, k, streams[curStream]);
}
}
}
// As we're finished with processing a full set of centroids, perform the
// final k-selection
if (tileCols != centroids.getSize(0)) {
// The indices are tile-relative; for the j-th chunk of k indices, we need
// to add j * tileCols to each index
runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);
runBlockSelectPair(outDistanceBufRowView,
outIndexBufRowView,
outDistanceView,
outIndexView,
computeL2 ? false : true, k, streams[curStream]);
}
curStream = (curStream + 1) % 2;
}
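The per-row merge that runBlockSelectPair performs can be pictured on the CPU as follows; this is an illustrative sketch, not the GPU implementation:

// CPU sketch of the final merge: each of numColTiles column tiles contributed
// k (distance, index) candidates per query row; keep the best k overall.
// Ascending order for L2, descending for inner product. Assumes
// cand.size() >= k.
#include <algorithm>
#include <utility>
#include <vector>

std::vector<std::pair<float, int>>
mergeTileCandidates(std::vector<std::pair<float, int>> cand,
                    int k, bool smallestFirst) {
  std::partial_sort(cand.begin(), cand.begin() + k, cand.end(),
                    [smallestFirst](const std::pair<float, int>& a,
                                    const std::pair<float, int>& b) {
                      return smallestFirst ? a.first < b.first
                                           : a.first > b.first;
                    });
  cand.resize(k);
  return cand;
}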
......@@ -198,98 +354,49 @@ void runL2Distance(GpuResources* resources,
}
template <typename T>
void runIPDistance(GpuResources* resources,
void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms,
Tensor<T, 2, true>& queries,
int k,
Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
int tileSizeOverride = -1) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outDistances.getSize(1) == k);
FAISS_ASSERT(outIndices.getSize(1) == k);
auto& mem = resources->getMemoryManagerCurrentDevice();
auto defaultStream = resources->getDefaultStreamCurrentDevice();
// If we're querying against a 0-sized set, just return empty results
if (centroids.numElements() == 0) {
thrust::fill(thrust::cuda::par.on(defaultStream),
outDistances.data(), outDistances.end(),
Limits<T>::getMax());
thrust::fill(thrust::cuda::par.on(defaultStream),
outIndices.data(), outIndices.end(),
-1);
return;
}
//
// Handle the problem in row tiles, to avoid excessive temporary
// memory requests
//
FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
int numQueriesForIteration = std::min(maxQueriesPerIteration,
queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration);
auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration);
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false,
queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids,
centroidsTransposed ? false : true,
1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
// top-k of dot products
// (query id, top k centroids)
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
curStream = (curStream + 1) % 2;
}
bool ignoreOutDistances = false) {
runDistance<T>(true, // L2
resources,
centroids,
centroidsTransposed,
centroidNorms,
queries,
k,
outDistances,
outIndices,
useHgemm,
ignoreOutDistances);
}
streamWait({defaultStream}, streams);
template <typename T>
void runIPDistance(GpuResources* resources,
Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 2, true>& queries,
int k,
Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm) {
runDistance<T>(false, // IP
resources,
centroids,
centroidsTransposed,
nullptr,
queries,
k,
outDistances,
outIndices,
useHgemm,
false);
}
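As a sanity check on the algebra both wrappers rely on: for L2, only the cross term needs the GEMM, since ||q - c||^2 = ||c||^2 - 2 q.c + ||q||^2. A tiny CPU verification, illustrative only:

// Check that a single GEMM producing -2*q.c plus precomputed norms
// reproduces the exact L2 distance.
#include <cassert>
#include <cmath>

int main() {
  float q[3] = {1, 2, 3}, c[3] = {4, 5, 6};
  float qq = 0, cc = 0, qc = 0, direct = 0;
  for (int i = 0; i < 3; ++i) {
    qq += q[i] * q[i];
    cc += c[i] * c[i];
    qc += q[i] * c[i];
    direct += (q[i] - c[i]) * (q[i] - c[i]);
  }
  assert(std::fabs(direct - (cc - 2 * qc + qq)) < 1e-4f);
  return 0;
}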
//
......@@ -303,8 +410,7 @@ runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
int tileSizeOverride) {
Tensor<int, 2, true>& outIndices) {
runIPDistance<float>(resources,
vectors,
vectorsTransposed,
......@@ -312,8 +418,7 @@ runIPDistance(GpuResources* resources,
k,
outDistances,
outIndices,
false,
tileSizeOverride);
false);
}
#ifdef FAISS_USE_FLOAT16
......@@ -325,8 +430,7 @@ runIPDistance(GpuResources* resources,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
int tileSizeOverride) {
bool useHgemm) {
runIPDistance<half>(resources,
vectors,
vectorsTransposed,
......@@ -334,8 +438,7 @@ runIPDistance(GpuResources* resources,
k,
outDistances,
outIndices,
useHgemm,
tileSizeOverride);
useHgemm);
}
#endif
......@@ -348,8 +451,7 @@ runL2Distance(GpuResources* resources,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool ignoreOutDistances,
int tileSizeOverride) {
bool ignoreOutDistances) {
runL2Distance<float>(resources,
vectors,
vectorsTransposed,
......@@ -359,8 +461,7 @@ runL2Distance(GpuResources* resources,
outDistances,
outIndices,
false,
ignoreOutDistances,
tileSizeOverride);
ignoreOutDistances);
}
#ifdef FAISS_USE_FLOAT16
......@@ -374,8 +475,7 @@ runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
bool ignoreOutDistances,
int tileSizeOverride) {
bool ignoreOutDistances) {
runL2Distance<half>(resources,
vectors,
vectorsTransposed,
......@@ -385,8 +485,7 @@ runL2Distance(GpuResources* resources,
outDistances,
outIndices,
useHgemm,
ignoreOutDistances,
tileSizeOverride);
ignoreOutDistances);
}
#endif
......
......@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can
// take shortcuts.
bool ignoreOutDistances = false,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
......@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
Tensor<int, 2, true>& outIndices);
#ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources,
......@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
int tileSizeOverride = -1);
bool useHgemm);
void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors,
......@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
bool ignoreOutDistances = false,
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
#endif
} } // namespace
......@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
......@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) {
// Convert outDistances back
......@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances,
outIndices,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectors_,
......@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input,
k,
outDistances,
outIndices,
tileSize);
outIndices);
}
}
}
......@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
if (l2Distance_) {
......@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices,
useFloat16Accumulator_,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectorsHalf_,
......@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k,
outDistances,
outIndices,
useFloat16Accumulator_,
tileSize);
useFloat16Accumulator_);
}
}
#endif
......@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream);
stream,
true /* reserve exactly */);
#endif
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream);
stream,
true /* reserve exactly */);
}
num_ += numVecs;
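The new "reserve exactly" flag is passed because add() knows the final payload size up front. A hedged sketch of the distinction (the real Faiss device vector differs):

// When the final size is known, request exactly that capacity instead of
// letting geometric growth over-allocate memory.
#include <cstddef>
#include <vector>

void append(std::vector<char>& buf, const char* data, size_t bytes,
            bool reserveExactly) {
  if (reserveExactly) {
    buf.reserve(buf.size() + bytes); // exact capacity, no growth slack
  }
  buf.insert(buf.end(), data, data + bytes);
}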
......
......@@ -61,16 +61,14 @@ class FlatIndex {
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#endif
/// Add vectors to ourselves; the pointer passed can be on the host
......
......@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView,
closestSubQIndexView,
// We don't care about distances
true,
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
true);
}
// Now, we have the nearest sub-q centroid for each slice of the
......
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
......@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
......
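The replacement of the local kMin/kMax constants is worth a note: std::numeric_limits<float>::min() is the smallest positive normal value, not the most negative float, so it is the wrong sentinel for initializing a descending selection heap. Assuming kFloatMin in Limits.cuh equals -FLT_MAX, the new constants fix that. A quick host-side illustration:

// numeric_limits<float>::min() is a tiny positive number, so any negative
// distance would never displace it in a "select largest" heap.
#include <cfloat>
#include <cstdio>
#include <limits>

int main() {
  std::printf("numeric_limits<float>::min()    = %g\n",
              std::numeric_limits<float>::min());     // ~1.17549e-38
  std::printf("numeric_limits<float>::lowest() = %g\n",
              std::numeric_limits<float>::lowest());  // ~-3.40282e+38
  std::printf("-FLT_MAX                        = %g\n", (double) -FLT_MAX);
  return 0;
}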
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step
// and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
......@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
......
......@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2)
template <typename T, typename TVec,
template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input,
Tensor<T, 1, true> output) {
__global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize;
TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
TIndex laneId = getLaneId();
TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x;
TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize];
if (lastRowTile) {
// We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) {
for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) {
rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
......@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero();
}
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll
for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col];
......@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
}
}
template <typename T, typename TVec>
void runL2Norm(Tensor<T, 2, true>& input,
Tensor<T, 1, true>& output,
template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true, TIndex>& output,
bool normSquared,
cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice();
TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \
if (normLoop) { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} else { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} \
......@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
// Can load using the vectorized type
auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1);
auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else {
// Can't load using the vectorized type
int dim = input.getSize(1);
auto dim = input.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
}
#ifdef FAISS_USE_FLOAT16
......@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
}
#endif
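The int/long dispatch added above exists because a 32-bit linear offset overflows once rows * cols exceeds INT_MAX (for example, around 3M vectors of dim 1024). A minimal sketch of the predicate, assuming canUseIndexType checks exactly this:

// Sketch of the 32-bit index predicate (assumption: canUseIndexType<int>()
// tests that every linear element offset fits in a signed 32-bit integer).
#include <climits>
#include <cstddef>

bool fitsIntIndexing(size_t rows, size_t cols) {
  // avoid overflow in rows * cols by dividing instead of multiplying
  return cols == 0 || rows <= (size_t) INT_MAX / cols;
}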
......
......@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32");
DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu;
......@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config;
config.device = dev;
config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
......@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
if (FLAGS_cpu) {
float cpuTime = 0.0f;
{
CpuTimer timer;
index->search(numQueries,
cpuQuery.data(),
......@@ -101,9 +107,8 @@ int main(int argc, char** argv) {
cpuIndices.data());
cpuTime = timer.elapsedMilliseconds();
}
printf("CPU time %.3f ms\n", cpuTime);
}
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
......@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0;
}
......@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2,
bool useFloat16,
bool useTransposed,
int kOverride = -1) {
int numVecs = faiss::gpu::randVal(1000, 20000);
struct TestFlatOptions {
TestFlatOptions()
: useL2(true),
useFloat16(false),
useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512);
int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16
int k = useFloat16 ?
int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) {
k = kOverride;
if (opt.kOverride > 0) {
k = opt.kOverride;
}
faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP;
opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have
// multiple devices
......@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config;
config.device = device;
config.useFloat16 = useFloat16;
config.storeTransposed = useTransposed;
config.useFloat16 = opt.useFloat16;
config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
......@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data());
std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs
str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim
<< " useFloat16 " << useFloat16
<< " transposed " << useTransposed
<< " useFloat16 " << opt.useFloat16
<< " transposed " << opt.useTransposed
<< " numQuery " << numQuery
<< " k " << k;
// To some extent, we depend upon the relative error for the test
// for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f);
opt.useFloat16 ? 0.99f : 0.1f,
opt.useFloat16 ? 0.65f : 0.015f);
}
TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
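// Back-of-envelope for the tiling test above (illustrative arithmetic only):
// with numVecsOverride = 1,000,000 and dim drawn from [50, 800], the database
// alone occupies 1e6 * dim * 4 bytes = 0.2 .. 3.2 GB, so the brute-force
// distance pass has to proceed in tiles over the vector set. This is the path
// exercised by the int/long index dispatch added to runL2Norm in this same
// commit.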
......
......@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h>
#include <glog/logging.h>
#include <sstream>
......@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false);
}
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on the positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
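// What the Float32_negative test above pins down: with METRIC_INNER_PRODUCT,
// every database vector is component-wise <= 0 and every query is >= 0, so
// all similarities are <= 0. That exercises k-selection with strictly
// negative scores, the case addressed by the Limits<float> lowest() change
// later in this commit.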
//
// NaN tests
//
......
......@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v;
}
void compareIndices(faiss::Index& refIndex,
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare
std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(),
refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(),
testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(),
......@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN);
}
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j];
......
......@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity
/// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
......
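// Usage sketch for the new query-supplying overload declared above; the
// query-building helper is hypothetical, and the thresholds mirror the
// float32 defaults used elsewhere in these tests.
//
//   auto queries = makeAdversarialQueries(numQuery, dim); // hypothetical
//   faiss::gpu::compareIndices(queries, cpuIndex, gpuIndex,
//                              numQuery, dim, k, "custom-queries",
//                              kF32MaxRelErr, 0.1f, 0.015f);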
......@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k)
assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I
......
......@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
}
}
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace
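// The dispatch above rounds k up to the smallest instantiated warp-queue
// size; kernels exist only for these powers of two (plus k == 1), since
// NumWarpQ is a compile-time template parameter. A sketch of the rounding
// rule in isolation (nextQueueSize is illustrative, not a FAISS helper):
inline int nextQueueSize(int k) {
  for (int q : {1, 32, 64, 128, 256, 512, 1024}) {
    if (k <= q) {
      return q;
    }
  }
  return -1; // k > 1024 is not supported by block select
}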
......@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
}
}
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif
} } // namespace
......@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
}
}
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle the last, partial warp of remaining elements
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
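// Worked example of the warp-coalesced loop bound in blockSelectPair,
// assuming kWarpSize == 32 (values illustrative): with inK.getSize(1) == 1000
// and ThreadsPerBlock == 128, limit = roundDown(1000, 32) = 992. Threads
// sweep [0, 992) in full warps via heap.add(), so every lane participates in
// the warp-synchronous heap operations; the ragged tail [992, 1000) is then
// handled per-thread via heap.addThreadQ() before the final reduce().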
void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif
} } // namespace
......@@ -12,37 +12,37 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
this->operator=(std::move(t));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_));
}
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner;
......@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_));
......@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
const IndexT sizes[Dim],
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
std::initializer_list<IndexT> sizes,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner),
space_(space) {
......@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation),
space_(space) {
......@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero(
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) {
if (this->data_) {
// Region must be contiguous
......
......@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor();
/// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it
/// locally
......@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream);
private:
......
......@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
}
}
cudaDeviceProp& getDeviceProperties(int device) {
const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties;
......@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second;
}
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock;
}
......
......@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device);
const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU
/// device
......
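/// Usage sketch for the new accessor (illustrative only): launch sizing that
/// previously spelled out getDeviceProperties(getCurrentDevice()) can read
/// the cached properties of the current device directly, e.g.
///   int maxThreads = getCurrentDeviceProperties().maxThreadsPerBlock;
///   int numThreads = std::min(dim, maxThreads);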
......@@ -10,18 +10,18 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_;
......@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) {
// Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous());
......@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
}
/// Call to zero out memory
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous
FAISS_ASSERT(this->isContiguous());
......@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements();
FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0);
if (Contig) {
if (InnerContig) {
auto a = this->data();
auto b = t.data();
......
......@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream);
/// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero();
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors
__host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const;
maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal?
__host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0);
}
......
......@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <>
struct Limits<float> {
static __device__ __host__ inline float getMin() {
return -kFloatMax;
return kFloatMin;
}
static __device__ __host__ inline float getMax() {
return kFloatMax;
......@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
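// Why lowest() rather than min() here: for integers the two coincide, but for
// float, min() is the smallest *positive* normal value, not the most negative
// finite one. Both facts are checkable at compile time (illustrative):
static_assert(std::numeric_limits<int>::lowest()
              == std::numeric_limits<int>::min(),
              "identical for integer types");
static_assert(std::numeric_limits<float>::min() > 0.0f,
              "min() is tiny-positive, not a 'most negative' sentinel");
static_assert(std::numeric_limits<float>::lowest()
              == -std::numeric_limits<float>::max(),
              "lowest() == -max() is the correct float sentinel");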
template <>
struct Limits<int> {
......
......@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in
// column-major layout
T* pA = transC ? a.data() : b.data();
......@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1);
int ldb = transC ? b.getSize(1) : a.getSize(1);
int ldc = c.getSize(1);
int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2);
int ldb = transC ? b.getSize(2) : a.getSize(2);
int ldc = c.getSize(2);
int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2);
size_t bOffset = b.getSize(1) * b.getSize(2);
size_t cOffset = c.getSize(1) * c.getSize(2);
size_t aOffset = a.getStride(0);
size_t bOffset = b.getStride(0);
size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
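// Why the leading dimensions above now come from strides rather than sizes
// (illustrative values): a row-major tile produced by narrow() keeps its
// parent's row pitch. For a parent of 8 x 100 floats (stride(0) == 100), the
// sub-matrix of columns [0, 50) has getSize(1) == 50 but getStride(0) == 100;
// cuBLAS must be given ld == 100, the allocated pitch, or every row after the
// first is read from the wrong offset. The same reasoning makes the batch
// offsets aOffset/bOffset/cOffset equal to getStride(0) for batched GEMM.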
......@@ -16,7 +16,7 @@
namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int>
template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor {
public:
NoTypeTensor()
......@@ -25,7 +25,7 @@ class NoTypeTensor {
}
template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t)
NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()),
typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) {
......@@ -87,13 +87,14 @@ class NoTypeTensor {
}
template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() {
Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_);
return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
}
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) {
NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_;
if (start > 0) {
......@@ -110,7 +111,7 @@ class NoTypeTensor {
}
}
return NoTypeTensor<Dim, Contig, IndexT>(
return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_);
}
......
......@@ -8,16 +8,16 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "../../FaissAssert.h"
#include "../GpuFaissAssert.h"
#include "DeviceUtils.h"
#include <limits>
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor()
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
: data_(nullptr) {
static_assert(Dim > 0, "must have > 0 dimensions");
......@@ -27,12 +27,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor()
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>&
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t) {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
data_ = t.data_; t.data_ = nullptr;
for (int i = 0; i < Dim; ++i) {
stride_[i] = t.stride_[i]; t.stride_[i] = 0;
......@@ -42,10 +42,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, const IndexT sizes[Dim])
: data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions");
......@@ -60,13 +60,13 @@ Tensor(DataPtrType data, const IndexT sizes[Dim])
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
: data_(data) {
assert(sizes.size() == Dim);
GPU_FAISS_ASSERT(sizes.size() == Dim);
static_assert(Dim > 0, "must have > 0 dimensions");
int i = 0;
......@@ -81,10 +81,10 @@ Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
: data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions");
......@@ -95,22 +95,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor(
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyFrom(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled");
// The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and
// contiguity is assumed, we need only check the total number of
// elements)
FAISS_ASSERT(this->numElements() == t.numElements());
GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) {
FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data());
GPU_FAISS_ASSERT(this->data_);
GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data());
......@@ -133,22 +134,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom(
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyTo(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled");
// The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and
// contiguity is assumed, we need only check the total number of
// elements)
FAISS_ASSERT(this->numElements() == t.numElements());
GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) {
FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data());
GPU_FAISS_ASSERT(this->data_);
GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data());
......@@ -171,62 +173,79 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo(
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
template <typename OtherT, int OtherDim>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isSame(
const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSame(
const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) {
if (this->getSize(i) != rhs.getSize(i)) {
return false;
}
if (!Contig) {
if (stride_[i] != rhs.stride_[i]) {
if (this->getStride(i) != rhs.getStride(i)) {
return false;
}
}
return true;
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename OtherT, int OtherDim>
__host__ __device__ bool
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSameSize(
const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (this->getSize(i) != rhs.getSize(i)) {
return false;
}
}
return true;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() {
__host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>(
return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() const {
__host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() const {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>(
return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() {
__host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T);
assert(canCastResize<U>());
GPU_FAISS_ASSERT(canCastResize<U>());
IndexT newSize[Dim];
IndexT newStride[Dim];
......@@ -239,24 +258,24 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() {
newStride[Dim - 1] = 1; // this is the same as the old stride
newSize[Dim - 1] = size_[Dim - 1] / kMultiple;
return Tensor<U, Dim, Contig, IndexT, PtrTraits>(
return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() const {
return const_cast<Tensor<T, Dim, Contig, IndexT, PtrTraits>*>(this)->
__host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() const {
return const_cast<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>*>(this)->
castResize<U>();
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canCastResize() const {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T);
......@@ -279,13 +298,13 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
return true;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const {
__host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castIndexType() const {
if (sizeof(NewIndexT) < sizeof(IndexT)) {
assert(this->canCastIndexType<NewIndexT>());
GPU_FAISS_ASSERT(this->canUseIndexType<NewIndexT>());
}
NewIndexT newSize[Dim];
......@@ -295,15 +314,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const {
newStride[i] = (NewIndexT) stride_[i];
}
return Tensor<T, Dim, Contig, NewIndexT, PtrTraits>(
return Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT>
__host__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canUseIndexType() const {
static_assert(sizeof(size_t) >= sizeof(IndexT),
"index size too large");
static_assert(sizeof(size_t) >= sizeof(NewIndexT),
......@@ -313,16 +332,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
// FIXME: maybe also consider offset in bytes? multiply by sizeof(T)?
size_t maxOffset = 0;
if (Contig) {
maxOffset = (size_t) size_[0] * (size_t) stride_[0];
} else {
for (int i = 0; i < Dim; ++i) {
size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i];
if (curMaxOffset > maxOffset) {
maxOffset = curMaxOffset;
}
}
}
if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) {
return false;
......@@ -331,23 +346,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
return true;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ IndexT
Tensor<T, Dim, Contig, IndexT, PtrTraits>::numElements() const {
long size = getSize(0);
__host__ __device__ size_t
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::numElements() const {
size_t size = (size_t) getSize(0);
for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
size *= (size_t) getSize(i);
}
return size;
}
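// Why numElements() now returns size_t (illustrative arithmetic): the old
// signature accumulated in a long but returned IndexT, truncating for tensors
// past the index type's range. With 32-bit IndexT, a 1,000,000 x 4,096 tensor
// has 4.096e9 elements, beyond INT32_MAX (~2.147e9); accumulating in and
// returning size_t keeps the count exact, and canUseIndexType() above applies
// the same overflow reasoning to offsets before narrowing the index type.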
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguous() const {
long prevSize = 1;
for (int i = Dim - 1; i >= 0; --i) {
......@@ -363,10 +378,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const {
return true;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
......@@ -377,10 +392,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
return false;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) {
return false;
......@@ -390,23 +405,28 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const {
return true;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguousDim(int i) const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguousDim(int i) const {
return (i == Dim - 1) || // just in case
((i < Dim - 1) &&
((getStride(i) / getStride(i + 1)) == getSize(i + 1)));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1,
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const {
assert(dim1 >= 0 && dim1 < Dim);
assert(dim1 >= 0 && dim2 < Dim);
static_assert(!Contig, "cannot transpose contiguous arrays");
GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim);
GPU_FAISS_ASSERT(dim2 >= 0 && dim2 < Dim);
// If a tensor is innermost contiguous, one cannot transpose the innermost
// dimension
if (InnerContig) {
GPU_FAISS_ASSERT(dim1 != Dim - 1 && dim2 != Dim - 1);
}
IndexT newSize[Dim];
IndexT newStride[Dim];
......@@ -424,14 +444,14 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1,
newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp;
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(data_, newSize, newStride);
return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() {
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim");
......@@ -452,15 +472,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() {
}
}
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>(
return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() {
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim");
......@@ -479,15 +499,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() {
}
}
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>(
return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim");
......@@ -497,7 +517,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
// them).
for (int i = 0; i < Dim - NewDim; ++i) {
bool cont = isContiguousDim(i);
assert(cont);
GPU_FAISS_ASSERT(cont);
}
IndexT newSize[NewDim];
......@@ -524,15 +544,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
}
}
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>(
return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim");
......@@ -541,7 +561,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
// in all of the dimensions we are collapsing (no padding in
// them).
for (int i = NewDim; i < Dim; ++i) {
assert(isContiguousDim(i));
GPU_FAISS_ASSERT(isContiguousDim(i));
}
IndexT newSize[NewDim];
......@@ -567,15 +587,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
}
}
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>(
return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) {
__host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(DataPtrType at) {
static_assert(SubDim >= 1 && SubDim < Dim,
"can only create view of lesser dim");
......@@ -587,89 +607,76 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) {
viewStrides[i] = stride_[Dim - SubDim + i];
}
return Tensor<T, SubDim, Contig, IndexT, PtrTraits>(
return Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>(
at, viewSizes, viewStrides);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view() {
__host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view() {
return view<SubDim>(data_);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrowOutermost(IndexT start,
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrowOutermost(IndexT start,
IndexT size) {
DataPtrType newData = data_;
if (start > 0) {
newData += start * stride_[0];
}
IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) {
if (i == 0) {
assert(start + size <= size_[0]);
newSize[i] = size;
} else {
newSize[i] = size_[i];
}
}
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(newData, newSize, stride_);
return this->narrow(0, start, size);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrow(int dim,
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrow(int dim,
IndexT start,
IndexT size) {
DataPtrType newData = data_;
GPU_FAISS_ASSERT(start >= 0 &&
start < size_[dim] &&
(start + size) <= size_[dim]);
if (start > 0) {
newData += start * stride_[dim];
newData += (size_t) start * stride_[dim];
}
IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) {
if (i == dim) {
assert(start + size <= size_[dim]);
GPU_FAISS_ASSERT(start + size <= size_[dim]);
newSize[i] = size;
} else {
newSize[i] = size_[i];
}
}
// The narrowed tensor is not necessarily contiguous
return Tensor<T, Dim, false, IndexT, PtrTraits>(newData, newSize, stride_);
// If we were innermost contiguous before, we are still innermost contiguous
return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(newData, newSize, stride_);
}
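A small usage sketch of the new narrow contract (names and shapes are illustrative): the result keeps the InnerContig template flag instead of decaying to a non-contiguous tensor type as before.

// Hedged sketch: narrowing a [10][32] tensor.
float data[10 * 32];
faiss::gpu::Tensor<float, 2, true> t(data, {10, 32});
auto rows = t.narrow(0, 2, 5);   // rows [2, 7), still InnerContig
auto cols = t.narrow(1, 0, 16);  // innermost stride stays 1, so the
                                 // view remains innermost-contiguous
                                 // (though no longer fully contiguous)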
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(
std::initializer_list<IndexT> sizes) {
static_assert(Contig, "on contiguous tensors only");
GPU_FAISS_ASSERT(this->isContiguous());
assert(sizes.size() == NewDim);
GPU_FAISS_ASSERT(sizes.size() == NewDim);
// The total size of the new view must be the same as the total size
// of the old view
size_t curSize = numElements();
size_t newSize = 1;
for (auto s : sizes) {
newSize *= s;
}
assert(curSize == newSize);
GPU_FAISS_ASSERT(curSize == newSize);
return Tensor<T, NewDim, true, IndexT, PtrTraits>(data(), sizes);
}
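For reference, a reshape sketch that satisfies both asserts above (shapes are illustrative): the tensor must be contiguous and the element counts of the two views must match.

// Hedged sketch: view a contiguous [6][4] tensor as [3][8].
float data[24];
faiss::gpu::Tensor<float, 2, true> t(data, {6, 4});
auto r = t.view<2>({3, 8});      // 24 elements on both sides
// t.view<2>({5, 5}) would trip GPU_FAISS_ASSERT(curSize == newSize)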
......
......@@ -24,7 +24,7 @@ namespace faiss { namespace gpu {
/// Our tensor type
template <typename T,
int Dim,
bool Contig,
bool InnerContig,
typename IndexT,
template <typename U> class PtrTraits>
class Tensor;
......@@ -58,7 +58,7 @@ struct DefaultPtrTraits {
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- If `Contig` is true, then the tensor is assumed to be
- If `InnerContig` is true, then the tensor is assumed to be innermost
- contiguous, and only operations that make sense on contiguous
- arrays are allowed (e.g., no transpose). Strides are still
- calculated, but innermost stride is assumed to be 1.
......@@ -71,7 +71,7 @@ struct DefaultPtrTraits {
*/
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class Tensor {
......@@ -79,28 +79,28 @@ class Tensor {
enum { NumDim = Dim };
typedef T DataType;
typedef IndexT IndexType;
enum { IsContig = Contig };
enum { IsInnerContig = InnerContig };
typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef Tensor<T, Dim, Contig, IndexT, PtrTraits> TensorType;
typedef Tensor<T, Dim, InnerContig, IndexT, PtrTraits> TensorType;
/// Default constructor
__host__ __device__ Tensor();
/// Copy constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t)
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t)
= default;
/// Move constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t)
__host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t)
= default;
/// Assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t) = default;
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) = default;
/// Move assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructor that calculates strides with no padding
__host__ __device__ Tensor(DataPtrType data,
......@@ -116,28 +116,33 @@ class Tensor {
const IndexT strides[Dim]);
/// Copies a tensor into ourselves; sizes must match
__host__ void copyFrom(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ void copyFrom(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream);
/// Copies ourselves into a tensor; sizes must match
__host__ void copyTo(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ void copyTo(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream);
/// Returns true if the two tensors are of the same dimensionality,
/// size and stride.
template <int OtherDim>
template <typename OtherT, int OtherDim>
__host__ __device__ bool
isSame(const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const;
isSame(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Returns true if the two tensors are of the same dimensionality and size
template <typename OtherT, int OtherDim>
__host__ __device__ bool
isSameSize(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and
/// stride. U and our type T must be of the same size
template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> cast();
__host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast();
/// Const version of `cast`
template <typename U>
__host__ __device__
const Tensor<U, Dim, Contig, IndexT, PtrTraits> cast() const;
const Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast() const;
/// Cast to a tensor of a different type which is potentially a
/// different size than our type T. Tensor must be aligned and the
......@@ -146,11 +151,11 @@ class Tensor {
/// must be contiguous. The stride of all outer dimensions must be a
/// multiple of sizeof(U) / sizeof(T) as well.
template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> castResize();
__host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> castResize();
/// Const version of `castResize`
template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits>
__host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
castResize() const;
/// Returns true if we can castResize() this tensor to the new type
......@@ -161,13 +166,13 @@ class Tensor {
/// Fails if size or stride entries are not representable in the new
/// IndexT.
template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits>
__host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
castIndexType() const;
/// Returns true if we can castIndexType() this tensor to the new
/// Returns true if we can use this indexing type to access all elements
/// index type
template <typename NewIndexT>
__host__ bool canCastIndexType() const;
__host__ bool canUseIndexType() const;
/// Returns a raw pointer to the start of our data.
__host__ __device__ inline DataPtrType data() {
......@@ -230,12 +235,12 @@ class Tensor {
/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ IndexT numElements() const;
__host__ __device__ size_t numElements() const;
/// If we are contiguous, returns the total size in bytes of our
/// data
__host__ __device__ size_t getSizeInBytes() const {
return (size_t) numElements() * sizeof(T);
return numElements() * sizeof(T);
}
/// Returns the size array.
......@@ -273,21 +278,21 @@ class Tensor {
/// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts.
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension
......@@ -295,46 +300,45 @@ class Tensor {
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastOuter();
Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the innermost dimensions. asserts if there is
/// padding on the innermost dimensions.
template <int NewDim>
__host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastInner();
Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`.
template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins
template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view();
/// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size)
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
narrowOutermost(IndexT start, IndexT size);
/// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size).
/// Can occur in an arbitrary dimension, and is possibly
/// non-contiguous
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits>
/// Can occur in an arbitrary dimension
__host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
narrow(int dim, IndexT start, IndexT size);
/// Returns a view of the given tensor expressed as a tensor of a
/// different number of dimensions.
/// Only works if we are contiguous.
template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
view(std::initializer_list<IndexT> sizes);
protected:
......@@ -352,21 +356,21 @@ class Tensor {
namespace detail {
template <typename IndexType>
bool canCastIndexType() {
bool canUseIndexType() {
return true;
}
template <typename IndexType, typename T, typename... U>
bool canCastIndexType(const T& arg, const U&... args) {
return arg.canCastIndexType<IndexType>() &&
canCastIndexType(args...);
bool canUseIndexType(const T& arg, const U&... args) {
return arg.canUseIndexType<IndexType>() &&
canUseIndexType(args...);
}
} // namespace detail
template <typename IndexType, typename... T>
bool canCastIndexType(const T&... args) {
return detail::canCastIndexType(args...);
bool canUseIndexType(const T&... args) {
return detail::canUseIndexType(args...);
}
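The variadic wrapper folds the per-tensor check over all arguments; the typical dispatch pattern, sketched below with hypothetical names, is to instantiate a kernel with 32-bit indexing only when every operand is addressable with it.

// Hedged sketch of the intended use (runSomeKernel is hypothetical):
template <typename T, int Dim>
void runSomeKernel(faiss::gpu::Tensor<T, Dim, true>& a,
                   faiss::gpu::Tensor<T, Dim, true>& b) {
  if (faiss::gpu::canUseIndexType<int>(a, b)) {
    // launch the kernel instantiated with IndexT = int
    // (32-bit div and mod are cheaper on the GPU)
  } else {
    // fall back to the IndexT = long instantiation
  }
}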
namespace detail {
......@@ -464,7 +468,7 @@ class SubTensor<TensorType, 0, PtrTraits> {
/// Our parent tensor can create us
friend class Tensor<typename TensorType::DataType,
1,
TensorType::IsContig,
TensorType::IsInnerContig,
typename TensorType::IndexType,
PtrTraits>;
......@@ -493,7 +497,7 @@ class SubTensor {
__host__ __device__ inline
SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) {
if (TensorType::IsContig && SubDim == 1) {
if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index);
......@@ -509,7 +513,7 @@ class SubTensor {
__host__ __device__ inline
const SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const {
if (TensorType::IsContig && SubDim == 1) {
if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index);
......@@ -590,7 +594,7 @@ class SubTensor {
/// of this tensor, starting where our data begins
Tensor<typename TensorType::DataType,
SubDim,
TensorType::IsContig,
TensorType::IsInnerContig,
typename TensorType::IndexType,
PtrTraits> view() {
return tensor_.template view<SubDim>(data_);
......@@ -604,7 +608,7 @@ class SubTensor {
friend class
Tensor<typename TensorType::DataType,
TensorType::NumDim,
TensorType::IsContig,
TensorType::IsInnerContig,
typename TensorType::IndexType,
PtrTraits>;
......@@ -624,23 +628,23 @@ class SubTensor {
} // namespace detail
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline
detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>,
detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline
const detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>,
const detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) const {
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]);
......
......@@ -19,26 +19,26 @@
namespace faiss { namespace gpu {
template <typename T>
template <typename T, typename IndexT>
struct TensorInfo {
static constexpr int kMaxDims = 8;
T* data;
int sizes[kMaxDims];
int strides[kMaxDims];
IndexT sizes[kMaxDims];
IndexT strides[kMaxDims];
int dims;
};
template <typename T, int Dim>
template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info,
unsigned int linearId) {
unsigned int offset = 0;
__device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
IndexT linearId) {
IndexT offset = 0;
#pragma unroll
for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i];
IndexT curDimIndex = linearId % info.sizes[i];
IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
......@@ -51,21 +51,21 @@ struct TensorInfoOffset {
}
};
template <typename T>
struct TensorInfoOffset<T, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info,
unsigned int linearId) {
template <typename T, typename IndexT>
struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
IndexT linearId) {
return linearId;
}
};
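The generic case peels one dimension per iteration, decomposing the linear id with mod/div against the sizes; Dim == -1 is the fast path where the tensor is known contiguous and the linear id is already the offset. A host-side rendition of the 2-d case, assuming the usual `linearId /= info.sizes[i]` step elided by the hunk above:

// Hedged sketch: sizes = {3, 4}, strides = {4, 1} (row-major).
unsigned int offsetFor(unsigned int linearId) {
  unsigned int sizes[2]   = {3, 4};
  unsigned int strides[2] = {4, 1};
  unsigned int offset = 0;
  for (int i = 1; i >= 0; --i) {           // innermost dim first
    offset += (linearId % sizes[i]) * strides[i];
    linearId /= sizes[i];
  }
  return offset;   // equals the input id for this contiguous layout
}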
template <typename T, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info;
template <typename T, typename IndexT, int Dim>
TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i);
info.strides[i] = t.getStride(i);
info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = (IndexT) t.getStride(i);
}
info.data = t.data();
......@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info;
}
template <typename T, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input,
TensorInfo<T> output,
unsigned int totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
if (linearThreadId >= totalSize) {
return;
}
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T, IndexT> output,
IndexT totalSize) {
for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
i += gridDim.x * blockDim.x) {  // grid-stride step over all elements
auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
#if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
output.data[outputOffset] = input.data[inputOffset];
#endif
}
}
/// Performs an out-of-place transposition between any two dimensions.
......@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2,
Tensor<T, Dim, true>& out,
cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");
static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
......@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]);
}
auto inInfo = getTensorInfo<T, Dim>(in);
auto outInfo = getTensorInfo<T, Dim>(out);
size_t totalSize = in.numElements();
size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int totalSize = in.numElements();
auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize);
auto grid = dim3(utils::divUp(totalSize, numThreads));
auto block = dim3(numThreads);
auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>(
inInfo, outInfo, totalSize);
transposeAny<T, unsigned long, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR();
}
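The launch arithmetic above, restated as a sketch (the 1024 threads/block figure is an assumption; getMaxThreadsCurrentDevice() supplies the real value): the grid is capped at 4096 blocks and each thread strides over the remaining elements.

// Hedged sketch of the grid/block sizing:
size_t totalSize = 10000000;
size_t block = std::min((size_t) 1024, totalSize);          // threads/block
size_t grid  = std::min((totalSize + block - 1) / block,    // utils::divUp
                        (size_t) 4096);                     // cap the grid
// each thread then advances by gridDim.x * blockDim.x per iteration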
......
......@@ -7,6 +7,8 @@
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "../BlockSelectKernel.cuh"
#include "../Limits.cuh"
......@@ -17,6 +19,15 @@
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream); \
\
extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream)
#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \
......@@ -27,6 +38,11 @@
bool dir, \
int k, \
cudaStream_t stream) { \
FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \
FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \
FAISS_ASSERT(outK.getSize(1) == k); \
FAISS_ASSERT(outV.getSize(1) == k); \
\
auto grid = dim3(in.getSize(0)); \
\
constexpr int kBlockSelectNumThreads = 128; \
......@@ -41,8 +57,40 @@
blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \
} \
\
void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream) { \
FAISS_ASSERT(inK.isSameSize(inV)); \
FAISS_ASSERT(outK.isSameSize(outV)); \
\
auto grid = dim3(inK.getSize(0)); \
\
constexpr int kBlockSelectNumThreads = 128; \
auto block = dim3(kBlockSelectNumThreads); \
\
FAISS_ASSERT(k <= WARP_Q); \
FAISS_ASSERT(dir == DIR); \
\
auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
auto vInit = -1; \
\
blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \
}
#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \
runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
in, outK, outV, dir, k, stream)
#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \
runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
inK, inV, outK, outV, dir, k, stream)
......@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
}
static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) {
write_index_header (ivf, f);
......@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained);
}
ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading", fname);
......@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
}
idx = idxmap;
} else {
fprintf (stderr, "Index type 0x%08x not supported\n", h);
abort ();
FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
idx = nullptr;
}
return idx;
}
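With this change an unsupported index header surfaces as a catchable exception instead of killing the process; a minimal caller sketch (the file name is hypothetical):

// Hedged sketch: recover from an unreadable index file.
try {
    faiss::Index* idx = faiss::read_index("some_index.faissindex");
    delete idx;
} catch (const faiss::FaissException& e) {
    fprintf(stderr, "could not load index: %s\n", e.what());
}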
......
......@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVector_clear(self)
def data(self): return _swigfaiss.FloatVector_data(self)
def size(self): return _swigfaiss.FloatVector_size(self)
def at(self, *args): return _swigfaiss.FloatVector_at(self, *args)
......@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss.FloatVector_swigregister
FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss.DoubleVector_clear(self)
def data(self): return _swigfaiss.DoubleVector_data(self)
def size(self): return _swigfaiss.DoubleVector_size(self)
def at(self, *args): return _swigfaiss.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
......@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVector_clear(self)
def data(self): return _swigfaiss.ByteVector_data(self)
def size(self): return _swigfaiss.ByteVector_size(self)
def at(self, *args): return _swigfaiss.ByteVector_at(self, *args)
......@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss.Uint64Vector_clear(self)
def data(self): return _swigfaiss.Uint64Vector_data(self)
def size(self): return _swigfaiss.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss.Uint64Vector_at(self, *args)
......@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVector_clear(self)
def data(self): return _swigfaiss.LongVector_data(self)
def size(self): return _swigfaiss.LongVector_size(self)
def at(self, *args): return _swigfaiss.LongVector_at(self, *args)
......@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss.IntVector_clear(self)
def data(self): return _swigfaiss.IntVector_data(self)
def size(self): return _swigfaiss.IntVector_size(self)
def at(self, *args): return _swigfaiss.IntVector_at(self, *args)
......@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss.VectorTransformVector_clear(self)
def data(self): return _swigfaiss.VectorTransformVector_data(self)
def size(self): return _swigfaiss.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss.VectorTransformVector_at(self, *args)
......@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss.OperatingPointVector_clear(self)
def data(self): return _swigfaiss.OperatingPointVector_data(self)
def size(self): return _swigfaiss.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss.OperatingPointVector_at(self, *args)
......@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVectorVector_clear(self)
def data(self): return _swigfaiss.FloatVectorVector_data(self)
def size(self): return _swigfaiss.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss.FloatVectorVector_at(self, *args)
......@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVectorVector_clear(self)
def data(self): return _swigfaiss.ByteVectorVector_data(self)
def size(self): return _swigfaiss.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss.ByteVectorVector_at(self, *args)
......@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVectorVector_clear(self)
def data(self): return _swigfaiss.LongVectorVector_data(self)
def size(self): return _swigfaiss.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss.LongVectorVector_at(self, *args)
......@@ -876,6 +907,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss.ClusteringParameters_update_index_get, _swigfaiss.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss.ClusteringParameters_frozen_centroids_get, _swigfaiss.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss.ClusteringParameters_min_points_per_centroid_get, _swigfaiss.ClusteringParameters_min_points_per_centroid_set)
......@@ -1720,6 +1754,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss.IndexIVF_cp_get, _swigfaiss.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss.IndexIVF_clustering_index_get, _swigfaiss.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss.IndexIVF_ids_get, _swigfaiss.IndexIVF_ids_set)
......@@ -1949,6 +1986,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss.IndexIVFPQ_precompute_table(self)
def __init__(self, *args):
this = _swigfaiss.new_IndexIVFPQ(*args)
......@@ -2107,6 +2145,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss.delete_IndexIDMap
__del__ = lambda self : None;
def __init__(self, *args):
......@@ -2775,6 +2814,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN():
return _swigfaiss.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss.MapLong2Long_map_get, _swigfaiss.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes.
......@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVector_at(self, *args)
......@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss_gpu.FloatVector_swigregister
FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss_gpu.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.DoubleVector_clear(self)
def data(self): return _swigfaiss_gpu.DoubleVector_data(self)
def size(self): return _swigfaiss_gpu.DoubleVector_size(self)
def at(self, *args): return _swigfaiss_gpu.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss_gpu.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss_gpu.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
......@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVector_at(self, *args)
......@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.Uint64Vector_clear(self)
def data(self): return _swigfaiss_gpu.Uint64Vector_data(self)
def size(self): return _swigfaiss_gpu.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss_gpu.Uint64Vector_at(self, *args)
......@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVector_data(self)
def size(self): return _swigfaiss_gpu.LongVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVector_at(self, *args)
......@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.IntVector_clear(self)
def data(self): return _swigfaiss_gpu.IntVector_data(self)
def size(self): return _swigfaiss_gpu.IntVector_size(self)
def at(self, *args): return _swigfaiss_gpu.IntVector_at(self, *args)
......@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.VectorTransformVector_clear(self)
def data(self): return _swigfaiss_gpu.VectorTransformVector_data(self)
def size(self): return _swigfaiss_gpu.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss_gpu.VectorTransformVector_at(self, *args)
......@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.OperatingPointVector_clear(self)
def data(self): return _swigfaiss_gpu.OperatingPointVector_data(self)
def size(self): return _swigfaiss_gpu.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss_gpu.OperatingPointVector_at(self, *args)
......@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVectorVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVectorVector_at(self, *args)
......@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVectorVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVectorVector_at(self, *args)
......@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVectorVector_data(self)
def size(self): return _swigfaiss_gpu.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVectorVector_at(self, *args)
......@@ -280,6 +311,7 @@ class GpuResourcesVector(_object):
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.GpuResourcesVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.GpuResourcesVector_clear(self)
def data(self): return _swigfaiss_gpu.GpuResourcesVector_data(self)
def size(self): return _swigfaiss_gpu.GpuResourcesVector_size(self)
def at(self, *args): return _swigfaiss_gpu.GpuResourcesVector_at(self, *args)
......@@ -949,6 +981,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss_gpu.ClusteringParameters_update_index_get, _swigfaiss_gpu.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss_gpu.ClusteringParameters_frozen_centroids_get, _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get, _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set)
......@@ -1793,6 +1828,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss_gpu.IndexIVF_cp_get, _swigfaiss_gpu.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss_gpu.IndexIVF_clustering_index_get, _swigfaiss_gpu.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss_gpu.IndexIVF_ids_get, _swigfaiss_gpu.IndexIVF_ids_set)
......@@ -2022,6 +2060,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss_gpu.IndexIVFPQ_precompute_table(self)
def __init__(self, *args):
this = _swigfaiss_gpu.new_IndexIVFPQ(*args)
......@@ -2180,6 +2219,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss_gpu.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss_gpu.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss_gpu.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap
__del__ = lambda self : None;
def __init__(self, *args):
......@@ -2340,6 +2380,9 @@ class GpuMultipleClonerOptions(GpuClonerOptions):
__swig_setmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set
__swig_getmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_get
if _newclass:shard = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set)
__swig_setmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set
__swig_getmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get
if _newclass:shard_type = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set)
__swig_destroy__ = _swigfaiss_gpu.delete_GpuMultipleClonerOptions
__del__ = lambda self : None;
GpuMultipleClonerOptions_swigregister = _swigfaiss_gpu.GpuMultipleClonerOptions_swigregister
......@@ -3256,6 +3299,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN():
return _swigfaiss_gpu.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss_gpu.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss_gpu.MapLong2Long_map_get, _swigfaiss_gpu.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss_gpu.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss_gpu.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss_gpu.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss_gpu.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss_gpu.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss_gpu.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes.
......@@ -23,6 +23,7 @@
#pragma SWIG nowarn=321
#pragma SWIG nowarn=403
#pragma SWIG nowarn=325
typedef unsigned long uint64_t;
typedef uint64_t size_t;
......@@ -108,6 +109,7 @@ namespace std {
public:
vector();
void push_back(T);
void clear();
T * data();
size_t size();
T at (size_t n) const;
......@@ -117,6 +119,7 @@ namespace std {
%template(FloatVector) std::vector<float>;
%template(DoubleVector) std::vector<double>;
%template(ByteVector) std::vector<uint8_t>;
%template(Uint64Vector) std::vector<uint64_t>;
%template(LongVector) std::vector<long>;
......@@ -709,4 +712,36 @@ void ignore_SIGTTIN() {
void ignore_SIGTTIN();
%inline %{
// numpy lacks a hash table implementation, hence this class. It
// represents not-found values as -1, as the Index implementation does
struct MapLong2Long {
std::unordered_map<long, long> map;
void add(size_t n, const long *keys, const long *vals) {
map.reserve(map.size() + n);
for (size_t i = 0; i < n; i++) {
map[keys[i]] = vals[i];
}
}
long search(long key) {
if (map.count(key) == 0) {
return -1;
} else {
return map[key];
}
}
void search_multiple(size_t n, const long *keys, long * vals) {
for (size_t i = 0; i < n; i++) {
vals[i] = search(keys[i]);
}
}
};
%}
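A C++ sketch of the MapLong2Long contract, mirroring the Python test added further down: absent keys are reported as -1.

// Hedged sketch (values chosen to match the Python test):
long keys[] = {13, 45, 67};
long vals[] = {3, 8, 2};
MapLong2Long m;
m.add(3, keys, vals);
long out[3];
m.search_multiple(3, keys, out);   // out == {3, 8, 2}
long missing = m.search(12343);    // -1: key was never added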
// End of file...
......@@ -17,8 +17,8 @@ class TestClustering(unittest.TestCase):
def test_clustering(self):
d = 64
n = 1000
np.random.seed(123)
x = np.random.random(size=(n, d)).astype('float32')
rs = np.random.RandomState(123)
x = rs.uniform(size=(n, d)).astype('float32')
km = faiss.Kmeans(d, 32, niter=10)
err32 = km.train(x)
......@@ -37,15 +37,35 @@ class TestClustering(unittest.TestCase):
def test_nasty_clustering(self):
d = 2
np.random.seed(123)
rs = np.random.RandomState(123)
x = np.zeros((100, d), dtype='float32')
for i in range(5):
x[i * 20:i * 20 + 20] = np.random.random(size=d)
x[i * 20:i * 20 + 20] = rs.uniform(size=d)
# we have 5 distinct points but ask for 10 centroids...
km = faiss.Kmeans(d, 10, niter=10, verbose=True)
km.train(x)
def test_redo(self):
d = 64
n = 1000
rs = np.random.RandomState(123)
x = rs.uniform(size=(n, d)).astype('float32')
clus = faiss.Clustering(d, 20)
clus.nredo = 1
clus.train(x, faiss.IndexFlatL2(d))
obj1 = faiss.vector_to_array(clus.obj)
clus = faiss.Clustering(d, 20)
clus.nredo = 10
clus.train(x, faiss.IndexFlatL2(d))
obj10 = faiss.vector_to_array(clus.obj)
self.assertGreater(obj1[-1], obj10[-1])
class TestPCA(unittest.TestCase):
......@@ -87,7 +107,6 @@ class TestProductQuantizer(unittest.TestCase):
self.assertGreater(2500, diff)
class TestRevSwigPtr(unittest.TestCase):
def test_rev_swig_ptr(self):
......@@ -127,6 +146,19 @@ class TestException(unittest.TestCase):
else:
assert False, 'exception did not fire???'
class TestMapLong2Long(unittest.TestCase):
def test_do_it(self):
keys = np.array([13, 45, 67])
vals = np.array([3, 8, 2])
m = faiss.MapLong2Long()
m.add(keys, vals)
assert np.all(m.search_multiple(keys) == vals)
assert m.search(12343) == -1
if __name__ == '__main__':
unittest.main()
......@@ -22,15 +22,32 @@ def get_dataset(d, nb, nt, nq):
return (xt, xb, xq)
def get_dataset_2(d, nb, nt, nq):
"""A dataset that is not completely random but still challenging to
index
"""
d1 = 10 # intrinsic dimension (more or less)
n = nb + nt + nq
rs = np.random.RandomState(1234)
x = rs.normal(size=(n, d1))
x = np.dot(x, rs.rand(d1, d))
# now we have a d1-dim ellipsoid in d-dimensional space
# higher factor (>4) -> higher frequency -> less linear
x = x * (rs.rand(d) * 4 + 0.1)
x = np.sin(x)
x = x.astype('float32')
return x[:nt], x[nt:-nq], x[-nq:]
class EvalIVFPQAccuracy(unittest.TestCase):
def test_IndexIVFPQ(self):
d = 64
d = 32
nb = 1000
nt = 1500
nq = 200
(xt, xb, xq) = get_dataset(d, nb, nt, nq)
(xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
d = xt.shape[1]
gt_index = faiss.IndexFlatL2(d)
......@@ -38,15 +55,15 @@ class EvalIVFPQAccuracy(unittest.TestCase):
D, gt_nns = gt_index.search(xq, 1)
coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
index.train(xt)
index.add(xb)
index.nprobe = 5
index.nprobe = 4
D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]
self.assertGreater(n_ok, nq * 0.4)
self.assertGreater(n_ok, nq * 0.66)
class TestMultiIndexQuantizer(unittest.TestCase):
......@@ -78,16 +95,16 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants_ivf(self):
d = 32
nt = 1500
nq = 200
nb = 10000
nt = 2500
nq = 400
nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq)
(xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
# common quantizer
quantizer = faiss.IndexFlatL2(d)
ncent = 128
ncent = 64
index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb)
......@@ -114,9 +131,12 @@ class TestScalarQuantizer(unittest.TestCase):
D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok)
print(nok, nq)
self.assertGreaterEqual(nok['flat'], nq * 0.6)
# The tests below are a bit fragile, it happens that the
# ordering between uniform and non-uniform are reverted,
# probably because the dataset is small, which introduces
# jitter
self.assertGreaterEqual(nok['flat'], nok['QT_8bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
......@@ -124,15 +144,15 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants(self):
d = 32
nt = 1500
nq = 200
nb = 10000
nt = 2500
nq = 400
nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq)
index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb)
D, I_ref = index_gt.search(xq, 10)
D_ref, I_ref = index_gt.search(xq, 10)
nok = {}
......@@ -141,12 +161,12 @@ class TestScalarQuantizer(unittest.TestCase):
index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)
index.train(xt)
index.add(xb)
D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok)
self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9)
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
......
......@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else:
assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
class TestUpdate(unittest.TestCase):
......
......@@ -67,6 +67,10 @@ int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
double getmillisecs () {
struct timeval tv;
gettimeofday (&tv, nullptr);
......@@ -455,7 +459,7 @@ float fvec_norm_L2sqr_ref (const float * __restrict x,
/*********************************************************
* SSE implementations
* SSE and AVX implementations
*/
// reads 0 <= d < 4 floats as __m128
......@@ -475,7 +479,96 @@ static inline __m128 masked_read (int d, const float *x)
// cannot use AVX2 _mm_mask_set1_epi32
}
#ifdef USE_AVX
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_inner_product (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
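Both AVX kernels accumulate 8-wide partial sums, fold the two 128-bit halves together, sweep any 4-wide and scalar tail, and finish with two horizontal adds. A scalar reference for cross-checking the tails (a test sketch, not part of the patch):

// Hedged sketch: must agree with fvec_L2sqr for any d, including
// the d % 8 != 0 tails handled by masked_read above.
static float fvec_L2sqr_scalar (const float *x, const float *y, size_t d)
{
    float acc = 0;
    for (size_t i = 0; i < d; i++) {
        float diff = x[i] - y[i];
        acc += diff * diff;
    }
    return acc;
}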
#else
/* SSE-implementation of L2 distance */
float fvec_L2sqr (const float * x,
......@@ -534,6 +627,7 @@ float fvec_inner_product (const float * x,
#endif
float fvec_norm_L2sqr (const float * x,
size_t d)
......@@ -557,69 +651,6 @@ float fvec_norm_L2sqr (const float * x,
/*********************************************************
* AVX implementations
*
* Disabled for now, it is not faster than SSE on current machines
* see P57425519
*/
#if 0
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res;
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
if (d > 0) {
// add the last 1, 2 or 3 values
__m256 mx = masked_read_8 (d, x);
__m256 my = masked_read_8 (d, y);
__m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
#endif
/***************************************************************************
* Matrix/vector ops
......@@ -1365,16 +1396,18 @@ void pairwise_L2sqr (long d,
#define EPS (1 / 1024.)
/* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
int km_update_centroids (const float * x,
float * centroids,
long * assign,
size_t d, size_t k, size_t n)
size_t d, size_t k, size_t n,
size_t k_frozen)
{
k -= k_frozen;
centroids += k_frozen * d;
std::vector<size_t> hassign(k);
memset (centroids, 0, sizeof(*centroids) * d * k);
#pragma omp parallel
{
int nt = omp_get_num_threads();
......@@ -1383,12 +1416,12 @@ int km_update_centroids (const float * x,
size_t c0 = (k * rank) / nt;
size_t c1 = (k * (rank + 1)) / nt;
const float *xi = x;
// printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
size_t nacc = 0;
for (size_t i = 0; i < n; i++) {
long ci = assign[i];
assert (ci >= 0 && ci < k);
assert (ci >= 0 && ci < k + k_frozen);
ci -= k_frozen;
if (ci >= c0 && ci < c1) {
float * c = centroids + ci * d;
hassign[ci]++;
......@@ -1398,7 +1431,6 @@ int km_update_centroids (const float * x,
}
xi += d;
}
// printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
}
......
......@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a,
void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
/** For k-means: update stage. Returns nb of split clusters. */
/** For k-means: update stage.
*
* @param x training vectors, size n * d
* @param centroids centroid vectors, size k * d
* @param assign nearest centroid for each training vector, size n
* @param k_frozen do not update the k_frozen first centroids
* @return nb of splitting operations performed to fight empty clusters
*/
int km_update_centroids (
const float * x,
float * centroids,
long * assign,
size_t d, size_t k, size_t n);
size_t d, size_t k, size_t n,
size_t k_frozen);
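A call sketch under the new signature (sizes are illustrative): the first k_frozen centroids act as read-only anchors while the remaining ones are recomputed from the assignment.

// Hedged sketch: 100 centroids, the first 10 frozen.
size_t d = 64, k = 100, n = 10000, k_frozen = 10;
std::vector<float> x(n * d), centroids(k * d);
std::vector<long> assign(n);   // in [0, k), produced by the assignment step
// ... fill x, centroids and assign ...
int nsplit = faiss::km_update_centroids(
    x.data(), centroids.data(), assign.data(), d, k, n, k_frozen);
// centroids[0 .. k_frozen*d) are untouched; nsplit counts the cluster
// splits performed to repopulate empty clusters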
/** compute the Q of the QR decomposition for m > n
* @param a size n * m: input matrix and output Q
......