Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bugfixes from GitHub issues
k-means with support for frozen centroids
better GPU tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
@@ -345,6 +345,7 @@ void ParameterSpace::initialize (const Index * index)
        }
    }
    if (DC (IndexIVF)) {
+       {
        ParameterRange & pr = add_range("nprobe");
        for (int i = 0; i < 13; i++) {
            size_t nprobe = 1 << i;
@@ -352,6 +353,7 @@ void ParameterSpace::initialize (const Index * index)
            pr.values.push_back (nprobe);
        }
    }
+   }
    if (DC (IndexPQ)) {
        ParameterRange & pr = add_range("ht");
        init_pq_ParameterRange (ix->pq, pr);
@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
        }
    }
    if (DC (IndexIVFPQR)) {
-       assert (ix);
        ParameterRange & pr = add_range("k_factor");
        for (int i = 0; i <= 6; i++) {
            pr.values.push_back (1 << i);
@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
    if (name == "verbose") {
        index->verbose = int(val);
+       // and fall through to also enable it on sub-indexes
    }
    if (DC (IndexPreTransform)) {
        index = ix->index;
    }
+   if (DC (IndexShards)) {
+       // call on all sub-indexes
+       for (auto & shard_index : ix->shard_indexes) {
+           set_index_parameter (shard_index, name, val);
+       }
+       return;
+   }
    if (name == "verbose") {
        index->verbose = int(val);
+       // in case it was an IndexPreTransform
    }
    if (DC (IndexRefineFlat)) {
        if (name == "k_factor_rf") {
@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
        return; // last verbose that we could find
    }
    if (name == "nprobe") {
-       DC(IndexIVF);
-       ix->nprobe = int(val);
-   } else if (name == "ht") {
+       if (DC (IndexIVF)) {
+           ix->nprobe = int(val);
+           return;
+       }
+   }
+   if (name == "ht") {
        if (DC (IndexPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->search_type = IndexPQ::ST_PQ;
@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
                ix->search_type = IndexPQ::ST_polysemous;
                ix->polysemous_ht = int(val);
            }
+           return;
        } else if (DC (IndexIVFPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->polysemous_ht = 0;
            } else {
                ix->polysemous_ht = int(val);
            }
+           return;
+       }
    }
-   } else if (name == "k_factor") {
-       DC (IndexIVFPQR);
+   if (name == "k_factor") {
+       if (DC (IndexIVFPQR)) {
            ix->k_factor = val;
-   } else if (name == "max_codes") {
-       DC (IndexIVFPQ);
+           return;
+       }
+   }
+   if (name == "max_codes") {
+       if (DC (IndexIVFPQ)) {
            ix->max_codes = finite(val) ? size_t(val) : 0;
-   } else {
-       FAISS_THROW_FMT (
-           "ParameterSpace::set_index_parameter:"
+           return;
+       }
+   }
+   FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
            "could not set parameter %s",
            name.c_str());
-   }
}

void ParameterSpace::display () const
@@ -634,6 +654,15 @@ struct VTChain {
    }
};

+/// what kind of training does this coarse quantizer require?
+char get_trains_alone(const Index *coarse_quantizer) {
+    return
+        dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
+        0;
+}
+
}

Index *index_factory (int d, const char *description_in, MetricType metric)
@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         tok;
         tok = strtok_r (nullptr, " ,", &ptr)) {
        int d_out, opq_M, nbit, M, M2;
+       char option[100];
        std::string stok(tok);

        // to avoid mem leaks with exceptions:
@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        } else if (stok == "L2norm") {
            vt_1 = new NormalizationTransform (d, 2.0);

+       // coarse quantizers
        } else if (!coarse_quantizer &&
                   sscanf (tok, "IVF%d", &ncentroids) == 1) {
            if (metric == METRIC_L2) {
@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVF *index_ivf = new IndexIVFFlat (
                coarse_quantizer, d, ncentroids, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                new IndexIVFScalarQuantizer (
                    coarse_quantizer, d, ncentroids, qt, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVFPQR *index_ivf = new IndexIVFPQR (
                coarse_quantizer, d, ncentroids, M, 8, M2, 8);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
-       } else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
+       } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
+           std::string soption = option;
+           // np to disable polysemous training
+           FAISS_THROW_IF_NOT(soption == "" || soption == "np");
            if (coarse_quantizer) {
                IndexIVFPQ *index_ivf = new IndexIVFPQ (
                    coarse_quantizer, d, ncentroids, M, 8);
                index_ivf->quantizer_trains_alone =
-                   dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-                   != nullptr;
+                   get_trains_alone (coarse_quantizer);
                index_ivf->metric_type = metric;
                index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
                del_coarse_quantizer.release ();
                index_ivf->own_fields = true;
-               index_ivf->do_polysemous_training = true;
+               index_ivf->do_polysemous_training = soption != "np";
                index_1 = index_ivf;
            } else {
                IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
-               index_pq->do_polysemous_training = true;
+               index_pq->do_polysemous_training = soption != "np";
                index_1 = index_pq;
            }
        } else if (stok == "RFlat") {
            make_IndexRefineFlat = true;
        } else {
......
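The reworked ParameterSpace::set_index_parameter above now tries each parameter name against a chain of dynamic casts and returns as soon as one index type accepts it, recursing into IndexShards and unwrapping IndexPreTransform first. A minimal sketch of how this is driven from the Python bindings (random data; the "IVF256,Flat" factory string and the sizes are only an example):

import numpy as np
import faiss

d = 64
xb = np.random.rand(20000, d).astype('float32')
index = faiss.index_factory(d, "IVF256,Flat")
index.train(xb)
index.add(xb)

ps = faiss.ParameterSpace()
ps.set_index_parameter(index, "nprobe", 16)  # handled by the IndexIVF branch
ps.set_index_parameter(index, "verbose", 1)  # also propagated to sub-indexes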
@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
 *  minimizing the number of mem copies in the result. The method
 *  do_allocation can be overloaded to allocate the result tables in
- *  the matrix type of a srcipting language like Lua or Python. */
+ *  the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
    size_t nq;    ///< nb of queries
    size_t *lims; ///< size (nq + 1)
......
@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
    nredo(1),
    verbose(false), spherical(false),
    update_index(false),
+   frozen_centroids(false),
    min_points_per_centroid(39),
    max_points_per_centroid(256),
    seed(1234)
@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    float * dis = new float[nx];
    ScopeDeleter<float> del2(dis);

+   // for redo
    float best_err = 1e50;
+   std::vector<float> best_obj;
+   std::vector<float> best_centroids;
+
+   // support input centroids
+   FAISS_THROW_IF_NOT_MSG (
+       centroids.size() % d == 0,
+       "size of provided input centroids not a multiple of dimension");
+   size_t n_input_centroids = centroids.size() / d;
+
+   if (verbose && n_input_centroids > 0) {
+       printf ("  Using %zd centroids provided as input (%sfrozen)\n",
+               n_input_centroids, frozen_centroids ? "" : "not ");
+   }
+
    double t_search_tot = 0;
    if (verbose) {
        printf("  Preprocessing in %.2f s\n",
@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    for (int redo = 0; redo < nredo; redo++) {

-       std::vector<float> buf_centroids;
-       std::vector<float> &cur_centroids =
-           nredo == 1 ? centroids : buf_centroids;
-
        if (verbose && nredo > 1) {
            printf("Outer iteration %d / %d\n", redo, nredo);
        }

-       if (cur_centroids.size() == 0) {
-           // initialize centroids with random points from the dataset
-           cur_centroids.resize (d * k);
-           std::vector<int> perm (nx);
-           rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
-#pragma omp parallel for
-           for (int i = 0; i < k ; i++)
-               memcpy (&cur_centroids[i * d], x + perm[i] * d,
-                       d * sizeof (float));
-       } else { // assume user provides some meaningful initialization
-           FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
-           FAISS_THROW_IF_NOT_MSG (nredo == 1,
-                                   "will redo with same initialization");
-       }
+       // initialize remaining centroids with random points from the dataset
+       centroids.resize (d * k);
+       std::vector<int> perm (nx);
+       rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
+       for (int i = n_input_centroids; i < k ; i++)
+           memcpy (&centroids[i * d], x + perm[i] * d,
+                   d * sizeof (float));

        if (spherical)
-           fvec_renorm_L2 (d, k, cur_centroids.data());
+           fvec_renorm_L2 (d, k, centroids.data());

        if (!index.is_trained)
-           index.train (k, cur_centroids.data());
+           index.train (k, centroids.data());

        FAISS_THROW_IF_NOT (index.ntotal == 0);
-       index.add (k, cur_centroids.data());
+       index.add (k, centroids.data());
        float err = 0;
        for (int i = 0; i < niter; i++) {
            double t0s = getmillisecs();
@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                err += dis[j];
            obj.push_back (err);

-           int nsplit = km_update_centroids (x, cur_centroids.data(),
-                                             assign, d, k, nx);
+           int nsplit = km_update_centroids (
+                 x, centroids.data(),
+                 assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);

            if (verbose) {
                printf ("  Iteration %d (%.2f s, search %.2f s): "
@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            }

            if (spherical)
-               fvec_renorm_L2 (d, k, cur_centroids.data());
+               fvec_renorm_L2 (d, k, centroids.data());

            index.reset ();
            if (update_index)
-               index.train (k, cur_centroids.data());
+               index.train (k, centroids.data());

            assert (index.ntotal == 0);
-           index.add (k, cur_centroids.data());
+           index.add (k, centroids.data());
        }
        if (verbose) printf("\n");
        if (nredo > 1) {
            if (err < best_err) {
                if (verbose)
                    printf ("Objective improved: keep new clusters\n");
-               centroids = buf_centroids;
+               best_centroids = centroids;
+               best_obj = obj;
                best_err = err;
            }
            index.reset ();
        }
    }
+   if (nredo > 1) {
+       centroids = best_centroids;
+       obj = best_obj;
+   }
}
......
@@ -28,6 +28,7 @@ struct ClusteringParameters {
    bool verbose;
    bool spherical;     ///< do we want normalized centroids?
    bool update_index;  ///< update index after each iteration?
+   bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
    int min_points_per_centroid; ///< otherwise you get a warning
    int max_points_per_centroid; ///< to limit size of dataset
......
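frozen_centroids works together with pre-loaded centroids: Clustering::train treats the first centroids.size() / d entries as provided by the caller, initializes only the remaining ones randomly, and, when the flag is set, passes their count to km_update_centroids so they are never moved. A sketch in Python (random data, arbitrary sizes; copy_array_to_vector is the helper added to python/faiss.py further down in this commit):

import numpy as np
import faiss

d, k = 32, 100
x = np.random.rand(10000, d).astype('float32')
fixed = x[:10].copy()            # 10 centroids to keep frozen

clus = faiss.Clustering(d, k)
clus.frozen_centroids = True
faiss.copy_array_to_vector(fixed.ravel(), clus.centroids)

index = faiss.IndexFlatL2(d)
clus.train(x.shape[0], faiss.swig_ptr(x), index)

centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
# centroids[:10] still equal `fixed`; the other 90 were trained normally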
@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
-   FAISS_THROW_MSG ("Can not compute reconstruct without "
-                    "knowing how to do so");
+   FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
    nlist (nlist),
    nprobe (1),
    quantizer (quantizer),
-   quantizer_trains_alone (false),
+   quantizer_trains_alone (0),
    own_fields (false),
+   clustering_index (nullptr),
    ids (nlist),
    maintain_direct_map (false)
{
@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
    nlist (0), nprobe (1), quantizer (nullptr),
-   quantizer_trains_alone (false), own_fields (false),
+   quantizer_trains_alone (0), own_fields (false),
+   clustering_index (nullptr),
    maintain_direct_map (false)
{}
@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
    if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
        if (verbose)
            printf ("IVF quantizer does not need training.\n");
-   } else if (quantizer_trains_alone) {
+   } else if (quantizer_trains_alone == 1) {
        if (verbose)
            printf ("IVF quantizer trains alone...\n");
        quantizer->train (n, x);
+       quantizer->verbose = verbose;
        FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
                                "nlist not consistent with quantizer size");
-   } else {
+   } else if (quantizer_trains_alone == 0) {
        if (verbose)
            printf ("Training IVF quantizer on %ld vectors in %dD\n",
                    n, d);

        Clustering clus (d, nlist, cp);
        quantizer->reset();
+       if (clustering_index) {
+           clus.train (n, x, *clustering_index);
+           quantizer->add (nlist, clus.centroids.data());
+       } else {
            clus.train (n, x, *quantizer);
+       }
        quantizer->is_trained = true;
+   } else if (quantizer_trains_alone == 2) {
+       if (verbose)
+           printf (
+               "Training L2 quantizer on %ld vectors in %dD%s\n",
+               n, d,
+               clustering_index ? "(user provided index)" : "");
+       FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
+       Clustering clus (d, nlist, cp);
+       if (!clustering_index) {
+           IndexFlatL2 assigner (d);
+           clus.train(n, x, assigner);
+       } else {
+           clus.train(n, x, *clustering_index);
+       }
+       if (verbose)
+           printf ("Adding centroids to quantizer\n");
+       quantizer->add (nlist, clus.centroids.data());
    }
    if (verbose)
        printf ("Training IVF residual\n");
@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
    FAISS_THROW_IF_NOT (nlist == other.nlist);
    FAISS_THROW_IF_NOT (!other.maintain_direct_map);
-   FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
-                           "this subset type is not implemented");
+   FAISS_THROW_IF_NOT_FMT (
+       subset_type == 0 || subset_type == 1 || subset_type == 2,
+       "subset type %d not implemented", subset_type);

    size_t accu_n = 0;
    size_t accu_a1 = 0;
@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                other.ntotal++;
            }
        }
+   } else if (subset_type == 1) {
+       for (long i = 0; i < n; i++) {
+           idx_t id = ids_in[i];
+           if (id % a1 == a2) {
+               ids_out.push_back (id);
+               codes_out.insert (codes_out.end(),
+                                 codes_in.begin() + i * code_size,
+                                 codes_in.begin() + (i + 1) * code_size);
+               other.ntotal++;
+           }
+       }
    } else if (subset_type == 2) {
        // see what is allocated to a1 and to a2
        size_t next_accu_n = accu_n + n;
        size_t next_accu_a1 = next_accu_n * a1 / ntotal;
        size_t i1 = next_accu_a1 - accu_a1;
-       accu_a1 = next_accu_a1;
        size_t next_accu_a2 = next_accu_n * a2 / ntotal;
        size_t i2 = next_accu_a2 - accu_a2;
-       accu_a2 = next_accu_a2;
        ids_out.insert(ids_out.end(),
                       ids_in.begin() + i1,
                       ids_in.begin() + i2);
@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                       codes_in.begin() + i1 * code_size,
                       codes_in.begin() + i2 * code_size);
        other.ntotal += i2 - i1;
+       accu_a1 = next_accu_a1;
+       accu_a2 = next_accu_a2;
    }
    accu_n += n;
}
......
@@ -47,10 +47,17 @@ struct IndexIVF: Index {
    size_t nprobe;     ///< number of probes at query time

    Index * quantizer; ///< quantizer that maps vectors to inverted lists
-   bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
+
+   /**
+    * = 0: use the quantizer as index in a kmeans training
+    * = 1: just pass on the training set to the train() of the quantizer
+    * = 2: kmeans training on a flat index + add the centroids to the quantizer
+    */
+   char quantizer_trains_alone;
    bool own_fields;   ///< whether object owns the quantizer

    ClusteringParameters cp; ///< to override default clustering params
+   Index *clustering_index; ///< to override index used during clustering

    std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
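A sketch (Python, hypothetical sizes, requires at least one GPU) of the two new training hooks declared above: clustering_index redirects the k-means assignment step to another index, while quantizer_trains_alone == 2 clusters on a temporary IndexFlatL2 and only adds the centroids to the actual quantizer:

import numpy as np
import faiss

d, nlist = 64, 1024
xt = np.random.rand(200000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# do the assignment step of k-means on the GPU(s)
clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
index.clustering_index = clustering_index  # IndexIVF does not own it:
                                           # keep the Python reference alive
index.train(xt)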
@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
            for (int j = 0; j < d; j++) {
                r[j] += centroid[j];
            }
-       }
-       else {
+       } else {
            pq.decode (code_line + ofs * pq.code_size, r);
        }
    }
@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
    FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
+
    int list_no = direct_map[key] >> 32;
    int ofs = direct_map[key] & 0xffffffff;
@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}

+void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                         float *distances, idx_t *labels,
+                                         float *reconstructed)
+{
+    long * idx = new long [n * nprobe];
+    ScopeDeleter<long> del (idx);
+    float * coarse_dis = new float [n * nprobe];
+    ScopeDeleter<float> del2 (coarse_dis);
+
+    quantizer->search (n, x, nprobe, coarse_dis, idx);
+
+    search_preassigned (n, x, k, idx, coarse_dis,
+                        distances, labels, true);
+
+    for (long i = 0; i < n; i++) {
+        for (long j = 0; j < k; j++) {
+            long ij = i * k + j;
+            idx_t res = labels[ij];
+            float *recons = reconstructed + d * (ij);
+            if (res < 0) {
+                // fill with NaNs
+                memset(recons, -1, sizeof(*recons) * d);
+            } else {
+                int list_no = res >> 32;
+                int ofs = res & 0xffffffff;
+                labels[ij] = ids[list_no][ofs];
+
+                quantizer->reconstruct (list_no, recons);
+                const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
+
+                for (size_t m = 0; m < pq.M; m++) {
+                    float * out = recons + m * pq.dsub;
+                    const float * cent = pq.get_centroids (m, code[m]);
+                    for (size_t l = 0; l < pq.dsub; l++) {
+                        out[l] += cent[l];
+                    }
+                }
+            }
+        }
+    }
+}
+
IndexIVFPQ::IndexIVFPQ ()
......
@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
                    float *distances, idx_t *labels,
                    bool store_pairs) const override;

+   /** Same as the search function, but also reconstruct approximate
+    *  vectors for the search results
+    *
+    *  @param reconstructed      size (n, k, d)
+    **/
+   void search_and_reconstruct (idx_t n, const float *x, idx_t k,
+                                float *distances, idx_t *labels,
+                                float *reconstructed);
+
    /// build precomputed table
    void precompute_table ();
......
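Usage sketch of the new call through the Python wrapper added to python/faiss.py below (random data, arbitrary sizes):

import numpy as np
import faiss

d = 32
xb = np.random.rand(20000, d).astype('float32')
xq = np.random.rand(5, d).astype('float32')

index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, 256, 4, 8)
index.train(xb)
index.add(xb)
index.nprobe = 8

D, I, R = index.search_and_reconstruct(xq, 10)
# R has shape (5, 10, d); entries for missing results (label -1) are
# filled with NaNs (the memset with -1 in the implementation above)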
@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
    const float *y, *yi;
-   explicit SimilarityL2 (const float * y): y(y) {}

+   explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/
@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
-   delete squant;
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
-   delete squant;
}

/*******************************************************************
@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           minheap_reorder (k, simi, idxi);
        }
    } else {
#pragma omp parallel for
@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           maxheap_reorder (k, simi, idxi);
        }
    }
}
@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

+       // each thread takes care of a subset of lists
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
    ntotal += nadd;
}

+namespace {

void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
        maxheap_reorder (k, simi, idxi);
    }
}

+} // anonymous namespace
+
void IndexIVFScalarQuantizer::search_preassigned (
    idx_t n, const float *x, idx_t k,
    const idx_t *idx,
......
@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
	cp python/_swigfaiss.so python/swigfaiss.py .

#############################
-# Dependencies
+# Dependencies.

-# make dep > x
-# then copy/paste from x by hand below
-# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
+dep:
+	for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
+	    cpp -MM -std=gnu++0x $$i ; \
+	done

-AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
- FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
- IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
- IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
-AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
-Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
- FaissAssert.h FaissException.h IndexFlat.h
-FaissException.o: FaissException.cpp FaissException.h
 hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
-Heap.o: Heap.cpp Heap.h
-Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
+utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
+ FaissAssert.h FaissException.h
 IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
  FaissAssert.h FaissException.h AuxIndexStructures.h
-index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
- IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
- ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
- IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
  hamming.h FaissAssert.h FaissException.h IndexFlat.h \
  AuxIndexStructures.h
-IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
- Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
- IndexFlat.h hamming.h FaissAssert.h FaissException.h \
- AuxIndexStructures.h
 IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
  Heap.h hamming.h FaissAssert.h FaissException.h
 IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
  Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
-IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
- IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
- FaissException.h
-MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
- FaissException.h Heap.h AuxIndexStructures.h
+IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
+ Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
+ IndexFlat.h hamming.h FaissAssert.h FaissException.h \
+ AuxIndexStructures.h
+Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
+ FaissAssert.h FaissException.h IndexFlat.h
+Heap.o: Heap.cpp Heap.h
+VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
+ Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
+ Clustering.h PolysemousTraining.h
+index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
+ IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
+ ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
+ IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
  ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
  FaissAssert.h FaissException.h
+MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
+ FaissException.h Heap.h AuxIndexStructures.h
+Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
 ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
  Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
  IndexFlat.h utils.h
-utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
- FaissAssert.h FaissException.h
-VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
- Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
- Clustering.h PolysemousTraining.h
+AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
+ FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
+ IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
+ IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
+AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
+IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
+ IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
+ FaissException.h
+FaissException.o: FaissException.cpp FaissException.h

clean:
......
@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
    }
}

+void IndexIDMap::range_search (idx_t n, const float *x, float radius,
+                               RangeSearchResult *result) const
+{
+    index->range_search(n, x, radius, result);
+    for (idx_t i = 0; i < result->lims[result->nq]; i++) {
+        result->labels[i] = result->labels[i] < 0 ?
+            result->labels[i] : id_map[result->labels[i]];
+    }
+}
+
namespace {

struct IDTranslatedSelector: IDSelector {
@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
    }
    FAISS_ASSERT (j == index->ntotal);
    ntotal = j;
+   id_map.resize(ntotal);
    return nremove;
}
......
@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
    /// remove ids adapted to IndexFlat
    long remove_ids(const IDSelector& sel) override;

+   void range_search (idx_t n, const float *x, float radius,
+                      RangeSearchResult *result) const override;
+
    ~IndexIDMap() override;
    IndexIDMap () {own_fields=false; index=nullptr; }
};
......
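With the override declared above, range_search results are now mapped back to the user-provided ids, as search already was. A small sketch (Python wrapper, random data):

import numpy as np
import faiss

d = 16
xb = np.random.rand(1000, d).astype('float32')
ids = np.arange(1000, dtype='int64') + 100000

index = faiss.IndexIDMap(faiss.IndexFlatL2(d))
index.add_with_ids(xb, ids)

lims, D, I = index.range_search(xb[:5], 0.3)
# I contains values >= 100000 rather than positions in the sub-index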
@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
    const float *prev_x = x;
    ScopeDeleter<float> del;

+   if (verbose) {
+       printf("IndexPreTransform::train: training chain 0 to %d\n",
+              last_untrained);
+   }
+
    for (int i = 0; i <= last_untrained; i++) {
        if (i < chain.size()) {
            VectorTransform *ltrans = chain [i];
-           if (!ltrans->is_trained)
-               ltrans->train(n, prev_x);
+           if (!ltrans->is_trained) {
+               if (verbose) {
+                   printf("   Training chain component %d/%zd\n",
+                          i, chain.size());
+                   if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
+                       opqm->verbose = true;
+                   }
+               }
+               ltrans->train (n, prev_x);
+           }
        } else {
+           if (verbose) {
+               printf("   Training sub-index\n");
+           }
            index->train (n, prev_x);
        }
        if (i == last_untrained) break;
+       if (verbose) {
+           printf("   Applying transform %d/%zd\n",
+                  i, chain.size());
+       }

        float * xt = chain[i]->apply (n, prev_x);
-       if (prev_x != x) delete prev_x;
+       if (prev_x != x) delete [] prev_x;
        prev_x = xt;
        del.set(xt);
    }
......
@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
    co.verbose = True
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True
+   assert co.shard_type in (0, 1, 2)
    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall, co)
......
@@ -121,6 +121,18 @@ def handle_Index(the_class):
                              swig_ptr(labels))
        return distances, labels

+   def replacement_search_and_reconstruct(self, x, k):
+       n, d = x.shape
+       assert d == self.d
+       distances = np.empty((n, k), dtype=np.float32)
+       labels = np.empty((n, k), dtype=np.int64)
+       recons = np.empty((n, k, d), dtype=np.float32)
+       self.search_and_reconstruct_c(n, swig_ptr(x),
+                                     k, swig_ptr(distances),
+                                     swig_ptr(labels),
+                                     swig_ptr(recons))
+       return distances, labels, recons
+
    def replacement_remove_ids(self, x):
        if isinstance(x, IDSelector):
            sel = x
@@ -167,6 +179,8 @@ def handle_Index(the_class):
    replace_method(the_class, 'range_search', replacement_range_search)
    replace_method(the_class, 'update_vectors', replacement_update_vectors,
                   ignore_missing=True)
+   replace_method(the_class, 'search_and_reconstruct',
+                  replacement_search_and_reconstruct, ignore_missing=True)

def handle_VectorTransform(the_class):
@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
    return index_cpu_to_gpu_multiple(vres, vdev, index, co)

-def vector_float_to_array(v):
-   a = np.empty(v.size(), dtype='float32')
-   memcpy(swig_ptr(a), v.data(), 4 * v.size())
+def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
+   if ngpu == -1:
+       ngpu = get_num_gpus()
+   res = [StandardGpuResources() for i in range(ngpu)]
+   index2 = index_cpu_to_gpu_multiple_py(res, index, co)
+   index2.dont_dealloc = res
+   return index2
+
+# mapping from vector names in swigfaiss.swig and the numpy dtype names
+vector_name_map = {
+   'Float': 'float32',
+   'Byte': 'uint8',
+   'Uint64': 'uint64',
+   'Long': 'int64',
+   'Int': 'int32',
+   'Double': 'float64'
+   }
+
+def vector_to_array(v):
+   """ convert a C++ vector to a numpy array """
+   classname = v.__class__.__name__
+   assert classname.endswith('Vector')
+   dtype = np.dtype(vector_name_map[classname[:-6]])
+   a = np.empty(v.size(), dtype=dtype)
+   memcpy(swig_ptr(a), v.data(), a.nbytes)
    return a

+def vector_float_to_array(v):
+   return vector_to_array(v)
+
+def copy_array_to_vector(a, v):
+   """ copy a numpy array to a vector """
+   n, = a.shape
+   classname = v.__class__.__name__
+   assert classname.endswith('Vector')
+   dtype = np.dtype(vector_name_map[classname[:-6]])
+   assert dtype == a.dtype, (
+       'cannot copy a %s array to a %s (should be %s)' % (
+           a.dtype, classname, dtype))
+   v.resize(n)
+   memcpy(v.data(), swig_ptr(a), a.nbytes)
+
class Kmeans:

    def __init__(self, d, k, niter=25, verbose=False, spherical = False):
@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):

def normalize_L2(x):
    fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
+
+def replacement_map_add(self, keys, vals):
+   n, = keys.shape
+   assert (n,) == vals.shape
+   self.add_c(n, swig_ptr(keys), swig_ptr(vals))
+
+def replacement_map_search_multiple(self, keys):
+   n, = keys.shape
+   vals = np.empty(n, dtype='uint64')
+   self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
+   return vals
+
+replace_method(MapLong2Long, 'add', replacement_map_add)
+replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
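A quick round trip with the new typed helpers (arbitrary values): vector_to_array infers the numpy dtype from the C++ vector class name, and copy_array_to_vector checks the dtype on the way in:

import numpy as np
import faiss

a = np.arange(10, dtype='int64')
v = faiss.LongVector()
faiss.copy_array_to_vector(a, v)  # 'Long' maps to int64, so this is accepted
b = faiss.vector_to_array(v)
assert np.array_equal(a, b)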
@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuAutoTune.h"
+#include <typeinfo>

#include "GpuIndex.h"
#include "../FaissAssert.h"
@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)

-GpuClonerOptions::GpuClonerOptions():
-    indicesOptions(INDICES_64_BIT),
-    useFloat16CoarseQuantizer(false),
-    useFloat16(false),
-    usePrecomputed(true),
-    reserveVecs(0),
-    storeTransposed(false),
-    verbose(0)
-{}
-
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
    GpuResources *resources;
    int device;
@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
    return cl.clone_Index(index);
}

-GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
-{}
-
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    std::vector<ToGpuCloner> sub_cloners;
@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    {}

+   void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
+                        long n, long i) {
+       if (shard_type == 2) {
+           long i0 = i * index_ivf->ntotal / n;
+           long i1 = (i + 1) * index_ivf->ntotal / n;
+
+           if(verbose)
+               printf("IndexShards shard %ld indices %ld:%ld\n",
+                      i, i0, i1);
+           index_ivf->copy_subset_to(*idx2, 2, i0, i1);
+           FAISS_ASSERT(idx2->ntotal == i1 - i0);
+       } else if (shard_type == 1) {
+           if(verbose)
+               printf("IndexShards shard %ld select modulo %ld = %ld\n",
+                      i, n, i);
+           index_ivf->copy_subset_to(*idx2, 1, n, i);
+       } else {
+           FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
+       }
+   }
+
    Index *clone_Index(const Index *index) override {
        long n = sub_cloners.size();
        if (n == 1)
@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                dynamic_cast<const faiss::IndexIVFPQ *>(index);
            auto index_ivfflat =
                dynamic_cast<const faiss::IndexIVFFlat *>(index);
-           FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
+           FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
                              "IndexShards implemented only for "
                              "IndexIVFFlat or IndexIVFPQ");
            std::vector<faiss::Index*> shards(n);

            for(long i = 0; i < n; i++) {
                // make a shallow copy
-               long i0 = i * index->ntotal / n;
-               long i1 = (i + 1) * index->ntotal / n;
-               if(verbose)
-                   printf("IndexShards shard %ld indices %ld:%ld\n",
-                          i, i0, i1);
                if(reserveVecs)
                    sub_cloners[i].reserveVecs =
                        (reserveVecs + n - 1) / n;
@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                    idx2.nprobe = index_ivfpq->nprobe;
                    idx2.use_precomputed_table = 0;
                    idx2.is_trained = index->is_trained;
-                   index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
-                   FAISS_ASSERT(idx2.ntotal == i1 - i0);
+                   copy_ivf_shard (index_ivfpq, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                } else if (index_ivfflat) {
                    faiss::IndexIVFFlat idx2(
                        index_ivfflat->quantizer, index->d,
                        index_ivfflat->nlist, index_ivfflat->metric_type);
                    idx2.nprobe = index_ivfflat->nprobe;
-                   index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
                    idx2.nprobe = index_ivfflat->nprobe;
+                   copy_ivf_shard (index_ivfflat, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                }
            }
            faiss::IndexShards *res =
                new faiss::IndexShards(index->d, true, false);
@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
        Index * index, const std::string & name, double val) const
{
-   if (DC (IndexPreTransform)) {
-       index = ix->index;
-   }
    if (DC (IndexProxy)) {
        for (int i = 0; i < ix->count(); i++)
            set_index_parameter (ix->at(i), name, val);
        return;
    }
-   if (DC (faiss::IndexShards)) {
-       for (auto sub_index : ix->shard_indexes)
-           set_index_parameter (sub_index, name, val);
-       return;
-   }
-   if (name == "nprobe") {
-       DC (GpuIndexIVF);
-       FAISS_ASSERT(ix);
-       ix->setNumProbes (int (val));
-       return;
-   }
-   if (name == "use_precomputed_table") {
-       DC (GpuIndexIVFPQ);
-       FAISS_ASSERT(ix);
-       ix->setPrecomputedCodes(bool (val));
-       return;
-   }
-   FAISS_ASSERT_MSG (false, "unknown parameter");
+   if (DC (GpuIndexIVF)) {
+       if (name == "nprobe") {
+           ix->setNumProbes (int (val));
+           return;
+       }
+   }
+   if(DC (GpuIndexIVFPQ)) {
+       if (name == "use_precomputed_table") {
+           ix->setPrecomputedCodes(bool (val));
+           return;
+       }
+   }
+
+   // maybe normal index parameters apply?
+   ParameterSpace::set_index_parameter (index, name, val);
}
......
@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}

GpuMultipleClonerOptions::GpuMultipleClonerOptions()
-   : shard(false) {
+   : shard(false),
+     shard_type(1)
+{
}

} } // namespace
@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
  /// Whether to shard the index across GPUs, versus replication
  /// across GPUs
  bool shard;
+
+ /// IndexIVF::copy_subset_to subset type
+ int shard_type;
};

} } // namespace
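How the new option is meant to be used (sketch, assumes several GPUs and arbitrary sizes): shard_type follows the subset_type convention of IndexIVF::copy_subset_to, so 1 assigns vectors by id modulo the number of shards and 2 splits the id range into contiguous chunks:

import numpy as np
import faiss

d = 64
xb = np.random.rand(100000, d).astype('float32')
cpu_index = faiss.index_factory(d, "IVF1024,Flat")
cpu_index.train(xb)
cpu_index.add(xb)

co = faiss.GpuMultipleClonerOptions()
co.shard = True    # shard across GPUs instead of replicating
co.shard_type = 2  # default is 1 (split by id modulo)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co)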
@@ -26,7 +26,7 @@ struct GpuIndexConfig {
  /// GPU device on which the index is resident
  int device;

- /// What memory space to use for primary storae.
+ /// What memory space to use for primary storage.
  /// On Pascal and above (CC 6+) architectures, allows GPUs to use
  /// more memory than is available on the GPU.
  MemorySpace memorySpace;
......
@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
  }

  index->quantizer = q;
- index->quantizer_trains_alone = false;
+ index->quantizer_trains_alone = 0;
  index->own_fields = true;
  index->cp = this->cp;
  index->ids.clear();
......
@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
  FAISS_ASSERT(index->pq.byte_per_idx == 1);
  FAISS_ASSERT(index->by_residual);
  FAISS_ASSERT(index->polysemous_ht == 0);
- ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;

  verifySettings_();
......
...@@ -23,6 +23,7 @@ CPPOBJ= GpuResources.o \ ...@@ -23,6 +23,7 @@ CPPOBJ= GpuResources.o \
IndexProxy.o \ IndexProxy.o \
StandardGpuResources.o \ StandardGpuResources.o \
GpuAutoTune.o \ GpuAutoTune.o \
GpuClonerOptions.o \
impl/RemapIndices.o \ impl/RemapIndices.o \
utils/DeviceMemory.o \ utils/DeviceMemory.o \
utils/StackDeviceMemory.o \ utils/StackDeviceMemory.o \
...@@ -123,23 +124,24 @@ clean: ...@@ -123,23 +124,24 @@ clean:
dep: dep:
for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \ for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \
$(patsubst %.o,%.cu,$(CUOBJ)); do \ $(patsubst %.o,%.cu,$(CUOBJ)); do \
echo -n $${i%/*}/ ; \ echo -n $$( dirname $$i )/ ; \
cpp -MM -std=gnu++0x $$i; \ cpp -MM -std=gnu++0x $$i; \
done done
GpuResources.cpp/GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
./GpuResources.o: GpuResources.cpp GpuResources.h utils/DeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \ utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h utils/../../FaissException.h
IndexProxy.cpp/IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \ ./IndexProxy.o: IndexProxy.cpp IndexProxy.h ../Index.h utils/WorkerThread.h \
../FaissAssert.h ../FaissException.h ../Clustering.h ../Index.h \ ../FaissAssert.h ../FaissException.h ../Clustering.h ../Index.h \
GpuIndexFlat.h GpuIndex.h utils/MemorySpace.h utils/../../FaissAssert.h \ GpuIndexFlat.h GpuIndex.h utils/MemorySpace.h utils/../../FaissAssert.h \
StandardGpuResources.h GpuResources.h utils/DeviceMemory.h \ StandardGpuResources.h GpuResources.h utils/DeviceMemory.h \
utils/StackDeviceMemory.h utils/DeviceUtils.h utils/StackDeviceMemory.h utils/DeviceUtils.h
StandardGpuResources.cpp/StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \ ./StandardGpuResources.o: StandardGpuResources.cpp StandardGpuResources.h \
GpuResources.h utils/DeviceMemory.h utils/StackDeviceMemory.h \ GpuResources.h utils/DeviceMemory.h utils/StackDeviceMemory.h \
utils/DeviceUtils.h utils/../../FaissAssert.h \ utils/DeviceUtils.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h utils/../../FaissException.h ../FaissAssert.h
GpuAutoTune.cpp/GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \ ./GpuAutoTune.o: GpuAutoTune.cpp GpuAutoTune.h ../Index.h ../AutoTune.h \
../Index.h GpuClonerOptions.h GpuIndicesOptions.h GpuIndex.h \ ../Index.h GpuClonerOptions.h GpuIndicesOptions.h GpuIndex.h \
utils/MemorySpace.h utils/../../FaissAssert.h \ utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../FaissAssert.h ../index_io.h \ utils/../../FaissException.h ../FaissAssert.h ../index_io.h \
...@@ -161,6 +163,8 @@ utils/DeviceUtils.o: utils/DeviceUtils.cpp utils/DeviceUtils.h \ ...@@ -161,6 +163,8 @@ utils/DeviceUtils.o: utils/DeviceUtils.cpp utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
utils/Timer.o: utils/Timer.cpp utils/Timer.h utils/DeviceUtils.h \ utils/Timer.o: utils/Timer.cpp utils/Timer.h utils/DeviceUtils.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
utils/MemorySpace.o: utils/MemorySpace.cpp utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h
utils/WorkerThread.o: utils/WorkerThread.cpp utils/WorkerThread.h \ utils/WorkerThread.o: utils/WorkerThread.cpp utils/WorkerThread.h \
utils/../../FaissAssert.h utils/../../FaissException.h utils/../../FaissAssert.h utils/../../FaissException.h
impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
...@@ -169,12 +173,14 @@ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \ ...@@ -169,12 +173,14 @@ impl/BroadcastSum.o: impl/BroadcastSum.cu impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \ impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \ impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \ impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \ impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/StaticUtils.h impl/../utils/StaticUtils.h
impl/Distance.o: impl/Distance.cu impl/Distance.cuh \ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \ impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \ impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \ impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \ impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/Float16.cuh \ impl/../utils/DeviceTensor-inl.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/BroadcastSum.cuh impl/L2Norm.cuh \ impl/../utils/../GpuResources.h impl/BroadcastSum.cuh impl/L2Norm.cuh \
...@@ -189,8 +195,9 @@ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \ ...@@ -189,8 +195,9 @@ impl/Distance.o: impl/Distance.cu impl/Distance.cuh \
impl/../utils/ReductionOperators.cuh impl/../utils/ReductionOperators.cuh
impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \ impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \ impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \ impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \ impl/../utils/DeviceMemory.h impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \ impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \ impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
...@@ -200,8 +207,9 @@ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \ ...@@ -200,8 +207,9 @@ impl/FlatIndex.o: impl/FlatIndex.cu impl/FlatIndex.cuh \
impl/InvertedListAppend.o: impl/InvertedListAppend.cu \
impl/InvertedListAppend.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../../FaissAssert.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../../FaissAssert.h \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
@@ -211,7 +219,8 @@ impl/IVFBase.o: impl/IVFBase.cu impl/IVFBase.cuh impl/../GpuIndicesOptions.h \
impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
impl/../utils/MemorySpace.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/DeviceMemory.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor-inl.cuh impl/../GpuResources.h \
impl/FlatIndex.cuh impl/../utils/Float16.cuh impl/InvertedListAppend.cuh \
impl/RemapIndices.h impl/../utils/DeviceDefs.cuh \
@@ -222,6 +231,7 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../GpuResources.h impl/FlatIndex.cuh impl/../utils/Float16.cuh \
impl/InvertedListAppend.cuh impl/IVFFlatScan.cuh impl/RemapIndices.h \
@@ -230,8 +240,9 @@ impl/IVFFlat.o: impl/IVFFlat.cu impl/IVFFlat.cuh impl/IVFBase.cuh \
impl/../utils/Transpose.cuh
impl/IVFFlatScan.o: impl/IVFFlatScan.cu impl/IVFFlatScan.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
@@ -247,6 +258,7 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/DeviceUtils.h impl/../utils/MemorySpace.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceMemory.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
impl/BroadcastSum.cuh impl/Distance.cuh impl/FlatIndex.cuh \
@@ -258,42 +270,46 @@ impl/IVFPQ.o: impl/IVFPQ.cu impl/IVFPQ.cuh impl/IVFBase.cuh \
impl/../utils/MatrixMult.cuh impl/../utils/Transpose.cuh
impl/IVFUtils.o: impl/IVFUtils.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/StaticUtils.h impl/../utils/ThrustAllocator.cuh
impl/IVFUtilsSelect1.o: impl/IVFUtilsSelect1.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
-impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
-impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
-impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
-impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
-impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
+impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
+impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
+impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
+impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
+impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
+impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
+impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
-impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
-impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
-impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
-impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
+impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
+impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/IVFUtilsSelect2.o: impl/IVFUtilsSelect2.cu impl/IVFUtils.cuh \
impl/../GpuIndicesOptions.h impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
-impl/../utils/Select.cuh impl/../utils/Comparators.cuh \
-impl/../utils/Float16.cuh impl/../utils/../GpuResources.h \
-impl/../utils/../utils/DeviceMemory.h impl/../utils/DeviceTensor.cuh \
-impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
-impl/../utils/DeviceDefs.cuh impl/../utils/MergeNetworkBlock.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
+impl/../utils/Limits.cuh impl/../utils/Float16.cuh \
+impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
+impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
+impl/../utils/DeviceTensor-inl.cuh impl/../utils/Pair.cuh \
+impl/../utils/MathOperators.cuh impl/../utils/WarpShuffles.cuh \
+impl/../utils/DeviceDefs.cuh impl/../utils/Select.cuh \
+impl/../utils/Comparators.cuh impl/../utils/MergeNetworkBlock.cuh \
impl/../utils/MergeNetworkUtils.cuh impl/../utils/PtxUtils.cuh \
-impl/../utils/StaticUtils.h impl/../utils/WarpShuffles.cuh \
-impl/../utils/MergeNetworkWarp.cuh impl/../utils/Reductions.cuh \
-impl/../utils/ReductionOperators.cuh impl/../utils/Limits.cuh \
-impl/../utils/Pair.cuh impl/../utils/MathOperators.cuh
+impl/../utils/StaticUtils.h impl/../utils/MergeNetworkWarp.cuh \
+impl/../utils/Reductions.cuh impl/../utils/ReductionOperators.cuh
impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/ConversionOperators.cuh \
impl/../utils/DeviceDefs.cuh impl/../utils/MathOperators.cuh \
@@ -304,8 +320,9 @@ impl/L2Norm.o: impl/L2Norm.cu impl/L2Norm.cuh impl/../utils/Float16.cuh \
impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/../../FaissAssert.h \
-impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/MemorySpace.h impl/../utils/DeviceTensor-inl.cuh \
impl/../../FaissAssert.h impl/../utils/MathOperators.cuh \
impl/../utils/Pair.cuh impl/../utils/WarpShuffles.cuh \
@@ -317,8 +334,9 @@ impl/L2Select.o: impl/L2Select.cu impl/L2Select.cuh impl/../utils/Float16.cuh \
impl/../utils/MergeNetworkWarp.cuh
impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/BroadcastSum.cuh impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
@@ -329,8 +347,9 @@ impl/PQCodeDistances.o: impl/PQCodeDistances.cu impl/PQCodeDistances.cuh \
impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassNoPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../GpuResources.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../GpuResources.h \
impl/../utils/DeviceMemory.h impl/PQCodeDistances.cuh \
impl/../utils/NoTypeTensor.cuh impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
@@ -342,8 +361,9 @@ impl/PQScanMultiPassNoPrecomputed.o: impl/PQScanMultiPassNoPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/PQScanMultiPassPrecomputed.cuh impl/../GpuIndicesOptions.h \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/NoTypeTensor.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/NoTypeTensor.cuh \
impl/../GpuResources.h impl/../utils/DeviceMemory.h impl/PQCodeLoad.cuh \
impl/../utils/PtxUtils.cuh impl/IVFUtils.cuh \
impl/../utils/ConversionOperators.cuh impl/../utils/Float16.cuh \
@@ -352,33 +372,36 @@ impl/PQScanMultiPassPrecomputed.o: impl/PQScanMultiPassPrecomputed.cu \
impl/../utils/MathOperators.cuh impl/../utils/StaticUtils.h
impl/VectorResidual.o: impl/VectorResidual.cu impl/VectorResidual.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/../../FaissException.h \
-impl/../utils/DeviceUtils.h impl/../utils/Float16.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/../../FaissException.h impl/../utils/DeviceUtils.h \
+impl/../utils/../../FaissAssert.h impl/../utils/Float16.cuh \
impl/../utils/../GpuResources.h impl/../utils/../utils/DeviceMemory.h \
impl/../utils/DeviceTensor.cuh impl/../utils/MemorySpace.h \
impl/../utils/DeviceTensor-inl.cuh impl/../../FaissAssert.h \
impl/../utils/ConversionOperators.cuh impl/../utils/StaticUtils.h
-GpuIndex.cu/GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
+./GpuIndex.o: GpuIndex.cu GpuIndex.h ../Index.h utils/MemorySpace.h \
utils/../../FaissAssert.h utils/../../FaissException.h ../FaissAssert.h \
GpuResources.h utils/DeviceMemory.h utils/DeviceUtils.h
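# Note on the GpuIndex* rules: the targets were previously emitted as
# "GpuIndex.cu/GpuIndex.o"-style paths, which name a directory that does not
# exist and therefore never match the objects make actually builds; this
# rule and the four below are corrected to "./GpuIndex.o". A minimal sketch
# of regenerating one such rule, assuming nvcc's gcc-style -M flag plus a
# sed rewrite of the emitted target; this is an illustration, not the repo's
# actual recipe, and NVCCFLAGS is a placeholder variable:
#   nvcc $(NVCCFLAGS) -M GpuIndex.cu | sed 's|^[^:]*\.o *:|./GpuIndex.o:|' > GpuIndex.d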
-GpuIndexFlat.cu/GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
+./GpuIndexFlat.o: GpuIndexFlat.cu GpuIndexFlat.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h ../IndexFlat.h ../Index.h GpuResources.h \
utils/DeviceMemory.h impl/FlatIndex.cuh impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
-impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
+impl/../utils/DeviceUtils.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh impl/../utils/DeviceVector.cuh \
impl/../utils/StaticUtils.h impl/../utils/Float16.cuh \
utils/CopyUtils.cuh utils/HostTensor.cuh utils/HostTensor-inl.cuh
-GpuIndexIVF.cu/GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
+./GpuIndexIVF.o: GpuIndexIVF.cu GpuIndexIVF.h GpuIndex.h ../Index.h \
utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../FaissAssert.h ../IndexFlat.h ../IndexIVF.h \
../Clustering.h ../Heap.h utils/DeviceUtils.h utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
+utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
utils/DeviceTensor-inl.cuh
-GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
+./GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVF.h ../Clustering.h \
@@ -387,9 +410,10 @@ GpuIndexIVFFlat.cu/GpuIndexIVFFlat.o: GpuIndexIVFFlat.cu GpuIndexIVFFlat.h GpuIn
impl/../utils/../../FaissAssert.h impl/../utils/DeviceUtils.h \
impl/../utils/StaticUtils.h impl/../utils/DeviceTensor.cuh \
impl/../utils/Tensor.cuh impl/../utils/Tensor-inl.cuh \
+impl/../utils/../GpuFaissAssert.h impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceTensor-inl.cuh utils/CopyUtils.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh utils/Float16.cuh
-GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
+./GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h \
GpuIndex.h ../Index.h utils/MemorySpace.h utils/../../FaissAssert.h \
utils/../../FaissException.h GpuIndexFlat.h GpuIndicesOptions.h \
../Clustering.h ../Index.h ../IndexFlat.h ../IndexIVFPQ.h ../IndexIVF.h \
@@ -399,19 +423,22 @@ GpuIndexIVFPQ.cu/GpuIndexIVFPQ.o: GpuIndexIVFPQ.cu GpuIndexIVFPQ.h GpuIndexIVF.h
impl/../utils/DeviceVector.cuh impl/../utils/../../FaissAssert.h \
impl/../utils/DeviceUtils.h impl/../utils/StaticUtils.h \
impl/../utils/DeviceTensor.cuh impl/../utils/Tensor.cuh \
-impl/../utils/Tensor-inl.cuh impl/../utils/DeviceTensor-inl.cuh \
+impl/../utils/Tensor-inl.cuh impl/../utils/../GpuFaissAssert.h \
+impl/../utils/../../FaissAssert.h impl/../utils/DeviceTensor-inl.cuh \
impl/../utils/Float16.cuh utils/CopyUtils.cuh utils/HostTensor.cuh \
utils/HostTensor-inl.cuh
utils/Float16.o: utils/Float16.cu utils/Float16.cuh utils/../GpuResources.h \
utils/../utils/DeviceMemory.h utils/DeviceTensor.cuh utils/Tensor.cuh \
-utils/Tensor-inl.cuh utils/../../FaissAssert.h \
-utils/../../FaissException.h utils/DeviceUtils.h utils/MemorySpace.h \
-utils/DeviceTensor-inl.cuh utils/nvidia/fp16_emu.cuh
+utils/Tensor-inl.cuh utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
+utils/../../FaissException.h utils/DeviceUtils.h \
+utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
+utils/nvidia/fp16_emu.cuh
utils/MatrixMult.o: utils/MatrixMult.cu utils/MatrixMult.cuh utils/Float16.cuh \
utils/../GpuResources.h utils/../utils/DeviceMemory.h \
utils/DeviceTensor.cuh utils/Tensor.cuh utils/Tensor-inl.cuh \
-utils/../../FaissAssert.h utils/../../FaissException.h \
-utils/DeviceUtils.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
+utils/../GpuFaissAssert.h utils/../../FaissAssert.h \
+utils/../../FaissException.h utils/DeviceUtils.h \
+utils/../../FaissAssert.h utils/MemorySpace.h utils/DeviceTensor-inl.cuh \
utils/HostTensor.cuh utils/HostTensor-inl.cuh
utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/BlockSelectImpl.cuh \
@@ -420,9 +447,12 @@ utils/BlockSelectFloat.o: utils/BlockSelectFloat.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -442,9 +472,12 @@ utils/BlockSelectHalf.o: utils/BlockSelectHalf.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -464,9 +497,12 @@ utils/WarpSelectFloat.o: utils/WarpSelectFloat.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -485,9 +521,12 @@ utils/WarpSelectHalf.o: utils/WarpSelectHalf.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -507,9 +546,12 @@ utils/blockselect/BlockSelectHalf1.o: utils/blockselect/BlockSelectHalf1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -529,9 +571,12 @@ utils/blockselect/BlockSelectFloat1.o: utils/blockselect/BlockSelectFloat1.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -551,9 +596,12 @@ utils/warpselect/WarpSelectHalf1.o: utils/warpselect/WarpSelectHalf1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -572,9 +620,12 @@ utils/warpselect/WarpSelectFloat1.o: utils/warpselect/WarpSelectFloat1.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -593,9 +644,12 @@ utils/blockselect/BlockSelectHalf32.o: utils/blockselect/BlockSelectHalf32.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -615,9 +669,12 @@ utils/blockselect/BlockSelectFloat32.o: utils/blockselect/BlockSelectFloat32.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -637,9 +694,12 @@ utils/warpselect/WarpSelectHalf32.o: utils/warpselect/WarpSelectHalf32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -658,9 +718,12 @@ utils/warpselect/WarpSelectFloat32.o: utils/warpselect/WarpSelectFloat32.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -679,9 +742,12 @@ utils/blockselect/BlockSelectHalf64.o: utils/blockselect/BlockSelectHalf64.cu \
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -701,9 +767,12 @@ utils/blockselect/BlockSelectFloat64.o: utils/blockselect/BlockSelectFloat64.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -723,9 +792,12 @@ utils/warpselect/WarpSelectHalf64.o: utils/warpselect/WarpSelectHalf64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -744,9 +816,12 @@ utils/warpselect/WarpSelectFloat64.o: utils/warpselect/WarpSelectFloat64.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -765,9 +840,12 @@ utils/blockselect/BlockSelectHalf128.o: utils/blockselect/BlockSelectHalf128.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -787,9 +865,12 @@ utils/blockselect/BlockSelectFloat128.o: utils/blockselect/BlockSelectFloat128.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -809,9 +890,12 @@ utils/warpselect/WarpSelectHalf128.o: utils/warpselect/WarpSelectHalf128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -830,9 +914,12 @@ utils/warpselect/WarpSelectFloat128.o: utils/warpselect/WarpSelectFloat128.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -851,9 +938,12 @@ utils/blockselect/BlockSelectHalf256.o: utils/blockselect/BlockSelectHalf256.cu
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -873,9 +963,12 @@ utils/blockselect/BlockSelectFloat256.o: utils/blockselect/BlockSelectFloat256.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -895,9 +988,12 @@ utils/warpselect/WarpSelectHalf256.o: utils/warpselect/WarpSelectHalf256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -916,9 +1012,12 @@ utils/warpselect/WarpSelectFloat256.o: utils/warpselect/WarpSelectFloat256.cu \
utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \
+utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \
-utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \
+utils/warpselect/../DeviceUtils.h \
+utils/warpselect/../../../FaissAssert.h \
+utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \
@@ -937,9 +1036,12 @@ utils/blockselect/BlockSelectHalfF512.o: utils/blockselect/BlockSelectHalfF512.c
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
@@ -959,9 +1061,12 @@ utils/blockselect/BlockSelectFloatF512.o: utils/blockselect/BlockSelectFloatF512
utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \
+utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \
-utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \
+utils/blockselect/../DeviceUtils.h \
+utils/blockselect/../../../FaissAssert.h \
+utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \
...@@ -981,9 +1086,12 @@ utils/warpselect/WarpSelectHalfF512.o: utils/warpselect/WarpSelectHalfF512.cu \ ...@@ -981,9 +1086,12 @@ utils/warpselect/WarpSelectHalfF512.o: utils/warpselect/WarpSelectHalfF512.cu \
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1002,9 +1110,12 @@ utils/warpselect/WarpSelectFloatF512.o: utils/warpselect/WarpSelectFloatF512.cu ...@@ -1002,9 +1110,12 @@ utils/warpselect/WarpSelectFloatF512.o: utils/warpselect/WarpSelectFloatF512.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1023,9 +1134,12 @@ utils/blockselect/BlockSelectHalfT512.o: utils/blockselect/BlockSelectHalfT512.c ...@@ -1023,9 +1134,12 @@ utils/blockselect/BlockSelectHalfT512.o: utils/blockselect/BlockSelectHalfT512.c
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1045,9 +1159,12 @@ utils/blockselect/BlockSelectFloatT512.o: utils/blockselect/BlockSelectFloatT512 ...@@ -1045,9 +1159,12 @@ utils/blockselect/BlockSelectFloatT512.o: utils/blockselect/BlockSelectFloatT512
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1067,9 +1184,12 @@ utils/warpselect/WarpSelectHalfT512.o: utils/warpselect/WarpSelectHalfT512.cu \ ...@@ -1067,9 +1184,12 @@ utils/warpselect/WarpSelectHalfT512.o: utils/warpselect/WarpSelectHalfT512.cu \
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1088,9 +1208,12 @@ utils/warpselect/WarpSelectFloatT512.o: utils/warpselect/WarpSelectFloatT512.cu ...@@ -1088,9 +1208,12 @@ utils/warpselect/WarpSelectFloatT512.o: utils/warpselect/WarpSelectFloatT512.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1109,9 +1232,12 @@ utils/blockselect/BlockSelectHalfF1024.o: utils/blockselect/BlockSelectHalfF1024 ...@@ -1109,9 +1232,12 @@ utils/blockselect/BlockSelectHalfF1024.o: utils/blockselect/BlockSelectHalfF1024
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1131,9 +1257,12 @@ utils/blockselect/BlockSelectFloatF1024.o: utils/blockselect/BlockSelectFloatF10 ...@@ -1131,9 +1257,12 @@ utils/blockselect/BlockSelectFloatF1024.o: utils/blockselect/BlockSelectFloatF10
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1153,9 +1282,12 @@ utils/warpselect/WarpSelectHalfF1024.o: utils/warpselect/WarpSelectHalfF1024.cu ...@@ -1153,9 +1282,12 @@ utils/warpselect/WarpSelectHalfF1024.o: utils/warpselect/WarpSelectHalfF1024.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1174,9 +1306,12 @@ utils/warpselect/WarpSelectFloatF1024.o: utils/warpselect/WarpSelectFloatF1024.c ...@@ -1174,9 +1306,12 @@ utils/warpselect/WarpSelectFloatF1024.o: utils/warpselect/WarpSelectFloatF1024.c
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1195,9 +1330,12 @@ utils/blockselect/BlockSelectHalfT1024.o: utils/blockselect/BlockSelectHalfT1024 ...@@ -1195,9 +1330,12 @@ utils/blockselect/BlockSelectHalfT1024.o: utils/blockselect/BlockSelectHalfT1024
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1217,9 +1355,12 @@ utils/blockselect/BlockSelectFloatT1024.o: utils/blockselect/BlockSelectFloatT10 ...@@ -1217,9 +1355,12 @@ utils/blockselect/BlockSelectFloatT1024.o: utils/blockselect/BlockSelectFloatT10
utils/blockselect/../../utils/DeviceMemory.h \ utils/blockselect/../../utils/DeviceMemory.h \
utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \ utils/blockselect/../DeviceTensor.cuh utils/blockselect/../Tensor.cuh \
utils/blockselect/../Tensor-inl.cuh \ utils/blockselect/../Tensor-inl.cuh \
utils/blockselect/../../GpuFaissAssert.h \
utils/blockselect/../../../FaissAssert.h \ utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../../../FaissException.h \ utils/blockselect/../../../FaissException.h \
utils/blockselect/../DeviceUtils.h utils/blockselect/../MemorySpace.h \ utils/blockselect/../DeviceUtils.h \
utils/blockselect/../../../FaissAssert.h \
utils/blockselect/../MemorySpace.h \
utils/blockselect/../DeviceTensor-inl.cuh \ utils/blockselect/../DeviceTensor-inl.cuh \
utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \ utils/blockselect/../Select.cuh utils/blockselect/../Comparators.cuh \
utils/blockselect/../DeviceDefs.cuh \ utils/blockselect/../DeviceDefs.cuh \
...@@ -1239,9 +1380,12 @@ utils/warpselect/WarpSelectHalfT1024.o: utils/warpselect/WarpSelectHalfT1024.cu ...@@ -1239,9 +1380,12 @@ utils/warpselect/WarpSelectHalfT1024.o: utils/warpselect/WarpSelectHalfT1024.cu
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
...@@ -1260,9 +1404,12 @@ utils/warpselect/WarpSelectFloatT1024.o: utils/warpselect/WarpSelectFloatT1024.c ...@@ -1260,9 +1404,12 @@ utils/warpselect/WarpSelectFloatT1024.o: utils/warpselect/WarpSelectFloatT1024.c
utils/warpselect/../../utils/DeviceMemory.h \ utils/warpselect/../../utils/DeviceMemory.h \
utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \ utils/warpselect/../DeviceTensor.cuh utils/warpselect/../Tensor.cuh \
utils/warpselect/../Tensor-inl.cuh \ utils/warpselect/../Tensor-inl.cuh \
utils/warpselect/../../GpuFaissAssert.h \
utils/warpselect/../../../FaissAssert.h \ utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../../../FaissException.h \ utils/warpselect/../../../FaissException.h \
utils/warpselect/../DeviceUtils.h utils/warpselect/../MemorySpace.h \ utils/warpselect/../DeviceUtils.h \
utils/warpselect/../../../FaissAssert.h \
utils/warpselect/../MemorySpace.h \
utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \ utils/warpselect/../DeviceTensor-inl.cuh utils/warpselect/../Select.cuh \
utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \ utils/warpselect/../Comparators.cuh utils/warpselect/../DeviceDefs.cuh \
utils/warpselect/../MergeNetworkBlock.cuh \ utils/warpselect/../MergeNetworkBlock.cuh \
......
...@@ -29,44 +29,107 @@ namespace faiss { namespace gpu { ...@@ -29,44 +29,107 @@ namespace faiss { namespace gpu {
namespace { namespace {
constexpr int kDefaultTileSize = 256; template <typename T>
Tensor<T, 2, true> sliceCentroids(Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
int startCentroid,
int num) {
if (startCentroid == 0 && num == centroids.getSize(0)) {
if (centroidsTransposed) {
return *centroidsTransposed;
} else {
return centroids;
}
}
if (centroidsTransposed) {
// (dim, num)
return centroidsTransposed->narrow(1, startCentroid, num);
} else {
return centroids.narrow(0, startCentroid, num);
}
}
// For each chunk of k indices, increment the index by chunk * increment
template <typename T> template <typename T>
int chooseTileSize(int tileSizeOverride, __global__ void incrementIndex(Tensor<T, 2, true> indices,
size_t numCentroids, int k,
size_t tempMemAvailable) { int increment) {
if (tileSizeOverride > 0) { for (int i = threadIdx.x; i < k; i += blockDim.x) {
return tileSizeOverride; indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment;
} }
}
size_t tileSize = // Used to update result indices in the distance computation when the number
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize; // of centroids is large and the computation is tiled over them
template <typename T>
void runIncrementIndex(Tensor<T, 2, true>& indices,
int k,
int increment,
cudaStream_t stream) {
dim3 grid(indices.getSize(1) / k, indices.getSize(0));
int block = std::min(k, 512);
while (tileSize > 64) { // should be exact
size_t memRequirement = 2 * tileSize * numCentroids * sizeof(T); FAISS_ASSERT(grid.x * k == indices.getSize(1));
if (memRequirement <= tempMemAvailable) { incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);
// This fits entirely into our temporary memory
return tileSize;
}
// Otherwise, halve the tile size cudaDeviceSynchronize();
tileSize /= 2; }
// If the inner size (dim) of the vectors is small, we want a larger query tile
// size, like 1024
void chooseTileSize(int numQueries,
int numCentroids,
int dim,
int elementSize,
size_t tempMemAvailable,
int& tileRows,
int& tileCols) {
// The matrix multiplication should be large enough to be efficient, but if it
// is too large, we seem to lose efficiency as opposed to double-streaming.
// Each tile size here defines 1/2 of the memory use due to double streaming.
// We ignore available temporary memory, as that is adjusted independently by
// the user and can thus meet these requirements (or not).
// For <= 4 GB GPUs, prefer 512 MB of usage.
// For <= 8 GB GPUs, prefer 768 MB of usage.
// Otherwise, prefer 1 GB of usage.
auto totalMem = getCurrentDeviceProperties().totalGlobalMem;
int targetUsage = 0;
if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
targetUsage = 512 * 1024 * 1024;
} else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
targetUsage = 768 * 1024 * 1024;
} else {
targetUsage = 1024 * 1024 * 1024;
} }
// We use 64 as the minimum acceptable tile size targetUsage /= 2 * elementSize;
FAISS_ASSERT(tileSize >= 64);
// FIXME: if we're running with no available temp memory, do we try // 512 seems to be a batch-size sweet spot for float32.
// and go larger based on free memory available on the device? // If the k size (vec dim) of the matrix multiplication is small (<= 32),
// increase the preferred row tile to 1024.
int preferredTileRows = 512;
if (dim <= 32) {
preferredTileRows = 1024;
}
return tileSize; tileRows = std::min(preferredTileRows, numQueries);
// tileCols is the remainder size
tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
} }
} }
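
A note on the tile-size heuristic above: it reduces to a few integer divisions. The sketch below mirrors that arithmetic on the host (standalone C++; the totalMem parameter stands in for getCurrentDeviceProperties().totalGlobalMem, and the function name is ours, not part of the library):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Sketch of the heuristic in Distance.cu's chooseTileSize: pick a target
    // temp-memory budget from total GPU memory, halve it for double-buffering,
    // then derive the centroid (column) tile from the preferred query (row) tile.
    void chooseTileSizeSketch(int numQueries, int numCentroids, int dim,
                              int elementSize, size_t totalMem,
                              int& tileRows, int& tileCols) {
      int targetUsage = 0;
      if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) {
        targetUsage = 512 * 1024 * 1024;   // <= 4 GB GPUs: prefer 512 MB
      } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) {
        targetUsage = 768 * 1024 * 1024;   // <= 8 GB GPUs: prefer 768 MB
      } else {
        targetUsage = 1024 * 1024 * 1024;  // otherwise: prefer 1 GB
      }
      targetUsage /= 2 * elementSize;      // two tile buffers are in flight

      // Small inner dim: use a taller query tile so the GEMM stays efficient
      int preferredTileRows = (dim <= 32) ? 1024 : 512;

      tileRows = std::min(preferredTileRows, numQueries);
      tileCols = std::min(targetUsage / preferredTileRows, numCentroids);
    }

    int main() {
      int rows = 0, cols = 0;
      // 10k float32 queries vs 1M centroids, dim 128, on a 16 GB device
      chooseTileSizeSketch(10000, 1000000, 128, 4, ((size_t) 16) << 30,
                           rows, cols);
      printf("tile: %d x %d\n", rows, cols);  // prints: tile: 512 x 262144
      return 0;
    }
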
template <typename T> template <typename T>
void runL2Distance(GpuResources* resources, void runDistance(bool computeL2,
GpuResources* resources,
Tensor<T, 2, true>& centroids, Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed, Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms, Tensor<T, 1, true>* centroidNorms,
...@@ -75,8 +138,7 @@ void runL2Distance(GpuResources* resources, ...@@ -75,8 +138,7 @@ void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& outDistances, Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances) {
int tileSizeOverride = -1) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outDistances.getSize(1) == k); FAISS_ASSERT(outDistances.getSize(1) == k);
...@@ -98,9 +160,9 @@ void runL2Distance(GpuResources* resources, ...@@ -98,9 +160,9 @@ void runL2Distance(GpuResources* resources,
return; return;
} }
// If ||c||^2 is not pre-computed, calculate it // L2: If ||c||^2 is not pre-computed, calculate it
DeviceTensor<T, 1, true> cNorms; DeviceTensor<T, 1, true> cNorms;
if (!centroidNorms) { if (computeL2 && !centroidNorms) {
cNorms = std::move(DeviceTensor<T, 1, true>( cNorms = std::move(DeviceTensor<T, 1, true>(
mem, mem,
{centroids.getSize(0)}, defaultStream)); {centroids.getSize(0)}, defaultStream));
...@@ -115,68 +177,111 @@ void runL2Distance(GpuResources* resources, ...@@ -115,68 +177,111 @@ void runL2Distance(GpuResources* resources,
DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream); DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);
// ||q||^2 // ||q||^2
if (computeL2) {
runL2Norm(queries, queryNorms, true, defaultStream); runL2Norm(queries, queryNorms, true, defaultStream);
}
// // By default, aim to use 512 MB to 1 GB of memory for the processing
// Handle the problem in row tiles, to avoid excessive temporary // (scaled by total GPU memory), tiling over both queries and centroids.
// memory requests int tileRows = 0;
// int tileCols = 0;
chooseTileSize(queries.getSize(0),
centroids.getSize(0),
queries.getSize(1),
sizeof(T),
mem.getSizeAvailable(),
tileRows,
tileCols);
int numColTiles = utils::divUp(centroids.getSize(0), tileCols);
FAISS_ASSERT(k <= centroids.getSize(0)); FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use // Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1( DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream); mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2( DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream); mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] = DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2}; {&distanceBuf1, &distanceBuf2};
DeviceTensor<T, 2, true> outDistanceBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true> outDistanceBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<T, 2, true>* outDistanceBufs[2] =
{&outDistanceBuf1, &outDistanceBuf2};
DeviceTensor<int, 2, true> outIndexBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true> outIndexBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true>* outIndexBufs[2] =
{&outIndexBuf1, &outIndexBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice(); auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream}); streamWait(streams, {defaultStream});
int curStream = 0; int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) { // Tile over the input queries
int numQueriesForIteration = std::min(maxQueriesPerIteration, for (int i = 0; i < queries.getSize(0); i += tileRows) {
queries.getSize(0) - i); int curQuerySize = std::min(tileRows, queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView = auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration); outDistances.narrow(0, i, curQuerySize);
auto outIndexView = auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration); outIndices.narrow(0, i, curQuerySize);
auto queryView =
queries.narrow(0, i, curQuerySize);
auto queryNormView = auto queryNormView =
queryNorms.narrowOutermost(i, numQueriesForIteration); queryNorms.narrow(0, i, curQuerySize);
auto outDistanceBufRowView =
outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
auto outIndexBufRowView =
outIndexBufs[curStream]->narrow(0, 0, curQuerySize);
// Tile over the centroids
for (int j = 0; j < centroids.getSize(0); j += tileCols) {
int curCentroidSize = std::min(tileCols, centroids.getSize(0) - j);
int curColTile = j / tileCols;
// L2 distance is ||c||^2 - 2qc + ||q||^2 auto centroidsView =
sliceCentroids(centroids, centroidsTransposed, j, curCentroidSize);
// -2qc auto distanceBufView = distanceBufs[curStream]->
narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);
auto outDistanceBufColView =
outDistanceBufRowView.narrow(1, k * curColTile, k);
auto outIndexBufColView =
outIndexBufRowView.narrow(1, k * curColTile, k);
// L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
// IP: just compute qc
// (query id x dim) x (centroid id, dim)' = (query id, centroid id) // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false, runMatrixMult(distanceBufView, false,
queryView, false, queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids, centroidsView,
centroidsTransposed ? false : true, centroidsTransposed ? false : true,
-2.0f, 0.0f, useHgemm, computeL2 ? -2.0f : 1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(), resources->getBlasHandleCurrentDevice(),
streams[curStream]); streams[curStream]);
if (computeL2) {
// For L2 distance, we use this fused kernel that performs both // For L2 distance, we use this fused kernel that performs both
// adding ||c||^2 to -2qc and k-selection, so we only need two // adding ||c||^2 to -2qc and k-selection, so we only need two
// passes (one write by the gemm, one read here) over the huge // passes (one write by the gemm, one read here) over the huge
// region of output memory // region of output memory
//
// If we aren't tiling along the number of centroids, we can perform the
// output work directly
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runL2SelectMin(distanceBufView, runL2SelectMin(distanceBufView,
*centroidNorms, *centroidNorms,
outDistanceView, outDistanceView,
...@@ -189,6 +294,57 @@ void runL2Distance(GpuResources* resources, ...@@ -189,6 +294,57 @@ void runL2Distance(GpuResources* resources,
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k) // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView, outDistanceView, streams[curStream]); runSumAlongRows(queryNormView, outDistanceView, streams[curStream]);
} }
} else {
auto centroidNormsView =
centroidNorms->narrow(0, j, curCentroidSize);
// Write into our intermediate output
runL2SelectMin(distanceBufView,
centroidNormsView,
outDistanceBufColView,
outIndexBufColView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormView,
outDistanceBufColView,
streams[curStream]);
}
}
} else {
// For IP, just k-select the output for this tile
if (tileCols == centroids.getSize(0)) {
// Write into the final output
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
} else {
// Write into the intermediate output
runBlockSelect(distanceBufView,
outDistanceBufColView,
outIndexBufColView,
true, k, streams[curStream]);
}
}
}
// As we're finished with processing a full set of centroids, perform the
// final k-selection
if (tileCols != centroids.getSize(0)) {
// The indices are tile-relative; for each tile of k, we need to add
// tileCols to the index
runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);
runBlockSelectPair(outDistanceBufRowView,
outIndexBufRowView,
outDistanceView,
outIndexView,
computeL2 ? false : true, k, streams[curStream]);
}
curStream = (curStream + 1) % 2; curStream = (curStream + 1) % 2;
} }
...@@ -198,98 +354,49 @@ void runL2Distance(GpuResources* resources, ...@@ -198,98 +354,49 @@ void runL2Distance(GpuResources* resources,
} }
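
To make the tiled k-selection at the end of runDistance concrete: each column tile contributes its own top-k with tile-local indices, runIncrementIndex shifts each chunk of k indices by its tile's offset, and the final runBlockSelectPair selects over the concatenated numColTiles * k candidates. A scalar sketch of that merge (plain C++, no CUDA; names and sizes are illustrative):

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      const int k = 2, tileCols = 4;

      // One query row: per-tile top-k (distance, tile-local index) pairs,
      // from two centroid tiles of tileCols = 4 centroids each
      std::vector<std::pair<float, int>> cand = {
          {0.5f, 3}, {0.9f, 0},   // tile 0 results, local ids in [0, 4)
          {0.2f, 1}, {0.7f, 2}};  // tile 1 results, local ids in [0, 4)

      // Equivalent of runIncrementIndex: chunk i of k indices gets shifted by
      // (i / k) * tileCols, turning tile-local ids into global centroid ids
      for (int i = 0; i < (int) cand.size(); ++i) {
        cand[i].second += (i / k) * tileCols;
      }

      // Equivalent of the final runBlockSelectPair (min-select for L2):
      // k smallest over all numColTiles * k candidates
      std::partial_sort(cand.begin(), cand.begin() + k, cand.end());

      for (int i = 0; i < k; ++i) {
        printf("d=%.1f id=%d\n", cand[i].first, cand[i].second);
      }
      // prints: d=0.2 id=5
      //         d=0.5 id=3
      return 0;
    }
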
template <typename T> template <typename T>
void runIPDistance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& centroids, Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed, Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 1, true>* centroidNorms,
Tensor<T, 2, true>& queries, Tensor<T, 2, true>& queries,
int k, int k,
Tensor<T, 2, true>& outDistances, Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
int tileSizeOverride = -1) { bool ignoreOutDistances = false) {
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); runDistance<T>(true, // L2
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); resources,
FAISS_ASSERT(outDistances.getSize(1) == k); centroids,
FAISS_ASSERT(outIndices.getSize(1) == k); centroidsTransposed,
centroidNorms,
auto& mem = resources->getMemoryManagerCurrentDevice(); queries,
auto defaultStream = resources->getDefaultStreamCurrentDevice(); k,
outDistances,
// If we're querying against a 0-sized set, just return empty results outIndices,
if (centroids.numElements() == 0) { useHgemm,
thrust::fill(thrust::cuda::par.on(defaultStream), ignoreOutDistances);
outDistances.data(), outDistances.end(), }
Limits<T>::getMax());
thrust::fill(thrust::cuda::par.on(defaultStream),
outIndices.data(), outIndices.end(),
-1);
return;
}
//
// Handle the problem in row tiles, to avoid excessive temporary
// memory requests
//
FAISS_ASSERT(k <= centroids.getSize(0));
FAISS_ASSERT(k <= 1024); // select limitation
int tileSize =
chooseTileSize<T>(
tileSizeOverride,
centroids.getSize(0),
resources->getMemoryManagerCurrentDevice().getSizeAvailable());
int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Temporary output memory space we'll use
DeviceTensor<T, 2, true> distanceBuf1(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true> distanceBuf2(
mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
DeviceTensor<T, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
int numQueriesForIteration = std::min(maxQueriesPerIteration,
queries.getSize(0) - i);
auto distanceBufView =
distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
auto queryView =
queries.narrowOutermost(i, numQueriesForIteration);
auto outDistanceView =
outDistances.narrowOutermost(i, numQueriesForIteration);
auto outIndexView =
outIndices.narrowOutermost(i, numQueriesForIteration);
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView, false,
queryView, false,
centroidsTransposed ? *centroidsTransposed : centroids,
centroidsTransposed ? false : true,
1.0f, 0.0f, useHgemm,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
// top-k of dot products
// (query id, top k centroids)
runBlockSelect(distanceBufView,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
curStream = (curStream + 1) % 2;
}
streamWait({defaultStream}, streams); template <typename T>
void runIPDistance(GpuResources* resources,
Tensor<T, 2, true>& centroids,
Tensor<T, 2, true>* centroidsTransposed,
Tensor<T, 2, true>& queries,
int k,
Tensor<T, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm) {
runDistance<T>(false, // IP
resources,
centroids,
centroidsTransposed,
nullptr,
queries,
k,
outDistances,
outIndices,
useHgemm,
false);
} }
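
Both wrappers above now forward to the shared runDistance. The L2 path relies on the expansion ||q - c||^2 = ||q||^2 - 2*q.c + ||c||^2: the GEMM produces -2*q.c, runL2SelectMin folds in ||c||^2, and runSumAlongRows adds ||q||^2. A tiny standalone check of the identity (plain C++, toy values):

    #include <cstdio>

    int main() {
      // 2-d example: query q, centroid c
      float q[2] = {1.0f, 2.0f};
      float c[2] = {3.0f, -1.0f};

      // Direct squared L2 distance
      float direct = 0.0f;
      for (int i = 0; i < 2; ++i) {
        float d = q[i] - c[i];
        direct += d * d;
      }

      // Decomposed form: ||c||^2 - 2 q.c + ||q||^2, as assembled by the GEMM
      // (-2 q.c), runL2SelectMin (+ ||c||^2) and runSumAlongRows (+ ||q||^2)
      float qq = q[0] * q[0] + q[1] * q[1];    // 5
      float cc = c[0] * c[0] + c[1] * c[1];    // 10
      float qc = q[0] * c[0] + q[1] * c[1];    // 1
      float decomposed = cc - 2.0f * qc + qq;  // 13

      printf("%f == %f\n", direct, decomposed);  // prints: 13 == 13
      return 0;
    }
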
// //
...@@ -303,8 +410,7 @@ runIPDistance(GpuResources* resources, ...@@ -303,8 +410,7 @@ runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices) {
int tileSizeOverride) {
runIPDistance<float>(resources, runIPDistance<float>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -312,8 +418,7 @@ runIPDistance(GpuResources* resources, ...@@ -312,8 +418,7 @@ runIPDistance(GpuResources* resources,
k, k,
outDistances, outDistances,
outIndices, outIndices,
false, false);
tileSizeOverride);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -325,8 +430,7 @@ runIPDistance(GpuResources* resources, ...@@ -325,8 +430,7 @@ runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm) {
int tileSizeOverride) {
runIPDistance<half>(resources, runIPDistance<half>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -334,8 +438,7 @@ runIPDistance(GpuResources* resources, ...@@ -334,8 +438,7 @@ runIPDistance(GpuResources* resources,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useHgemm, useHgemm);
tileSizeOverride);
} }
#endif #endif
...@@ -348,8 +451,7 @@ runL2Distance(GpuResources* resources, ...@@ -348,8 +451,7 @@ runL2Distance(GpuResources* resources,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool ignoreOutDistances, bool ignoreOutDistances) {
int tileSizeOverride) {
runL2Distance<float>(resources, runL2Distance<float>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -359,8 +461,7 @@ runL2Distance(GpuResources* resources, ...@@ -359,8 +461,7 @@ runL2Distance(GpuResources* resources,
outDistances, outDistances,
outIndices, outIndices,
false, false,
ignoreOutDistances, ignoreOutDistances);
tileSizeOverride);
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -374,8 +475,7 @@ runL2Distance(GpuResources* resources, ...@@ -374,8 +475,7 @@ runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances, bool ignoreOutDistances) {
int tileSizeOverride) {
runL2Distance<half>(resources, runL2Distance<half>(resources,
vectors, vectors,
vectorsTransposed, vectorsTransposed,
...@@ -385,8 +485,7 @@ runL2Distance(GpuResources* resources, ...@@ -385,8 +485,7 @@ runL2Distance(GpuResources* resources,
outDistances, outDistances,
outIndices, outIndices,
useHgemm, useHgemm,
ignoreOutDistances, ignoreOutDistances);
tileSizeOverride);
} }
#endif #endif
......
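
The two distance buffers plus the curStream flip in runDistance implement classic double-buffering: tile t is queued on one stream while tile t+1 is queued on the other, and each buffer is only ever reused by the same stream, so stream ordering alone guarantees the previous tile has drained. A minimal CUDA sketch of the same rotation (illustrative kernel and sizes, not the library's code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void processTile(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * 2.0f;  // stand-in for the GEMM + select work
    }

    int main() {
      const int kTile = 1 << 20;  // elements per tile
      const int kNumTiles = 8;

      float* in;
      float* out;
      float* buf[2];  // double-buffered scratch, one per stream
      cudaMalloc(&in, (size_t) kTile * kNumTiles * sizeof(float));
      cudaMalloc(&out, (size_t) kTile * kNumTiles * sizeof(float));
      cudaMalloc(&buf[0], kTile * sizeof(float));
      cudaMalloc(&buf[1], kTile * sizeof(float));

      cudaStream_t streams[2];
      cudaStreamCreate(&streams[0]);
      cudaStreamCreate(&streams[1]);

      int curStream = 0;
      for (int t = 0; t < kNumTiles; ++t) {
        // All work for tile t is queued on streams[curStream]; buf[curStream]
        // was last touched two iterations ago on this same stream, so no
        // extra synchronization is needed before reusing it
        processTile<<<kTile / 256, 256, 0, streams[curStream]>>>(
            in + (size_t) t * kTile, buf[curStream], kTile);
        cudaMemcpyAsync(out + (size_t) t * kTile, buf[curStream],
                        kTile * sizeof(float), cudaMemcpyDeviceToDevice,
                        streams[curStream]);
        curStream = (curStream + 1) % 2;
      }

      // Analogous to the final streamWait back onto the default stream
      cudaDeviceSynchronize();
      printf("done\n");
      return 0;
    }
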
...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources, ...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can // Do we care about `outDistances`? If not, we can
// take shortcuts. // take shortcuts.
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
/// Calculates brute-force inner product distance between `vectors` /// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen /// and `queries`, returning the k closest results seen
...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources, ...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources, void runIPDistance(GpuResources* resources,
...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources, ...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm);
int tileSizeOverride = -1);
void runL2Distance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors, Tensor<half, 2, true>& vectors,
...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources, ...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
int tileSizeOverride = -1);
#endif #endif
} } // namespace } } // namespace
...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
auto stream = resources_->getDefaultStreamCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice(); auto& mem = resources_->getMemoryManagerCurrentDevice();
...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf( DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize); query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) { if (exactDistance) {
// Convert outDistances back // Convert outDistances back
...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances, outDistances,
outIndices, outIndices,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectors_, vectors_,
...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input, input,
k, k,
outDistances, outDistances,
outIndices, outIndices);
tileSize);
} }
} }
} }
...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
FAISS_ASSERT(useFloat16_); FAISS_ASSERT(useFloat16_);
if (l2Distance_) { if (l2Distance_) {
...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectorsHalf_, vectorsHalf_,
...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_);
tileSize);
} }
} }
#endif #endif
...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { ...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(), rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(), devDataHalf.getSizeInBytes(),
stream); stream,
true /* reserve exactly */);
#endif #endif
} else { } else {
rawData_.append((char*) data, rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float), (size_t) dim_ * numVecs * sizeof(float),
stream); stream,
true /* reserve exactly */);
} }
num_ += numVecs; num_ += numVecs;
......
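
On the `reserve exactly` flag passed to rawData_.append above: amortized-doubling growth can leave a large flat index holding nearly twice the memory its vectors need, which matters on a GPU. A host-side sketch of the two growth policies (illustrative only; this is not the DeviceVector API):

    #include <cstddef>
    #include <cstdio>

    // Two growth policies for a buffer of capacity cap that must hold needed bytes
    size_t growDoubling(size_t cap, size_t needed) {
      while (cap < needed) cap = (cap == 0) ? 1 : cap * 2;  // amortized O(1) appends
      return cap;                                           // may overshoot ~2x
    }

    size_t growExact(size_t /*cap*/, size_t needed) {
      return needed;  // "reserve exactly": no slack, one realloc per append
    }

    int main() {
      size_t needed = (size_t) 1536 * 1024 * 1024;  // e.g. 1.5 GB of vector data
      printf("doubling: %zu MB\n", growDoubling(0, needed) >> 20);  // 2048 MB
      printf("exact:    %zu MB\n", growExact(0, needed) >> 20);     // 1536 MB
      return 0;
    }
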
...@@ -61,16 +61,14 @@ class FlatIndex { ...@@ -61,16 +61,14 @@ class FlatIndex {
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs, void query(Tensor<half, 2, true>& vecs,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#endif #endif
/// Add vectors to ourselves; the pointer passed can be on the host /// Add vectors to ourselves; the pointer passed can be on the host
......
...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs, ...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView, closestSubQDistanceView,
closestSubQIndexView, closestSubQIndexView,
// We don't care about distances // We don't care about distances
true, true);
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
} }
// Now, we have the nearest sub-q centroid for each slice of the // Now, we have the nearest sub-q centroid for each slice of the
......
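
For context, the surrounding loop in classifyAndAddVectors is the product-quantization encoding step: each (residual) vector is split into subQuantizers slices, and each slice is assigned the id of its nearest sub-quantizer centroid, with the distances themselves discarded. A scalar sketch of that assignment (plain C++, toy sizes; the common 8-bit case uses 256 sub-centroids per sub-quantizer):

    #include <cstdio>

    int main() {
      const int dim = 4, numSubQ = 2, subDim = dim / numSubQ, numSubCentroids = 2;
      float vec[dim] = {0.9f, 0.1f, 0.2f, 0.8f};
      // centroids[q][c][d]: one small codebook per sub-quantizer
      float centroids[numSubQ][numSubCentroids][subDim] = {
          {{1.0f, 0.0f}, {0.0f, 1.0f}},
          {{1.0f, 0.0f}, {0.0f, 1.0f}}};

      unsigned char code[numSubQ];  // the PQ code: one byte per slice
      for (int q = 0; q < numSubQ; ++q) {
        int best = 0;
        float bestDist = 1e30f;
        for (int c = 0; c < numSubCentroids; ++c) {
          float d2 = 0.0f;
          for (int d = 0; d < subDim; ++d) {
            float diff = vec[q * subDim + d] - centroids[q][c][d];
            d2 += diff * diff;
          }
          if (d2 < bestDist) { bestDist = d2; best = c; }
        }
        code[q] = (unsigned char) best;
      }
      printf("code = [%d, %d]\n", code[0], code[1]);  // prints: code = [0, 1]
      return 0;
    }
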
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir> template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void __global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, ...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
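
Beyond moving the constants into Limits.cuh, this change fixes the sentinel values themselves: std::numeric_limits<float>::min() is the smallest positive normal float, not the most negative one, so a max-selecting heap seeded with it would never admit negative candidates (which legitimately occur for inner-product scores). The distinction, in two lines of C++:

    #include <cstdio>
    #include <limits>

    int main() {
      // min() is the smallest positive normal; lowest() is the most negative value
      printf("min()    = %g\n", std::numeric_limits<float>::min());     // 1.17549e-38
      printf("lowest() = %g\n", std::numeric_limits<float>::lowest());  // -3.40282e+38
      return 0;
    }
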
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step // This is warp divergence central, but this is really a final step
// and happening a small number of times // and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets, inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances, ...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
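
pass2SelectLists needs to map a winning candidate's position in the flattened per-query result array back to the IVF list it came from, which is what binarySearchForBucket does over the prefix-sum offsets. A host-side sketch of that lookup (plain C++; the device code may differ in detail):

    #include <cstdio>

    // Find the bucket whose [prefixSum[b], prefixSum[b + 1]) range contains pos
    int bucketFor(const int* prefixSum, int numBuckets, int pos) {
      int lo = 0, hi = numBuckets - 1;
      while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        if (prefixSum[mid + 1] <= pos) lo = mid + 1;
        else hi = mid;
      }
      return lo;
    }

    int main() {
      // Three probed lists with 4, 2 and 5 candidates: offsets 0, 4, 6, 11
      int prefixSum[4] = {0, 4, 6, 11};
      printf("%d %d %d\n",
             bucketFor(prefixSum, 3, 3),    // 0
             bucketFor(prefixSum, 3, 4),    // 1
             bucketFor(prefixSum, 3, 10));  // 2
      return 0;
    }
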
...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu { ...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half) // T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g., // TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2) // float4, half2)
template <typename T, typename TVec, template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared> int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input, __global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true> output) { Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte; T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize); TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId(); TIndex laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize; TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x; TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize]; T rowNorm[RowTileSize];
if (lastRowTile) { if (lastRowTile) {
// We are handling the very end of the input matrix rows // We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) { for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) { if (NormLoop) {
rowNorm[0] = Math<T>::zero(); rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col]; TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val); val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val)); rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero(); rowNorm[row] = Math<T>::zero();
} }
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll #pragma unroll
for (int row = 0; row < RowTileSize; ++row) { for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col]; tmp[row] = input[rowStart + row][col];
...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
} }
} }
template <typename T, typename TVec> template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true>& input, void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true>& output, Tensor<T, 1, true, TIndex>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice(); TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8; constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \ #define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \ do { \
if (normLoop) { \ if (normLoop) { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} else { \ } else { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} \ } \
...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
// Can load using the vectorized type // Can load using the vectorized type
auto inputV = input.template castResize<TVec>(); auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1); auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else { } else {
// Can't load using the vectorized type // Can't load using the vectorized type
int dim = input.getSize(1); auto dim = input.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input, ...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output, Tensor<float, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input, ...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output, Tensor<half, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
} }
#endif #endif
......
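
The new TIndex template parameter exists because linear element offsets overflow a 32-bit int once a tensor holds more than 2^31 elements; canUseIndexType<int> presumably checks exactly this, and the callers above fall back to long indexing when it fails. The threshold is easy to hit with large flat datasets:

    #include <climits>
    #include <cstdio>

    int main() {
      // 20M vectors of dim 128 -> 2.56e9 elements: offsets no longer fit in int
      long numVecs = 20 * 1000 * 1000;
      long dim = 128;
      long elements = numVecs * dim;
      printf("%ld elements, INT_MAX = %d -> need %s indexing\n",
             elements, INT_MAX, elements > INT_MAX ? "long" : "int");
      return 0;
    }
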
...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs"); ...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions"); DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors"); DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32"); DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed"); DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed"); DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use"); DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu; using namespace faiss::gpu;
...@@ -72,7 +75,10 @@ int main(int argc, char** argv) { ...@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config; GpuIndexFlatConfig config;
config.device = dev; config.device = dev;
config.useFloat16 = FLAGS_use_float16; config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed; config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>( auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config)); new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
...@@ -90,9 +96,9 @@ int main(int argc, char** argv) { ...@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
if (FLAGS_cpu) {
float cpuTime = 0.0f; float cpuTime = 0.0f;
{
CpuTimer timer; CpuTimer timer;
index->search(numQueries, index->search(numQueries,
cpuQuery.data(), cpuQuery.data(),
...@@ -101,9 +107,8 @@ int main(int argc, char** argv) { ...@@ -101,9 +107,8 @@ int main(int argc, char** argv) {
cpuIndices.data()); cpuIndices.data());
cpuTime = timer.elapsedMilliseconds(); cpuTime = timer.elapsedMilliseconds();
}
printf("CPU time %.3f ms\n", cpuTime); printf("CPU time %.3f ms\n", cpuTime);
}
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
...@@ -131,14 +136,14 @@ int main(int argc, char** argv) { ...@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop()); CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime); printf("GPU time %.3f ms\n", gpuTime);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(), compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(), gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k, numQueries, FLAGS_k,
"", true, FLAGS_diff, false); "", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize()); CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0; return 0;
} }
...@@ -21,29 +21,47 @@ ...@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f; constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f; constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2, struct TestFlatOptions {
bool useFloat16, TestFlatOptions()
bool useTransposed, : useL2(true),
int kOverride = -1) { useFloat16(false),
int numVecs = faiss::gpu::randVal(1000, 20000); useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800); int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512); int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k, // Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a // the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16 // fairly small `k` for float16
int k = useFloat16 ? int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) : std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs); std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) { if (opt.kOverride > 0) {
k = kOverride; k = opt.kOverride;
} }
faiss::IndexFlatIP cpuIndexIP(dim); faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim); faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex = faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP; opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have // Construct on a random device to test multi-device, if we have
// multiple devices // multiple devices
...@@ -55,14 +73,14 @@ void testFlat(bool useL2, ...@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config; faiss::gpu::GpuIndexFlatConfig config;
config.device = device; config.device = device;
config.useFloat16 = useFloat16; config.useFloat16 = opt.useFloat16;
config.storeTransposed = useTransposed; config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex = faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP; (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim); std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
...@@ -70,37 +88,53 @@ void testFlat(bool useL2, ...@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data()); gpuIndex->add(numVecs, vecs.data());
std::stringstream str; std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim << " dim " << dim
<< " useFloat16 " << useFloat16 << " useFloat16 " << opt.useFloat16
<< " transposed " << useTransposed << " transposed " << opt.useTransposed
<< " numQuery " << numQuery << " numQuery " << numQuery
<< " k " << k; << " k " << k;
// To some extent, we depend upon the relative error for the test // To some extent, we depend upon the relative error for the test
// for float16 // for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are // FIXME: the fp16 bounds are
// useless when math (the accumulator) is // useless when math (the accumulator) is
// in fp16. Figure out another way to test // in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f, opt.useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f); opt.useFloat16 ? 0.65f : 0.015f);
} }
TEST(TestGpuIndexFlat, IP_Float32) { TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) { ...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) { ...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
......
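For a sense of scale in the tiling test above (a rough estimate, assuming fp32 and the dim range drawn in testFlat): a million vectors at several hundred dimensions occupy gigabytes, so the distance computation must proceed in tiles rather than as one monolithic gemm, while the output distances stay small. A quick back-of-envelope:

#include <cstdio>

int main() {
  long long numVecs = 1000000, dim = 800, numQueries = 8;
  long long dbBytes   = numVecs * dim * 4;        // ~3.2 GB of vectors
  long long distBytes = numQueries * numVecs * 4; // ~32 MB of distances
  std::printf("db: %lld MB, distances: %lld MB\n",
              dbBytes >> 20, distBytes >> 20);
}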
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h" #include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h" #include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { ...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false); copyToTest(false, false);
} }
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
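The point of this test, as far as the diff shows: with database vectors confined to the negative orthant and queries to the positive orthant, every inner product is negative, so the best match is merely the least negative one, and the GPU IP path must still agree with the CPU index in that regime. A tiny self-contained illustration:

#include <cstdio>
#include <vector>

float ip(const std::vector<float>& a, const std::vector<float>& b) {
  float s = 0.f;
  for (size_t i = 0; i < a.size(); ++i) {
    s += a[i] * b[i];
  }
  return s;
}

int main() {
  std::vector<float> db = {-0.9f, -0.1f};  // negative side
  std::vector<float> q  = { 0.5f,  0.5f};  // positive side
  std::printf("IP = %f\n", ip(db, q));     // negative: -0.5
}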
// //
// NaN tests // NaN tests
// //
......
...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) { ...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v; return v;
} }
void compareIndices(faiss::Index& refIndex, void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
const std::string& configMsg, const std::string& configMsg,
float maxRelativeError, float maxRelativeError,
float pctMaxDiff1, float pctMaxDiff1,
float pctMaxDiffN) { float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare // Compare
std::vector<float> refDistance(numQuery * k, 0); std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(), refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data()); k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0); std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(), testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data()); k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(), faiss::gpu::compareLists(refDistance.data(),
...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex, ...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN); maxRelativeError, pctMaxDiff1, pctMaxDiffN);
} }
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T> template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) { inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j]; return p[i * dim2 + j];
......
...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) { ...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1] /// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim); std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity /// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex, void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
......
...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None): ...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor) assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k) assert I.size() == (n, k)
assert I.is_contiguous() assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr() xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr() Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr() Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr), index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr), k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr)) faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I return D, I
......
...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in, ...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace } } // namespace
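The runtime k is bucketed to the next supported queue size because the warp-select queue lengths are compile-time template parameters; each BLOCK_SELECT_PAIR_CALL presumably instantiates one (type, direction, queue size) kernel, though the macro body is not shown in this diff. A standalone sketch of the same bucketing, with illustrative names:

#include <cassert>
#include <cstdio>

template <int NumWarpQ>
void selectImpl(int k) {
  // A real kernel would size its shared-memory queues with NumWarpQ.
  std::printf("k=%d handled by NumWarpQ=%d\n", k, NumWarpQ);
}

void dispatch(int k) {
  assert(k >= 1 && k <= 1024);
  if (k == 1)        selectImpl<1>(k);
  else if (k <= 32)  selectImpl<32>(k);
  else if (k <= 64)  selectImpl<64>(k);
  else if (k <= 128) selectImpl<128>(k);
  else if (k <= 256) selectImpl<256>(k);
  else if (k <= 512) selectImpl<512>(k);
  else               selectImpl<1024>(k);
}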
...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in, ...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif #endif
} } // namespace } } // namespace
...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
} }
} }
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle the trailing elements that don't fill a whole warp
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
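The loop structure above splits each row into a warp-aligned body and a tail: heap.add() requires all lanes of a warp to be active, so the main loop only covers the largest multiple of kWarpSize, and the leftover lanes take the addThreadQ path. A small sketch of that split:

#include <cstdio>

int roundDown(int x, int multiple) { return (x / multiple) * multiple; }

int main() {
  const int kWarpSize = 32;
  int n = 100;                          // elements in a row
  int limit = roundDown(n, kWarpSize);  // 96
  std::printf("full warps cover [0, %d); tail handles [%d, %d)\n",
              limit, limit, n);
}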
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys, Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys, Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif #endif
} } // namespace } } // namespace
...@@ -12,37 +12,37 @@ ...@@ -12,37 +12,37 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
this->operator=(std::move(t)); this->operator=(std::move(t));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) { if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
} }
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t)); std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner; this->state_ = t.state_; t.state_ = AllocState::NotOwner;
...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { ...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation // destructor will return the reservation
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
const IndexT sizes[Dim], const IndexT sizes[Dim],
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory); reservation_ = std::move(memory);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim], const IndexT strides[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) { cudaStream_t stream) {
if (this->data_) { if (this->data_) {
// Region must be contiguous // Region must be contiguous
......
...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu { ...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor(); __host__ ~DeviceTensor();
/// Move constructor /// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); __host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment /// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it /// Constructs a tensor of the given size, allocating memory for it
/// locally /// locally
...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally /// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary /// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager. /// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m, __host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Call to zero out memory /// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream); zero(cudaStream_t stream);
private: private:
......
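The pervasive Contig to InnerContig rename tracks a relaxed guarantee: instead of requiring full contiguity, the template flag now only promises that the innermost dimension has stride 1, which is what the strided views used for tiling need. A self-contained illustration of a layout that is inner-contiguous but not fully contiguous:

#include <cstdio>

int main() {
  const int rows = 4, cols = 3, rowStride = 8;  // rowStride > cols: padded
  float buf[rows * rowStride] = {};

  // Element (i, j) lives at i * rowStride + j. The innermost stride is 1,
  // so the view is "inner contiguous", yet it is not fully contiguous:
  // only cols of every rowStride elements per row belong to the tensor.
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      buf[i * rowStride + j] = float(i * cols + j);
    }
  }
  std::printf("element (2, 1) = %f\n", buf[2 * rowStride + 1]);
}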
...@@ -43,7 +43,7 @@ void synchronizeAllDevices() { ...@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
} }
} }
cudaDeviceProp& getDeviceProperties(int device) { const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex; static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties; static std::unordered_map<int, cudaDeviceProp> properties;
...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) { ...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second; return it->second;
} }
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) { int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock; return getDeviceProperties(device).maxThreadsPerBlock;
} }
......
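getCurrentDeviceProperties simply composes two existing helpers; the underlying getDeviceProperties memoizes one record per device behind a mutex, so repeated queries skip the driver round-trip. A minimal standalone sketch of that caching pattern; Props and queryDriver are stand-ins for cudaDeviceProp and cudaGetDeviceProperties:

#include <mutex>
#include <unordered_map>

struct Props { int maxThreadsPerBlock; };

Props queryDriver(int device) {  // stand-in for the slow driver call
  (void) device;
  return Props{1024};
}

const Props& getProps(int device) {
  static std::mutex mutex;
  static std::unordered_map<int, Props> cache;

  std::lock_guard<std::mutex> guard(mutex);
  auto it = cache.find(device);
  if (it == cache.end()) {
    it = cache.emplace(device, queryDriver(device)).first;
  }
  return it->second;
}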
...@@ -31,7 +31,10 @@ int getNumDevices(); ...@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices(); void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device /// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device); const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU /// Returns the maximum number of threads available for the given GPU
/// device /// device
......
...@@ -10,18 +10,18 @@ ...@@ -10,18 +10,18 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() : HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_; delete[] this->data_;
...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { ...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim]) : const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) : cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) { state_(AllocState::Owner) {
// Only contiguous arrays handled for now // Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous()); FAISS_ASSERT(t.isContiguous());
...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( ...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
} }
/// Call to zero out memory /// Call to zero out memory
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous // Region must be contiguous
FAISS_ASSERT(this->isContiguous()); FAISS_ASSERT(this->isContiguous());
...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { ...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ T __host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements(); auto size = this->numElements();
FAISS_ASSERT(size == t.numElements()); FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0); FAISS_ASSERT(size > 0);
if (Contig) { if (InnerContig) {
auto a = this->data(); auto a = this->data();
auto b = t.data(); auto b = t.data();
......
...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu { ...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it /// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to /// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream. /// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Call to zero out memory /// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero(); __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors /// Returns the maximum difference seen between two tensors
__host__ T __host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const; maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal? /// Are the two tensors exactly equal?
__host__ bool __host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0); return (maxDiff(t) == (T) 0);
} }
......
...@@ -24,11 +24,12 @@ struct Limits { ...@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half // constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max(); constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <> template <>
struct Limits<float> { struct Limits<float> {
static __device__ __host__ inline float getMin() { static __device__ __host__ inline float getMin() {
return -kFloatMax; return kFloatMin;
} }
static __device__ __host__ inline float getMax() { static __device__ __host__ inline float getMax() {
return kFloatMax; return kFloatMax;
...@@ -55,8 +56,8 @@ struct Limits<half> { ...@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max(); constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <> template <>
struct Limits<int> { struct Limits<int> {
......
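The switch from -kFloatMax to lowest() is worth spelling out, since numeric_limits::min() means different things for integral and floating-point types: for float, min() is the smallest positive normal value, while lowest() is the most negative finite value (equal to -max()); for int the two coincide. A quick check:

#include <cstdio>
#include <limits>

int main() {
  std::printf("float min()    = %g\n", std::numeric_limits<float>::min());
  std::printf("float lowest() = %g\n", std::numeric_limits<float>::lowest());
  std::printf("int   min()    = %d\n", std::numeric_limits<int>::min());
  std::printf("int   lowest() = %d\n", std::numeric_limits<int>::lowest());
}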
...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK); FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN); FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in // Now, we have to represent the matrix multiplication in
// column-major layout // column-major layout
T* pA = transC ? a.data() : b.data(); T* pA = transC ? a.data() : b.data();
...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1); int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1); int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getSize(1) : a.getSize(1); int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getSize(1); int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2); int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2); int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getSize(2) : a.getSize(2); int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getSize(2); int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)}); HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)}); HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2); size_t aOffset = a.getStride(0);
size_t bOffset = b.getSize(1) * b.getSize(2); size_t bOffset = b.getStride(0);
size_t cOffset = c.getSize(1) * c.getSize(2); size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) { for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
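Taking lda/ldb/ldc (and the batch offsets) from getStride rather than getSize is what lets these gemm wrappers run on non-packed views, e.g. tiles carved out of a larger allocation: the BLAS leading dimension is the distance in elements between consecutive rows in memory, not the logical width. A hedged standalone sketch of the distinction (no cuBLAS call, just the addressing):

#include <cstdio>

int main() {
  const int bigCols = 100;           // parent allocation width
  const int m = 4, n = 3;            // logical tile shape (m x n)
  float parent[10 * bigCols] = {};

  // A tile starting at (2, 5): element (i, j) is at (2+i)*bigCols + (5+j).
  float* tile = parent + 2 * bigCols + 5;
  int ld = bigCols;                  // the tile's row stride, not n

  tile[1 * ld + 2] = 7.f;            // write tile(1, 2)
  std::printf("%f\n", parent[3 * bigCols + 7]);  // same element: 7
  (void) m; (void) n;
}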
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int> template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor { class NoTypeTensor {
public: public:
NoTypeTensor() NoTypeTensor()
...@@ -25,7 +25,7 @@ class NoTypeTensor { ...@@ -25,7 +25,7 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t) NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()), : mem_(t.data()),
typeSize_(sizeof(T)) { typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
...@@ -87,13 +87,14 @@ class NoTypeTensor { ...@@ -87,13 +87,14 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() { Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_); FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_); return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
} }
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) { NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_; char* newPtr = (char*) mem_;
if (start > 0) { if (start > 0) {
...@@ -110,7 +111,7 @@ class NoTypeTensor { ...@@ -110,7 +111,7 @@ class NoTypeTensor {
} }
} }
return NoTypeTensor<Dim, Contig, IndexT>( return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_); newPtr, typeSize_, newSize, stride_);
} }
......
...@@ -8,16 +8,16 @@ ...@@ -8,16 +8,16 @@
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#include "../../FaissAssert.h" #include "../GpuFaissAssert.h"
#include "DeviceUtils.h" #include "DeviceUtils.h"
#include <limits> #include <limits>
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor() Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
: data_(nullptr) { : data_(nullptr) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -27,12 +27,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor() ...@@ -27,12 +27,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor()
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>& Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
data_ = t.data_; t.data_ = nullptr; data_ = t.data_; t.data_ = nullptr;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
stride_[i] = t.stride_[i]; t.stride_[i] = 0; stride_[i] = t.stride_[i]; t.stride_[i] = 0;
...@@ -42,10 +42,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -42,10 +42,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>:: Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, const IndexT sizes[Dim]) Tensor(DataPtrType data, const IndexT sizes[Dim])
: data_(data) { : data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -60,13 +60,13 @@ Tensor(DataPtrType data, const IndexT sizes[Dim]) ...@@ -60,13 +60,13 @@ Tensor(DataPtrType data, const IndexT sizes[Dim])
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>:: Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::
Tensor(DataPtrType data, std::initializer_list<IndexT> sizes) Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
: data_(data) { : data_(data) {
assert(sizes.size() == Dim); GPU_FAISS_ASSERT(sizes.size() == Dim);
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
int i = 0; int i = 0;
...@@ -81,10 +81,10 @@ Tensor(DataPtrType data, std::initializer_list<IndexT> sizes) ...@@ -81,10 +81,10 @@ Tensor(DataPtrType data, std::initializer_list<IndexT> sizes)
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __host__ __device__
Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim]) DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
: data_(data) { : data_(data) {
static_assert(Dim > 0, "must have > 0 dimensions"); static_assert(Dim > 0, "must have > 0 dimensions");
...@@ -95,22 +95,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor( ...@@ -95,22 +95,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::Tensor(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ void __host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyFrom(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled"); // The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and // Size must be the same (since dimensions are checked and
// continuity is assumed, we need only check total number of // continuity is assumed, we need only check total number of
// elements) // elements)
FAISS_ASSERT(this->numElements() == t.numElements()); GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) { if (t.numElements() > 0) {
FAISS_ASSERT(this->data_); GPU_FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data()); GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_); int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data()); int tDev = getDeviceForAddress(t.data());
...@@ -133,22 +134,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom( ...@@ -133,22 +134,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyFrom(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ void __host__ void
Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::copyTo(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Contig, "only contiguous tensors handled"); // The tensor must be fully contiguous
GPU_FAISS_ASSERT(this->isContiguous());
// Size must be the same (since dimensions are checked and // Size must be the same (since dimensions are checked and
// continuity is assumed, we need only check total number of // continuity is assumed, we need only check total number of
// elements) // elements)
FAISS_ASSERT(this->numElements() == t.numElements()); GPU_FAISS_ASSERT(this->numElements() == t.numElements());
if (t.numElements() > 0) { if (t.numElements() > 0) {
FAISS_ASSERT(this->data_); GPU_FAISS_ASSERT(this->data_);
FAISS_ASSERT(t.data()); GPU_FAISS_ASSERT(t.data());
int ourDev = getDeviceForAddress(this->data_); int ourDev = getDeviceForAddress(this->data_);
int tDev = getDeviceForAddress(t.data()); int tDev = getDeviceForAddress(t.data());
...@@ -171,62 +173,79 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo( ...@@ -171,62 +173,79 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::copyTo(
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim> template <typename OtherT, int OtherDim>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isSame( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSame(
const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const { const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) { if (Dim != OtherDim) {
return false; return false;
} }
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) { if (this->getSize(i) != rhs.getSize(i)) {
return false; return false;
} }
if (!Contig) { if (this->getStride(i) != rhs.getStride(i)) {
if (stride_[i] != rhs.stride_[i]) {
return false; return false;
} }
} }
return true;
}
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
template <typename OtherT, int OtherDim>
__host__ __device__ bool
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isSameSize(
const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (this->getSize(i) != rhs.getSize(i)) {
return false;
}
} }
return true; return true;
} }
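A hedged sketch of the distinction the new isSameSize() draws (ptrA and ptrC are hypothetical device buffers, not part of this diff):

// isSameSize() compares extents only; isSame() also compares strides.
Tensor<float, 2, true> a(ptrA, {8, 16});  // contiguous; strides {16, 1}
auto b = a.narrow(1, 0, 8);               // sizes {8, 8}, strides still {16, 1}
Tensor<float, 2, true> c(ptrC, {8, 8});   // sizes {8, 8}, strides {8, 1}
bool sameExtents = b.isSameSize(c);       // true: extents match
bool sameLayout  = b.isSame(c);           // false: strides differ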
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_); reinterpret_cast<U*>(data_), size_, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::cast() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::cast() const {
static_assert(sizeof(U) == sizeof(T), "cast must be to same size object"); static_assert(sizeof(U) == sizeof(T), "cast must be to same size object");
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_); reinterpret_cast<U*>(data_), size_, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T); constexpr int kMultiple = sizeof(U) / sizeof(T);
assert(canCastResize<U>()); GPU_FAISS_ASSERT(canCastResize<U>());
IndexT newSize[Dim]; IndexT newSize[Dim];
IndexT newStride[Dim]; IndexT newStride[Dim];
...@@ -239,24 +258,24 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() { ...@@ -239,24 +258,24 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() {
newStride[Dim - 1] = 1; // this is the same as the old stride newStride[Dim - 1] = 1; // this is the same as the old stride
newSize[Dim - 1] = size_[Dim - 1] / kMultiple; newSize[Dim - 1] = size_[Dim - 1] / kMultiple;
return Tensor<U, Dim, Contig, IndexT, PtrTraits>( return Tensor<U, Dim, InnerContig, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), newSize, newStride); reinterpret_cast<U*>(data_), newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castResize() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castResize() const {
return const_cast<Tensor<T, Dim, Contig, IndexT, PtrTraits>*>(this)-> return const_cast<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>*>(this)->
castResize<U>(); castResize<U>();
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename U> template <typename U>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canCastResize() const {
static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes"); static_assert(sizeof(U) >= sizeof(T), "only handles greater sizes");
constexpr int kMultiple = sizeof(U) / sizeof(T); constexpr int kMultiple = sizeof(U) / sizeof(T);
...@@ -279,13 +298,13 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const { ...@@ -279,13 +298,13 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastResize() const {
return true; return true;
} }
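A minimal castResize() usage sketch, assuming a suitably aligned buffer whose innermost extent is a multiple of 4 (ptr is hypothetical):

Tensor<float, 2, true> t(ptr, {128, 256});  // strides {256, 1}
if (t.canCastResize<float4>()) {
  auto v = t.castResize<float4>();          // sizes become {128, 64}
  // each v[i][j] now covers four consecutive floats, e.g. for vectorized loads
}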
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT> template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits> __host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::castIndexType() const {
if (sizeof(NewIndexT) < sizeof(IndexT)) { if (sizeof(NewIndexT) < sizeof(IndexT)) {
assert(this->canCastIndexType<NewIndexT>()); GPU_FAISS_ASSERT(this->canUseIndexType<NewIndexT>());
} }
NewIndexT newSize[Dim]; NewIndexT newSize[Dim];
...@@ -295,15 +314,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const { ...@@ -295,15 +314,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::castIndexType() const {
newStride[i] = (NewIndexT) stride_[i]; newStride[i] = (NewIndexT) stride_[i];
} }
return Tensor<T, Dim, Contig, NewIndexT, PtrTraits>( return Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <typename NewIndexT> template <typename NewIndexT>
__host__ bool __host__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::canUseIndexType() const {
static_assert(sizeof(size_t) >= sizeof(IndexT), static_assert(sizeof(size_t) >= sizeof(IndexT),
"index size too large"); "index size too large");
static_assert(sizeof(size_t) >= sizeof(NewIndexT), static_assert(sizeof(size_t) >= sizeof(NewIndexT),
...@@ -313,16 +332,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { ...@@ -313,16 +332,12 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
// FIXME: maybe also consider offset in bytes? multiply by sizeof(T)? // FIXME: maybe also consider offset in bytes? multiply by sizeof(T)?
size_t maxOffset = 0; size_t maxOffset = 0;
if (Contig) {
maxOffset = (size_t) size_[0] * (size_t) stride_[0];
} else {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i]; size_t curMaxOffset = (size_t) size_[i] * (size_t) stride_[i];
if (curMaxOffset > maxOffset) { if (curMaxOffset > maxOffset) {
maxOffset = curMaxOffset; maxOffset = curMaxOffset;
} }
} }
}
if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) { if (maxOffset > (size_t) std::numeric_limits<NewIndexT>::max()) {
return false; return false;
...@@ -331,23 +346,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const { ...@@ -331,23 +346,23 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::canCastIndexType() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ IndexT __host__ __device__ size_t
Tensor<T, Dim, Contig, IndexT, PtrTraits>::numElements() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::numElements() const {
long size = getSize(0); size_t size = (size_t) getSize(0);
for (int i = 1; i < Dim; ++i) { for (int i = 1; i < Dim; ++i) {
size *= getSize(i); size *= (size_t) getSize(i);
} }
return size; return size;
} }
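Worked example of why the return type widens from IndexT to size_t: with the default IndexT = int, a tensor of sizes {2048, 2048, 1024} holds 2048 * 2048 * 1024 = 2^32 elements, which wraps a 32-bit accumulator to 0; accumulating in size_t preserves the count, and the byte size (2^34 for float data) stays representable as well.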
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguous() const {
long prevSize = 1; long prevSize = 1;
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
...@@ -363,10 +378,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const { ...@@ -363,10 +378,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguous() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) { if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true; return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) && } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
...@@ -377,10 +392,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const { ...@@ -377,10 +392,10 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized(int i) const {
return false; return false;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) { if (!isConsistentlySized(i)) {
return false; return false;
...@@ -390,23 +405,28 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const { ...@@ -390,23 +405,28 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::isConsistentlySized() const {
return true; return true;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool __host__ __device__ bool
Tensor<T, Dim, Contig, IndexT, PtrTraits>::isContiguousDim(int i) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::isContiguousDim(int i) const {
return (i == Dim - 1) || // just in case return (i == Dim - 1) || // just in case
((i < Dim - 1) && ((i < Dim - 1) &&
((getStride(i) / getStride(i + 1)) == getSize(i + 1))); ((getStride(i) / getStride(i + 1)) == getSize(i + 1)));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const { int dim2) const {
assert(dim1 >= 0 && dim1 < Dim); GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim);
assert(dim1 >= 0 && dim2 < Dim); GPU_FAISS_ASSERT(dim2 >= 0 && dim2 < Dim);
static_assert(!Contig, "cannot transpose contiguous arrays");
// If a tensor is innermost contiguous, one cannot transpose the innermost
// dimension
if (InnerContig) {
GPU_FAISS_ASSERT(dim1 != Dim - 1 && dim2 != Dim - 1);
}
IndexT newSize[Dim]; IndexT newSize[Dim];
IndexT newStride[Dim]; IndexT newStride[Dim];
...@@ -424,14 +444,14 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1, ...@@ -424,14 +444,14 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::transpose(int dim1,
newStride[dim1] = newStride[dim2]; newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp; newStride[dim2] = tmp;
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(data_, newSize, newStride); return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data_, newSize, newStride);
} }
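A hedged sketch of the relaxed transpose rule (t and ptr hypothetical): any two non-innermost dimensions may now be swapped on an InnerContig tensor; only the stride-1 innermost dimension is pinned.

Tensor<float, 3, true> t(ptr, {4, 5, 6});  // innermost dimension is 2
auto u = t.transpose(0, 1);                // OK: sizes become {5, 4, 6}
// t.transpose(1, 2) would trip the GPU_FAISS_ASSERT above, since the
// innermost dimension must stay innermost for InnerContig tensors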
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension // Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim"); static_assert(NewDim > Dim, "Can only upcast to greater dim");
...@@ -452,15 +472,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() { ...@@ -452,15 +472,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastOuter() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension // Can only create tensors of greater dimension
static_assert(NewDim > Dim, "Can only upcast to greater dim"); static_assert(NewDim > Dim, "Can only upcast to greater dim");
...@@ -479,15 +499,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() { ...@@ -479,15 +499,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::upcastInner() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension // Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim"); static_assert(NewDim < Dim, "Can only downcast to lesser dim");
...@@ -497,7 +517,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { ...@@ -497,7 +517,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
// them). // them).
for (int i = 0; i < Dim - NewDim; ++i) { for (int i = 0; i < Dim - NewDim; ++i) {
bool cont = isContiguousDim(i); bool cont = isContiguousDim(i);
assert(cont); GPU_FAISS_ASSERT(cont);
} }
IndexT newSize[NewDim]; IndexT newSize[NewDim];
...@@ -524,15 +544,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() { ...@@ -524,15 +544,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastOuter() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension // Can only create tensors of lesser dimension
static_assert(NewDim < Dim, "Can only downcast to lesser dim"); static_assert(NewDim < Dim, "Can only downcast to lesser dim");
...@@ -541,7 +561,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { ...@@ -541,7 +561,7 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
// in all of the dimensions we are collapsing (no padding in // in all of the dimensions we are collapsing (no padding in
// them). // them).
for (int i = NewDim; i < Dim; ++i) { for (int i = NewDim; i < Dim; ++i) {
assert(isContiguousDim(i)); GPU_FAISS_ASSERT(isContiguousDim(i));
} }
IndexT newSize[NewDim]; IndexT newSize[NewDim];
...@@ -567,15 +587,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() { ...@@ -567,15 +587,15 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::downcastInner() {
} }
} }
return Tensor<T, NewDim, Contig, IndexT, PtrTraits>( return Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>(
data_, newSize, newStride); data_, newSize, newStride);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(DataPtrType at) {
static_assert(SubDim >= 1 && SubDim < Dim, static_assert(SubDim >= 1 && SubDim < Dim,
"can only create view of lesser dim"); "can only create view of lesser dim");
...@@ -587,89 +607,76 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) { ...@@ -587,89 +607,76 @@ Tensor<T, Dim, Contig, IndexT, PtrTraits>::view(DataPtrType at) {
viewStrides[i] = stride_[Dim - SubDim + i]; viewStrides[i] = stride_[Dim - SubDim + i];
} }
return Tensor<T, SubDim, Contig, IndexT, PtrTraits>( return Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>(
at, viewSizes, viewStrides); at, viewSizes, viewStrides);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view() { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view() {
return view<SubDim>(data_); return view<SubDim>(data_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrowOutermost(IndexT start, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrowOutermost(IndexT start,
IndexT size) { IndexT size) {
DataPtrType newData = data_; return this->narrow(0, start, size);
if (start > 0) {
newData += start * stride_[0];
}
IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) {
if (i == 0) {
assert(start + size <= size_[0]);
newSize[i] = size;
} else {
newSize[i] = size_[i];
}
}
return Tensor<T, Dim, Contig, IndexT, PtrTraits>(newData, newSize, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::narrow(int dim, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::narrow(int dim,
IndexT start, IndexT start,
IndexT size) { IndexT size) {
DataPtrType newData = data_; DataPtrType newData = data_;
GPU_FAISS_ASSERT(start >= 0 &&
start < size_[dim] &&
(start + size) <= size_[dim]);
if (start > 0) { if (start > 0) {
newData += start * stride_[dim]; newData += (size_t) start * stride_[dim];
} }
IndexT newSize[Dim]; IndexT newSize[Dim];
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
if (i == dim) { if (i == dim) {
assert(start + size <= size_[dim]); GPU_FAISS_ASSERT(start + size <= size_[dim]);
newSize[i] = size; newSize[i] = size;
} else { } else {
newSize[i] = size_[i]; newSize[i] = size_[i];
} }
} }
// The narrowed tensor is not necessarily contiguous // If we were innermost contiguous before, we are still innermost contiguous
return Tensor<T, Dim, false, IndexT, PtrTraits>(newData, newSize, stride_); return Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(newData, newSize, stride_);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::view( Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::view(
std::initializer_list<IndexT> sizes) { std::initializer_list<IndexT> sizes) {
static_assert(Contig, "on contiguous tensors only"); GPU_FAISS_ASSERT(this->isContiguous());
assert(sizes.size() == NewDim); GPU_FAISS_ASSERT(sizes.size() == NewDim);
// The total size of the new view must be the same as the total size // The total size of the new view must be the same as the total size
// of the old view // of the old view
size_t curSize = numElements(); size_t curSize = numElements();
size_t newSize = 1; size_t newSize = 1;
for (auto s : sizes) { for (auto s : sizes) {
newSize *= s; newSize *= s;
} }
assert(curSize == newSize); GPU_FAISS_ASSERT(curSize == newSize);
return Tensor<T, NewDim, true, IndexT, PtrTraits>(data(), sizes); return Tensor<T, NewDim, true, IndexT, PtrTraits>(data(), sizes);
} }
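A short sketch of the reshaping view() under the new runtime check (ptr hypothetical): element counts must agree, and the tensor must satisfy isContiguous() at runtime rather than a compile-time Contig flag.

Tensor<float, 2, true> t(ptr, {6, 4});  // 24 elements, fully contiguous
auto v = t.view<3>({2, 3, 4});          // also 24 elements; asserts otherwise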
......
...@@ -24,7 +24,7 @@ namespace faiss { namespace gpu { ...@@ -24,7 +24,7 @@ namespace faiss { namespace gpu {
/// Our tensor type /// Our tensor type
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig, bool InnerContig,
typename IndexT, typename IndexT,
template <typename U> class PtrTraits> template <typename U> class PtrTraits>
class Tensor; class Tensor;
...@@ -58,7 +58,7 @@ struct DefaultPtrTraits { ...@@ -58,7 +58,7 @@ struct DefaultPtrTraits {
- `T` is the contained type (e.g., `float`) - `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank - `Dim` is the tensor rank
- If `Contig` is true, then the tensor is assumed to be - If `InnerContig` is true, then the tensor is assumed to be innermost
- contiguous, and only operations that make sense on contiguous - contiguous, and only operations that make sense on contiguous
- arrays are allowed (e.g., no transpose). Strides are still - arrays are allowed (e.g., no transpose). Strides are still
- calculated, but innermost stride is assumed to be 1. - calculated, but innermost stride is assumed to be 1.
...@@ -71,7 +71,7 @@ struct DefaultPtrTraits { ...@@ -71,7 +71,7 @@ struct DefaultPtrTraits {
*/ */
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class Tensor { class Tensor {
...@@ -79,28 +79,28 @@ class Tensor { ...@@ -79,28 +79,28 @@ class Tensor {
enum { NumDim = Dim }; enum { NumDim = Dim };
typedef T DataType; typedef T DataType;
typedef IndexT IndexType; typedef IndexT IndexType;
enum { IsContig = Contig }; enum { IsInnerContig = InnerContig };
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef Tensor<T, Dim, Contig, IndexT, PtrTraits> TensorType; typedef Tensor<T, Dim, InnerContig, IndexT, PtrTraits> TensorType;
/// Default constructor /// Default constructor
__host__ __device__ Tensor(); __host__ __device__ Tensor();
/// Copy constructor /// Copy constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t)
= default; = default;
/// Move constructor /// Move constructor
__host__ __device__ Tensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t) __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t)
= default; = default;
/// Assignment /// Assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t) = default; operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) = default;
/// Move assignment /// Move assignment
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(Tensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructor that calculates strides with no padding /// Constructor that calculates strides with no padding
__host__ __device__ Tensor(DataPtrType data, __host__ __device__ Tensor(DataPtrType data,
...@@ -116,28 +116,33 @@ class Tensor { ...@@ -116,28 +116,33 @@ class Tensor {
const IndexT strides[Dim]); const IndexT strides[Dim]);
/// Copies a tensor into ourselves; sizes must match /// Copies a tensor into ourselves; sizes must match
__host__ void copyFrom(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ void copyFrom(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Copies ourselves into a tensor; sizes must match /// Copies ourselves into a tensor; sizes must match
__host__ void copyTo(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ void copyTo(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Returns true if the two tensors are of the same dimensionality, /// Returns true if the two tensors are of the same dimensionality,
/// size and stride. /// size and stride.
template <int OtherDim> template <typename OtherT, int OtherDim>
__host__ __device__ bool __host__ __device__ bool
isSame(const Tensor<T, OtherDim, Contig, IndexT, PtrTraits>& rhs) const; isSame(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Returns true if the two tensors are of the same dimensionality and size
template <typename OtherT, int OtherDim>
__host__ __device__ bool
isSameSize(const Tensor<OtherT, OtherDim, InnerContig, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and /// Cast to a tensor of a different type of the same size and
/// stride. U and our type T must be of the same size /// stride. U and our type T must be of the same size
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> cast(); __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast();
/// Const version of `cast` /// Const version of `cast`
template <typename U> template <typename U>
__host__ __device__ __host__ __device__
const Tensor<U, Dim, Contig, IndexT, PtrTraits> cast() const; const Tensor<U, Dim, InnerContig, IndexT, PtrTraits> cast() const;
/// Cast to a tensor of a different type which is potentially a /// Cast to a tensor of a different type which is potentially a
/// different size than our type T. Tensor must be aligned and the /// different size than our type T. Tensor must be aligned and the
...@@ -146,11 +151,11 @@ class Tensor { ...@@ -146,11 +151,11 @@ class Tensor {
/// must be contiguous. The stride of all outer dimensions must be a /// must be contiguous. The stride of all outer dimensions must be a
/// multiple of sizeof(U) / sizeof(T) as well. /// multiple of sizeof(U) / sizeof(T) as well.
template <typename U> template <typename U>
__host__ __device__ Tensor<U, Dim, Contig, IndexT, PtrTraits> castResize(); __host__ __device__ Tensor<U, Dim, InnerContig, IndexT, PtrTraits> castResize();
/// Const version of `castResize` /// Const version of `castResize`
template <typename U> template <typename U>
__host__ __device__ const Tensor<U, Dim, Contig, IndexT, PtrTraits> __host__ __device__ const Tensor<U, Dim, InnerContig, IndexT, PtrTraits>
castResize() const; castResize() const;
/// Returns true if we can castResize() this tensor to the new type /// Returns true if we can castResize() this tensor to the new type
...@@ -161,13 +166,13 @@ class Tensor { ...@@ -161,13 +166,13 @@ class Tensor {
/// Fails if size or stride entries are not representable in the new /// Fails if size or stride entries are not representable in the new
/// IndexT. /// IndexT.
template <typename NewIndexT> template <typename NewIndexT>
__host__ Tensor<T, Dim, Contig, NewIndexT, PtrTraits> __host__ Tensor<T, Dim, InnerContig, NewIndexT, PtrTraits>
castIndexType() const; castIndexType() const;
/// Returns true if we can castIndexType() this tensor to the new /// Returns true if we can use this indexing type to access all elements
/// index type /// of this tensor
template <typename NewIndexT> template <typename NewIndexT>
__host__ bool canCastIndexType() const; __host__ bool canUseIndexType() const;
/// Returns a raw pointer to the start of our data. /// Returns a raw pointer to the start of our data.
__host__ __device__ inline DataPtrType data() { __host__ __device__ inline DataPtrType data() {
...@@ -230,12 +235,12 @@ class Tensor { ...@@ -230,12 +235,12 @@ class Tensor {
/// Returns the total number of elements contained within our data /// Returns the total number of elements contained within our data
/// (product of `getSize(i)`) /// (product of `getSize(i)`)
__host__ __device__ IndexT numElements() const; __host__ __device__ size_t numElements() const;
/// If we are contiguous, returns the total size in bytes of our /// If we are contiguous, returns the total size in bytes of our
/// data /// data
__host__ __device__ size_t getSizeInBytes() const { __host__ __device__ size_t getSizeInBytes() const {
return (size_t) numElements() * sizeof(T); return numElements() * sizeof(T);
} }
/// Returns the size array. /// Returns the size array.
...@@ -273,21 +278,21 @@ class Tensor { ...@@ -273,21 +278,21 @@ class Tensor {
/// dimensions given. Does not actually move elements; transposition /// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays. /// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts. /// If the dimensions are not valid, asserts.
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
transpose(int dim1, int dim2) const; transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension /// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1 /// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastOuter(); upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension /// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1 /// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]` /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
upcastInner(); upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension /// Downcast a tensor of dimension `D` to some tensor of dimension
...@@ -295,46 +300,45 @@ class Tensor { ...@@ -295,46 +300,45 @@ class Tensor {
/// padding on the leading dimensions. /// padding on the leading dimensions.
template <int NewDim> template <int NewDim>
__host__ __device__ __host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastOuter(); Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension /// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is /// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions. /// padding on the leading dimensions.
template <int NewDim> template <int NewDim>
__host__ __device__ __host__ __device__
Tensor<T, NewDim, Contig, IndexT, PtrTraits> downcastInner(); Tensor<T, NewDim, InnerContig, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice /// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`. /// of this tensor, starting at `at`.
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view(DataPtrType at); view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice /// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins /// of this tensor, starting where our data begins
template <int SubDim> template <int SubDim>
__host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, SubDim, InnerContig, IndexT, PtrTraits>
view(); view();
/// Returns a tensor of the same dimension that is a view of the /// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the /// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size) /// elements in the range [start, start + size)
__host__ __device__ Tensor<T, Dim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
narrowOutermost(IndexT start, IndexT size); narrowOutermost(IndexT start, IndexT size);
/// Returns a tensor of the same dimension that is a view of the /// Returns a tensor of the same dimension that is a view of the
/// original tensor with the specified dimension restricted to the /// original tensor with the specified dimension restricted to the
/// elements in the range [start, start + size). /// elements in the range [start, start + size).
/// Can occur in an arbitrary dimension, and is possibly /// Can occur in an arbitrary dimension
/// non-contiguous __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>
__host__ __device__ Tensor<T, Dim, false, IndexT, PtrTraits>
narrow(int dim, IndexT start, IndexT size); narrow(int dim, IndexT start, IndexT size);
/// Returns a view of the given tensor expressed as a tensor of a /// Returns a view of the given tensor expressed as a tensor of a
/// different number of dimensions. /// different number of dimensions.
/// Only works if we are contiguous. /// Only works if we are contiguous.
template <int NewDim> template <int NewDim>
__host__ __device__ Tensor<T, NewDim, Contig, IndexT, PtrTraits> __host__ __device__ Tensor<T, NewDim, InnerContig, IndexT, PtrTraits>
view(std::initializer_list<IndexT> sizes); view(std::initializer_list<IndexT> sizes);
protected: protected:
...@@ -352,21 +356,21 @@ class Tensor { ...@@ -352,21 +356,21 @@ class Tensor {
namespace detail { namespace detail {
template <typename IndexType> template <typename IndexType>
bool canCastIndexType() { bool canUseIndexType() {
return true; return true;
} }
template <typename IndexType, typename T, typename... U> template <typename IndexType, typename T, typename... U>
bool canCastIndexType(const T& arg, const U&... args) { bool canUseIndexType(const T& arg, const U&... args) {
return arg.canCastIndexType<IndexType>() && return arg.canUseIndexType<IndexType>() &&
canCastIndexType(args...); canUseIndexType(args...);
} }
} // namespace detail } // namespace detail
template <typename IndexType, typename... T> template <typename IndexType, typename... T>
bool canCastIndexType(const T&... args) { bool canUseIndexType(const T&... args) {
return detail::canCastIndexType(args...); return detail::canUseIndexType(args...);
} }
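A sketch of the dispatch pattern the renamed helper supports (copyKernel, input, output, grid, block, and stream are hypothetical; the pattern mirrors runTransposeAny below):

if (canUseIndexType<int>(input, output)) {
  // 32-bit indexing: cheaper div/mod on device
  copyKernel<float, int><<<grid, block, 0, stream>>>(input, output);
} else {
  // 64-bit indexing: required once offsets exceed int range
  copyKernel<float, long><<<grid, block, 0, stream>>>(input, output);
}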
namespace detail { namespace detail {
...@@ -464,7 +468,7 @@ class SubTensor<TensorType, 0, PtrTraits> { ...@@ -464,7 +468,7 @@ class SubTensor<TensorType, 0, PtrTraits> {
/// Our parent tensor can create us /// Our parent tensor can create us
friend class Tensor<typename TensorType::DataType, friend class Tensor<typename TensorType::DataType,
1, 1,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits>; PtrTraits>;
...@@ -493,7 +497,7 @@ class SubTensor { ...@@ -493,7 +497,7 @@ class SubTensor {
__host__ __device__ inline __host__ __device__ inline
SubTensor<TensorType, SubDim - 1, PtrTraits> SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) { operator[](typename TensorType::IndexType index) {
if (TensorType::IsContig && SubDim == 1) { if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays // Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>( return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index); tensor_, data_ + index);
...@@ -509,7 +513,7 @@ class SubTensor { ...@@ -509,7 +513,7 @@ class SubTensor {
__host__ __device__ inline __host__ __device__ inline
const SubTensor<TensorType, SubDim - 1, PtrTraits> const SubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const { operator[](typename TensorType::IndexType index) const {
if (TensorType::IsContig && SubDim == 1) { if (TensorType::IsInnerContig && SubDim == 1) {
// Innermost dimension is stride 1 for contiguous arrays // Innermost dimension is stride 1 for contiguous arrays
return SubTensor<TensorType, SubDim - 1, PtrTraits>( return SubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_, data_ + index); tensor_, data_ + index);
...@@ -590,7 +594,7 @@ class SubTensor { ...@@ -590,7 +594,7 @@ class SubTensor {
/// of this tensor, starting where our data begins /// of this tensor, starting where our data begins
Tensor<typename TensorType::DataType, Tensor<typename TensorType::DataType,
SubDim, SubDim,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits> view() { PtrTraits> view() {
return tensor_.template view<SubDim>(data_); return tensor_.template view<SubDim>(data_);
...@@ -604,7 +608,7 @@ class SubTensor { ...@@ -604,7 +608,7 @@ class SubTensor {
friend class friend class
Tensor<typename TensorType::DataType, Tensor<typename TensorType::DataType,
TensorType::NumDim, TensorType::NumDim,
TensorType::IsContig, TensorType::IsInnerContig,
typename TensorType::IndexType, typename TensorType::IndexType,
PtrTraits>; PtrTraits>;
...@@ -624,23 +628,23 @@ class SubTensor { ...@@ -624,23 +628,23 @@ class SubTensor {
} // namespace detail } // namespace detail
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline __host__ __device__ inline
detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>, detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits> Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>( return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>( detail::SubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]); *this, data_)[index]);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ inline __host__ __device__ inline
const detail::SubTensor<Tensor<T, Dim, Contig, IndexT, PtrTraits>, const detail::SubTensor<Tensor<T, Dim, InnerContig, IndexT, PtrTraits>,
Dim - 1, PtrTraits> Dim - 1, PtrTraits>
Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator[](IndexT index) const { Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::SubTensor<TensorType, Dim - 1, PtrTraits>( return detail::SubTensor<TensorType, Dim - 1, PtrTraits>(
detail::SubTensor<TensorType, Dim, PtrTraits>( detail::SubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]); const_cast<TensorType&>(*this), data_)[index]);
......
...@@ -19,26 +19,26 @@ ...@@ -19,26 +19,26 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T> template <typename T, typename IndexT>
struct TensorInfo { struct TensorInfo {
static constexpr int kMaxDims = 8; static constexpr int kMaxDims = 8;
T* data; T* data;
int sizes[kMaxDims]; IndexT sizes[kMaxDims];
int strides[kMaxDims]; IndexT strides[kMaxDims];
int dims; int dims;
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset { struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
unsigned int offset = 0; IndexT offset = 0;
#pragma unroll #pragma unroll
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i]; IndexT curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i]; IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset; offset += curDimOffset;
...@@ -51,21 +51,21 @@ struct TensorInfoOffset { ...@@ -51,21 +51,21 @@ struct TensorInfoOffset {
} }
}; };
template <typename T> template <typename T, typename IndexT>
struct TensorInfoOffset<T, -1> { struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
return linearId; return linearId;
} }
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info; TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i); info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = t.getStride(i); info.strides[i] = (IndexT) t.getStride(i);
} }
info.data = t.data(); info.data = t.data();
...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { ...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info; return info;
} }
template <typename T, int DimInput, int DimOutput> template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input, __global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T> output, TensorInfo<T, IndexT> output,
unsigned int totalSize) { IndexT totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
if (linearThreadId >= totalSize) { i += gridDim.x * blockDim.x) {
return; auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
} auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]); output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else #else
output.data[outputOffset] = input.data[inputOffset]; output.data[outputOffset] = input.data[inputOffset];
#endif #endif
}
} }
/// Performs an out-of-place transposition between any two dimensions. /// Performs an out-of-place transposition between any two dimensions.
...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2, int dim1, int dim2,
Tensor<T, Dim, true>& out, Tensor<T, Dim, true>& out,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions"); static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2); FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim); FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]); FAISS_ASSERT(out.getSize(i) == outSize[i]);
} }
auto inInfo = getTensorInfo<T, Dim>(in); size_t totalSize = in.numElements();
auto outInfo = getTensorInfo<T, Dim>(out); size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int totalSize = in.numElements(); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
auto grid = dim3(utils::divUp(totalSize, numThreads));
auto block = dim3(numThreads);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>( transposeAny<T, unsigned long, Dim, -1>
inInfo, outInfo, totalSize); <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR(); CUDA_TEST_ERROR();
} }
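A hedged call-site sketch (tensors and stream assumed allocated elsewhere; the output must already carry the transposed shape):

// Swap the two outer dimensions of a contiguous 3-d tensor, out of place:
// if in is {4, 5, 6}, out must be {5, 4, 6}.
runTransposeAny(in, 0, 1, out, stream);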
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
*/ */
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "../BlockSelectKernel.cuh" #include "../BlockSelectKernel.cuh"
#include "../Limits.cuh" #include "../Limits.cuh"
...@@ -17,6 +19,15 @@ ...@@ -17,6 +19,15 @@
Tensor<int, 2, true>& outV, \ Tensor<int, 2, true>& outV, \
bool dir, \ bool dir, \
int k, \ int k, \
cudaStream_t stream); \
\
extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream) cudaStream_t stream)
#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \
...@@ -27,6 +38,11 @@ ...@@ -27,6 +38,11 @@
bool dir, \ bool dir, \
int k, \ int k, \
cudaStream_t stream) { \ cudaStream_t stream) { \
FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \
FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \
FAISS_ASSERT(outK.getSize(1) == k); \
FAISS_ASSERT(outV.getSize(1) == k); \
\
auto grid = dim3(in.getSize(0)); \ auto grid = dim3(in.getSize(0)); \
\ \
constexpr int kBlockSelectNumThreads = 128; \ constexpr int kBlockSelectNumThreads = 128; \
...@@ -41,8 +57,40 @@ ...@@ -41,8 +57,40 @@
blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \ blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \ <<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \ CUDA_TEST_ERROR(); \
} \
\
void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
Tensor<TYPE, 2, true>& inK, \
Tensor<int, 2, true>& inV, \
Tensor<TYPE, 2, true>& outK, \
Tensor<int, 2, true>& outV, \
bool dir, \
int k, \
cudaStream_t stream) { \
FAISS_ASSERT(inK.isSameSize(inV)); \
FAISS_ASSERT(outK.isSameSize(outV)); \
\
auto grid = dim3(inK.getSize(0)); \
\
constexpr int kBlockSelectNumThreads = 128; \
auto block = dim3(kBlockSelectNumThreads); \
\
FAISS_ASSERT(k <= WARP_Q); \
FAISS_ASSERT(dir == DIR); \
\
auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
auto vInit = -1; \
\
blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
<<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
CUDA_TEST_ERROR(); \
} }
#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \
runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
in, outK, outV, dir, k, stream) in, outK, outV, dir, k, stream)
#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \
runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
inK, inV, outK, outV, dir, k, stream)
...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { ...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
} }
static void write_ivf_header (const IndexIVF * ivf, FILE *f, static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) { bool include_ids = true) {
write_index_header (ivf, f); write_index_header (ivf, f);
...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) { ...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained); READVECTOR (ivsc->trained);
} }
ProductQuantizer * read_ProductQuantizer (const char*fname) { ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r"); FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname); FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) { ...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
} }
idx = idxmap; idx = idxmap;
} else { } else {
fprintf (stderr, "Index type 0x%08x not supported\n", h); FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
abort (); idx = nullptr;
} }
return idx; return idx;
} }
......
...@@ -80,6 +80,7 @@ class FloatVector(_object): ...@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVector_clear(self)
def data(self): return _swigfaiss.FloatVector_data(self) def data(self): return _swigfaiss.FloatVector_data(self)
def size(self): return _swigfaiss.FloatVector_size(self) def size(self): return _swigfaiss.FloatVector_size(self)
def at(self, *args): return _swigfaiss.FloatVector_at(self, *args) def at(self, *args): return _swigfaiss.FloatVector_at(self, *args)
...@@ -89,6 +90,27 @@ class FloatVector(_object): ...@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss.FloatVector_swigregister FloatVector_swigregister = _swigfaiss.FloatVector_swigregister
FloatVector_swigregister(FloatVector) FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss.DoubleVector_clear(self)
def data(self): return _swigfaiss.DoubleVector_data(self)
def size(self): return _swigfaiss.DoubleVector_size(self)
def at(self, *args): return _swigfaiss.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object): class ByteVector(_object):
__swig_setmethods__ = {} __swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value) __setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
...@@ -100,6 +122,7 @@ class ByteVector(_object): ...@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVector_clear(self)
def data(self): return _swigfaiss.ByteVector_data(self) def data(self): return _swigfaiss.ByteVector_data(self)
def size(self): return _swigfaiss.ByteVector_size(self) def size(self): return _swigfaiss.ByteVector_size(self)
def at(self, *args): return _swigfaiss.ByteVector_at(self, *args) def at(self, *args): return _swigfaiss.ByteVector_at(self, *args)
...@@ -120,6 +143,7 @@ class Uint64Vector(_object): ...@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.Uint64Vector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss.Uint64Vector_clear(self)
def data(self): return _swigfaiss.Uint64Vector_data(self) def data(self): return _swigfaiss.Uint64Vector_data(self)
def size(self): return _swigfaiss.Uint64Vector_size(self) def size(self): return _swigfaiss.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss.Uint64Vector_at(self, *args) def at(self, *args): return _swigfaiss.Uint64Vector_at(self, *args)
...@@ -140,6 +164,7 @@ class LongVector(_object): ...@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVector_clear(self)
def data(self): return _swigfaiss.LongVector_data(self) def data(self): return _swigfaiss.LongVector_data(self)
def size(self): return _swigfaiss.LongVector_size(self) def size(self): return _swigfaiss.LongVector_size(self)
def at(self, *args): return _swigfaiss.LongVector_at(self, *args) def at(self, *args): return _swigfaiss.LongVector_at(self, *args)
...@@ -160,6 +185,7 @@ class IntVector(_object): ...@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.IntVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss.IntVector_clear(self)
def data(self): return _swigfaiss.IntVector_data(self) def data(self): return _swigfaiss.IntVector_data(self)
def size(self): return _swigfaiss.IntVector_size(self) def size(self): return _swigfaiss.IntVector_size(self)
def at(self, *args): return _swigfaiss.IntVector_at(self, *args) def at(self, *args): return _swigfaiss.IntVector_at(self, *args)
...@@ -180,6 +206,7 @@ class VectorTransformVector(_object): ...@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.VectorTransformVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss.VectorTransformVector_clear(self)
def data(self): return _swigfaiss.VectorTransformVector_data(self) def data(self): return _swigfaiss.VectorTransformVector_data(self)
def size(self): return _swigfaiss.VectorTransformVector_size(self) def size(self): return _swigfaiss.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss.VectorTransformVector_at(self, *args) def at(self, *args): return _swigfaiss.VectorTransformVector_at(self, *args)
...@@ -200,6 +227,7 @@ class OperatingPointVector(_object): ...@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.OperatingPointVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss.OperatingPointVector_clear(self)
def data(self): return _swigfaiss.OperatingPointVector_data(self) def data(self): return _swigfaiss.OperatingPointVector_data(self)
def size(self): return _swigfaiss.OperatingPointVector_size(self) def size(self): return _swigfaiss.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss.OperatingPointVector_at(self, *args) def at(self, *args): return _swigfaiss.OperatingPointVector_at(self, *args)
...@@ -220,6 +248,7 @@ class FloatVectorVector(_object): ...@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.FloatVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.FloatVectorVector_clear(self)
def data(self): return _swigfaiss.FloatVectorVector_data(self) def data(self): return _swigfaiss.FloatVectorVector_data(self)
def size(self): return _swigfaiss.FloatVectorVector_size(self) def size(self): return _swigfaiss.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss.FloatVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.FloatVectorVector_at(self, *args)
...@@ -240,6 +269,7 @@ class ByteVectorVector(_object): ...@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.ByteVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.ByteVectorVector_clear(self)
def data(self): return _swigfaiss.ByteVectorVector_data(self) def data(self): return _swigfaiss.ByteVectorVector_data(self)
def size(self): return _swigfaiss.ByteVectorVector_size(self) def size(self): return _swigfaiss.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss.ByteVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.ByteVectorVector_at(self, *args)
...@@ -260,6 +290,7 @@ class LongVectorVector(_object): ...@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss.LongVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss.LongVectorVector_clear(self)
def data(self): return _swigfaiss.LongVectorVector_data(self) def data(self): return _swigfaiss.LongVectorVector_data(self)
def size(self): return _swigfaiss.LongVectorVector_size(self) def size(self): return _swigfaiss.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss.LongVectorVector_at(self, *args) def at(self, *args): return _swigfaiss.LongVectorVector_at(self, *args)
...@@ -876,6 +907,9 @@ class ClusteringParameters(_object): ...@@ -876,6 +907,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_set __swig_setmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_get __swig_getmethods__["update_index"] = _swigfaiss.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss.ClusteringParameters_update_index_get, _swigfaiss.ClusteringParameters_update_index_set) if _newclass:update_index = _swig_property(_swigfaiss.ClusteringParameters_update_index_get, _swigfaiss.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss.ClusteringParameters_frozen_centroids_get, _swigfaiss.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_set __swig_setmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_get __swig_getmethods__["min_points_per_centroid"] = _swigfaiss.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss.ClusteringParameters_min_points_per_centroid_get, _swigfaiss.ClusteringParameters_min_points_per_centroid_set) if _newclass:min_points_per_centroid = _swig_property(_swigfaiss.ClusteringParameters_min_points_per_centroid_get, _swigfaiss.ClusteringParameters_min_points_per_centroid_set)
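The new frozen_centroids field exposes the "kmean with some frozen centroids" feature from the commit message: centroids pre-loaded into a Clustering object are kept fixed while the remaining ones are trained. A minimal Python sketch of the intended usage, assuming the copy_array_to_vector / vector_to_array helpers of the faiss module (the helpers are not part of this diff):

import numpy as np
import faiss

d, k, k_frozen = 32, 10, 4
rs = np.random.RandomState(0)
x = rs.rand(1000, d).astype('float32')

clus = faiss.Clustering(d, k)
clus.niter = 10
clus.frozen_centroids = True   # keep the pre-seeded centroids fixed

# pre-seed k_frozen centroids; only the remaining k - k_frozen get updated
init = x[:k_frozen].copy()
faiss.copy_array_to_vector(init.ravel(), clus.centroids)

clus.train(x, faiss.IndexFlatL2(d))
centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)
# the frozen rows should come back unchanged
assert np.allclose(centroids[:k_frozen], init)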
...@@ -1720,6 +1754,9 @@ class IndexIVF(Index): ...@@ -1720,6 +1754,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss.IndexIVF_cp_set __swig_setmethods__["cp"] = _swigfaiss.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss.IndexIVF_cp_get __swig_getmethods__["cp"] = _swigfaiss.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss.IndexIVF_cp_get, _swigfaiss.IndexIVF_cp_set) if _newclass:cp = _swig_property(_swigfaiss.IndexIVF_cp_get, _swigfaiss.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss.IndexIVF_clustering_index_get, _swigfaiss.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss.IndexIVF_ids_set __swig_setmethods__["ids"] = _swigfaiss.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss.IndexIVF_ids_get __swig_getmethods__["ids"] = _swigfaiss.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss.IndexIVF_ids_get, _swigfaiss.IndexIVF_ids_set) if _newclass:ids = _swig_property(_swigfaiss.IndexIVF_ids_get, _swigfaiss.IndexIVF_ids_set)
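The clustering_index field lets the caller substitute the index used for the k-means assignment step when an IndexIVF is trained, for instance to run that step on a GPU while the final index stays on CPU. A hedged sketch (StandardGpuResources and index_cpu_to_gpu as exposed by the faiss GPU module; xt stands for an (n, d) float32 training matrix with enough points for 1024 lists):

import faiss

d = 64
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 1024, faiss.METRIC_L2)

# do the clustering assignments on GPU 0; the trained centroids end up
# in the CPU coarse quantizer as usual
res = faiss.StandardGpuResources()
gpu_flat = faiss.index_cpu_to_gpu(res, 0, faiss.IndexFlatL2(d))
index.clustering_index = gpu_flat  # keep a Python reference alive

index.train(xt)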
...@@ -1949,6 +1986,7 @@ class IndexIVFPQ(IndexIVF): ...@@ -1949,6 +1986,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_encode_multiple(self, *args) def encode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_decode_multiple(self, *args) def decode_multiple(self, *args): return _swigfaiss.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss.IndexIVFPQ_search_preassigned(self, *args) def search_preassigned(self, *args): return _swigfaiss.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss.IndexIVFPQ_precompute_table(self) def precompute_table(self): return _swigfaiss.IndexIVFPQ_precompute_table(self)
def __init__(self, *args): def __init__(self, *args):
this = _swigfaiss.new_IndexIVFPQ(*args) this = _swigfaiss.new_IndexIVFPQ(*args)
...@@ -2107,6 +2145,7 @@ class IndexIDMap(Index): ...@@ -2107,6 +2145,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss.IndexIDMap_train(self, *args) def train(self, *args): return _swigfaiss.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss.IndexIDMap_reset(self) def reset(self): return _swigfaiss.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss.IndexIDMap_remove_ids(self, *args) def remove_ids(self, *args): return _swigfaiss.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss.delete_IndexIDMap __swig_destroy__ = _swigfaiss.delete_IndexIDMap
__del__ = lambda self : None; __del__ = lambda self : None;
def __init__(self, *args): def __init__(self, *args):
...@@ -2775,6 +2814,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult) ...@@ -2775,6 +2814,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN(): def ignore_SIGTTIN():
return _swigfaiss.ignore_SIGTTIN() return _swigfaiss.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss.ignore_SIGTTIN ignore_SIGTTIN = _swigfaiss.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss.MapLong2Long_map_get, _swigfaiss.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes. # This file is compatible with both classic and new-style classes.
...@@ -80,6 +80,7 @@ class FloatVector(_object): ...@@ -80,6 +80,7 @@ class FloatVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.FloatVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVector_data(self) def data(self): return _swigfaiss_gpu.FloatVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVector_size(self) def size(self): return _swigfaiss_gpu.FloatVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.FloatVector_at(self, *args)
...@@ -89,6 +90,27 @@ class FloatVector(_object): ...@@ -89,6 +90,27 @@ class FloatVector(_object):
FloatVector_swigregister = _swigfaiss_gpu.FloatVector_swigregister FloatVector_swigregister = _swigfaiss_gpu.FloatVector_swigregister
FloatVector_swigregister(FloatVector) FloatVector_swigregister(FloatVector)
class DoubleVector(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, DoubleVector, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, DoubleVector, name)
__repr__ = _swig_repr
def __init__(self):
this = _swigfaiss_gpu.new_DoubleVector()
try: self.this.append(this)
except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.DoubleVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.DoubleVector_clear(self)
def data(self): return _swigfaiss_gpu.DoubleVector_data(self)
def size(self): return _swigfaiss_gpu.DoubleVector_size(self)
def at(self, *args): return _swigfaiss_gpu.DoubleVector_at(self, *args)
def resize(self, *args): return _swigfaiss_gpu.DoubleVector_resize(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_DoubleVector
__del__ = lambda self : None;
DoubleVector_swigregister = _swigfaiss_gpu.DoubleVector_swigregister
DoubleVector_swigregister(DoubleVector)
class ByteVector(_object): class ByteVector(_object):
__swig_setmethods__ = {} __swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value) __setattr__ = lambda self, name, value: _swig_setattr(self, ByteVector, name, value)
...@@ -100,6 +122,7 @@ class ByteVector(_object): ...@@ -100,6 +122,7 @@ class ByteVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.ByteVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVector_data(self) def data(self): return _swigfaiss_gpu.ByteVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVector_size(self) def size(self): return _swigfaiss_gpu.ByteVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.ByteVector_at(self, *args)
...@@ -120,6 +143,7 @@ class Uint64Vector(_object): ...@@ -120,6 +143,7 @@ class Uint64Vector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.Uint64Vector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.Uint64Vector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.Uint64Vector_clear(self)
def data(self): return _swigfaiss_gpu.Uint64Vector_data(self) def data(self): return _swigfaiss_gpu.Uint64Vector_data(self)
def size(self): return _swigfaiss_gpu.Uint64Vector_size(self) def size(self): return _swigfaiss_gpu.Uint64Vector_size(self)
def at(self, *args): return _swigfaiss_gpu.Uint64Vector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.Uint64Vector_at(self, *args)
...@@ -140,6 +164,7 @@ class LongVector(_object): ...@@ -140,6 +164,7 @@ class LongVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.LongVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVector_data(self) def data(self): return _swigfaiss_gpu.LongVector_data(self)
def size(self): return _swigfaiss_gpu.LongVector_size(self) def size(self): return _swigfaiss_gpu.LongVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.LongVector_at(self, *args)
...@@ -160,6 +185,7 @@ class IntVector(_object): ...@@ -160,6 +185,7 @@ class IntVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.IntVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.IntVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.IntVector_clear(self)
def data(self): return _swigfaiss_gpu.IntVector_data(self) def data(self): return _swigfaiss_gpu.IntVector_data(self)
def size(self): return _swigfaiss_gpu.IntVector_size(self) def size(self): return _swigfaiss_gpu.IntVector_size(self)
def at(self, *args): return _swigfaiss_gpu.IntVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.IntVector_at(self, *args)
...@@ -180,6 +206,7 @@ class VectorTransformVector(_object): ...@@ -180,6 +206,7 @@ class VectorTransformVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.VectorTransformVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.VectorTransformVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.VectorTransformVector_clear(self)
def data(self): return _swigfaiss_gpu.VectorTransformVector_data(self) def data(self): return _swigfaiss_gpu.VectorTransformVector_data(self)
def size(self): return _swigfaiss_gpu.VectorTransformVector_size(self) def size(self): return _swigfaiss_gpu.VectorTransformVector_size(self)
def at(self, *args): return _swigfaiss_gpu.VectorTransformVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.VectorTransformVector_at(self, *args)
...@@ -200,6 +227,7 @@ class OperatingPointVector(_object): ...@@ -200,6 +227,7 @@ class OperatingPointVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.OperatingPointVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.OperatingPointVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.OperatingPointVector_clear(self)
def data(self): return _swigfaiss_gpu.OperatingPointVector_data(self) def data(self): return _swigfaiss_gpu.OperatingPointVector_data(self)
def size(self): return _swigfaiss_gpu.OperatingPointVector_size(self) def size(self): return _swigfaiss_gpu.OperatingPointVector_size(self)
def at(self, *args): return _swigfaiss_gpu.OperatingPointVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.OperatingPointVector_at(self, *args)
...@@ -220,6 +248,7 @@ class FloatVectorVector(_object): ...@@ -220,6 +248,7 @@ class FloatVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.FloatVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.FloatVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.FloatVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.FloatVectorVector_data(self) def data(self): return _swigfaiss_gpu.FloatVectorVector_data(self)
def size(self): return _swigfaiss_gpu.FloatVectorVector_size(self) def size(self): return _swigfaiss_gpu.FloatVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.FloatVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.FloatVectorVector_at(self, *args)
...@@ -240,6 +269,7 @@ class ByteVectorVector(_object): ...@@ -240,6 +269,7 @@ class ByteVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.ByteVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.ByteVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.ByteVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.ByteVectorVector_data(self) def data(self): return _swigfaiss_gpu.ByteVectorVector_data(self)
def size(self): return _swigfaiss_gpu.ByteVectorVector_size(self) def size(self): return _swigfaiss_gpu.ByteVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.ByteVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.ByteVectorVector_at(self, *args)
...@@ -260,6 +290,7 @@ class LongVectorVector(_object): ...@@ -260,6 +290,7 @@ class LongVectorVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.LongVectorVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.LongVectorVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.LongVectorVector_clear(self)
def data(self): return _swigfaiss_gpu.LongVectorVector_data(self) def data(self): return _swigfaiss_gpu.LongVectorVector_data(self)
def size(self): return _swigfaiss_gpu.LongVectorVector_size(self) def size(self): return _swigfaiss_gpu.LongVectorVector_size(self)
def at(self, *args): return _swigfaiss_gpu.LongVectorVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.LongVectorVector_at(self, *args)
...@@ -280,6 +311,7 @@ class GpuResourcesVector(_object): ...@@ -280,6 +311,7 @@ class GpuResourcesVector(_object):
try: self.this.append(this) try: self.this.append(this)
except: self.this = this except: self.this = this
def push_back(self, *args): return _swigfaiss_gpu.GpuResourcesVector_push_back(self, *args) def push_back(self, *args): return _swigfaiss_gpu.GpuResourcesVector_push_back(self, *args)
def clear(self): return _swigfaiss_gpu.GpuResourcesVector_clear(self)
def data(self): return _swigfaiss_gpu.GpuResourcesVector_data(self) def data(self): return _swigfaiss_gpu.GpuResourcesVector_data(self)
def size(self): return _swigfaiss_gpu.GpuResourcesVector_size(self) def size(self): return _swigfaiss_gpu.GpuResourcesVector_size(self)
def at(self, *args): return _swigfaiss_gpu.GpuResourcesVector_at(self, *args) def at(self, *args): return _swigfaiss_gpu.GpuResourcesVector_at(self, *args)
...@@ -949,6 +981,9 @@ class ClusteringParameters(_object): ...@@ -949,6 +981,9 @@ class ClusteringParameters(_object):
__swig_setmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_set __swig_setmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_set
__swig_getmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_get __swig_getmethods__["update_index"] = _swigfaiss_gpu.ClusteringParameters_update_index_get
if _newclass:update_index = _swig_property(_swigfaiss_gpu.ClusteringParameters_update_index_get, _swigfaiss_gpu.ClusteringParameters_update_index_set) if _newclass:update_index = _swig_property(_swigfaiss_gpu.ClusteringParameters_update_index_get, _swigfaiss_gpu.ClusteringParameters_update_index_set)
__swig_setmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set
__swig_getmethods__["frozen_centroids"] = _swigfaiss_gpu.ClusteringParameters_frozen_centroids_get
if _newclass:frozen_centroids = _swig_property(_swigfaiss_gpu.ClusteringParameters_frozen_centroids_get, _swigfaiss_gpu.ClusteringParameters_frozen_centroids_set)
__swig_setmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set __swig_setmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set
__swig_getmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get __swig_getmethods__["min_points_per_centroid"] = _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get
if _newclass:min_points_per_centroid = _swig_property(_swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get, _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set) if _newclass:min_points_per_centroid = _swig_property(_swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_get, _swigfaiss_gpu.ClusteringParameters_min_points_per_centroid_set)
...@@ -1793,6 +1828,9 @@ class IndexIVF(Index): ...@@ -1793,6 +1828,9 @@ class IndexIVF(Index):
__swig_setmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_set __swig_setmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_set
__swig_getmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_get __swig_getmethods__["cp"] = _swigfaiss_gpu.IndexIVF_cp_get
if _newclass:cp = _swig_property(_swigfaiss_gpu.IndexIVF_cp_get, _swigfaiss_gpu.IndexIVF_cp_set) if _newclass:cp = _swig_property(_swigfaiss_gpu.IndexIVF_cp_get, _swigfaiss_gpu.IndexIVF_cp_set)
__swig_setmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_set
__swig_getmethods__["clustering_index"] = _swigfaiss_gpu.IndexIVF_clustering_index_get
if _newclass:clustering_index = _swig_property(_swigfaiss_gpu.IndexIVF_clustering_index_get, _swigfaiss_gpu.IndexIVF_clustering_index_set)
__swig_setmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_set __swig_setmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_set
__swig_getmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_get __swig_getmethods__["ids"] = _swigfaiss_gpu.IndexIVF_ids_get
if _newclass:ids = _swig_property(_swigfaiss_gpu.IndexIVF_ids_get, _swigfaiss_gpu.IndexIVF_ids_set) if _newclass:ids = _swig_property(_swigfaiss_gpu.IndexIVF_ids_get, _swigfaiss_gpu.IndexIVF_ids_set)
...@@ -2022,6 +2060,7 @@ class IndexIVFPQ(IndexIVF): ...@@ -2022,6 +2060,7 @@ class IndexIVFPQ(IndexIVF):
def encode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_encode_multiple(self, *args) def encode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_encode_multiple(self, *args)
def decode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_decode_multiple(self, *args) def decode_multiple(self, *args): return _swigfaiss_gpu.IndexIVFPQ_decode_multiple(self, *args)
def search_preassigned(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_preassigned(self, *args) def search_preassigned(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_preassigned(self, *args)
def search_and_reconstruct(self, *args): return _swigfaiss_gpu.IndexIVFPQ_search_and_reconstruct(self, *args)
def precompute_table(self): return _swigfaiss_gpu.IndexIVFPQ_precompute_table(self) def precompute_table(self): return _swigfaiss_gpu.IndexIVFPQ_precompute_table(self)
def __init__(self, *args): def __init__(self, *args):
this = _swigfaiss_gpu.new_IndexIVFPQ(*args) this = _swigfaiss_gpu.new_IndexIVFPQ(*args)
...@@ -2180,6 +2219,7 @@ class IndexIDMap(Index): ...@@ -2180,6 +2219,7 @@ class IndexIDMap(Index):
def train(self, *args): return _swigfaiss_gpu.IndexIDMap_train(self, *args) def train(self, *args): return _swigfaiss_gpu.IndexIDMap_train(self, *args)
def reset(self): return _swigfaiss_gpu.IndexIDMap_reset(self) def reset(self): return _swigfaiss_gpu.IndexIDMap_reset(self)
def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_remove_ids(self, *args) def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap_remove_ids(self, *args)
def range_search(self, *args): return _swigfaiss_gpu.IndexIDMap_range_search(self, *args)
__swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap __swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap
__del__ = lambda self : None; __del__ = lambda self : None;
def __init__(self, *args): def __init__(self, *args):
...@@ -2340,6 +2380,9 @@ class GpuMultipleClonerOptions(GpuClonerOptions): ...@@ -2340,6 +2380,9 @@ class GpuMultipleClonerOptions(GpuClonerOptions):
__swig_setmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set __swig_setmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set
__swig_getmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_get __swig_getmethods__["shard"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_get
if _newclass:shard = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set) if _newclass:shard = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_set)
__swig_setmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set
__swig_getmethods__["shard_type"] = _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get
if _newclass:shard_type = _swig_property(_swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_get, _swigfaiss_gpu.GpuMultipleClonerOptions_shard_type_set)
__swig_destroy__ = _swigfaiss_gpu.delete_GpuMultipleClonerOptions __swig_destroy__ = _swigfaiss_gpu.delete_GpuMultipleClonerOptions
__del__ = lambda self : None; __del__ = lambda self : None;
GpuMultipleClonerOptions_swigregister = _swigfaiss_gpu.GpuMultipleClonerOptions_swigregister GpuMultipleClonerOptions_swigregister = _swigfaiss_gpu.GpuMultipleClonerOptions_swigregister
...@@ -3256,6 +3299,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult) ...@@ -3256,6 +3299,27 @@ RangeSearchPartialResult_swigregister(RangeSearchPartialResult)
def ignore_SIGTTIN(): def ignore_SIGTTIN():
return _swigfaiss_gpu.ignore_SIGTTIN() return _swigfaiss_gpu.ignore_SIGTTIN()
ignore_SIGTTIN = _swigfaiss_gpu.ignore_SIGTTIN ignore_SIGTTIN = _swigfaiss_gpu.ignore_SIGTTIN
class MapLong2Long(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, MapLong2Long, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, MapLong2Long, name)
__repr__ = _swig_repr
__swig_setmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_set
__swig_getmethods__["map"] = _swigfaiss_gpu.MapLong2Long_map_get
if _newclass:map = _swig_property(_swigfaiss_gpu.MapLong2Long_map_get, _swigfaiss_gpu.MapLong2Long_map_set)
def add(self, *args): return _swigfaiss_gpu.MapLong2Long_add(self, *args)
def search(self, *args): return _swigfaiss_gpu.MapLong2Long_search(self, *args)
def search_multiple(self, *args): return _swigfaiss_gpu.MapLong2Long_search_multiple(self, *args)
def __init__(self):
this = _swigfaiss_gpu.new_MapLong2Long()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _swigfaiss_gpu.delete_MapLong2Long
__del__ = lambda self : None;
MapLong2Long_swigregister = _swigfaiss_gpu.MapLong2Long_swigregister
MapLong2Long_swigregister(MapLong2Long)
# This file is compatible with both classic and new-style classes. # This file is compatible with both classic and new-style classes.
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#pragma SWIG nowarn=321 #pragma SWIG nowarn=321
#pragma SWIG nowarn=403 #pragma SWIG nowarn=403
#pragma SWIG nowarn=325
typedef unsigned long uint64_t; typedef unsigned long uint64_t;
typedef uint64_t size_t; typedef uint64_t size_t;
...@@ -108,6 +109,7 @@ namespace std { ...@@ -108,6 +109,7 @@ namespace std {
public: public:
vector(); vector();
void push_back(T); void push_back(T);
void clear();
T * data(); T * data();
size_t size(); size_t size();
T at (size_t n) const; T at (size_t n) const;
...@@ -117,6 +119,7 @@ namespace std { ...@@ -117,6 +119,7 @@ namespace std {
%template(FloatVector) std::vector<float>; %template(FloatVector) std::vector<float>;
%template(DoubleVector) std::vector<double>;
%template(ByteVector) std::vector<uint8_t>; %template(ByteVector) std::vector<uint8_t>;
%template(Uint64Vector) std::vector<uint64_t>; %template(Uint64Vector) std::vector<uint64_t>;
%template(LongVector) std::vector<long>; %template(LongVector) std::vector<long>;
...@@ -709,4 +712,36 @@ void ignore_SIGTTIN() { ...@@ -709,4 +712,36 @@ void ignore_SIGTTIN() {
void ignore_SIGTTIN(); void ignore_SIGTTIN();
%inline %{
// numpy lacks a hash table implementation, hence this class. It
// represents not-found values as -1, as in the Index implementation
struct MapLong2Long {
std::unordered_map<long, long> map;
void add(size_t n, const long *keys, const long *vals) {
map.reserve(map.size() + n);
for (size_t i = 0; i < n; i++) {
map[keys[i]] = vals[i];
}
}
long search(long key) {
if (map.count(key) == 0) {
return -1;
} else {
return map[key];
}
}
void search_multiple(size_t n, const long *keys, long * vals) {
for (size_t i = 0; i < n; i++) {
vals[i] = search(keys[i]);
}
}
};
%}
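Intended numpy-side usage (the SWIG typemaps supply the size/pointer arguments; the new TestMapLong2Long unit test below exercises the same pattern):

import numpy as np
import faiss

m = faiss.MapLong2Long()
keys = np.array([13, 45, 67], dtype=np.int64)
vals = np.array([3, 8, 2], dtype=np.int64)
m.add(keys, vals)              # bulk insert
assert m.search(45) == 8
assert m.search(12343) == -1   # not-found marker
assert np.all(m.search_multiple(keys) == vals)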
// End of file... // End of file...
...@@ -17,8 +17,8 @@ class TestClustering(unittest.TestCase): ...@@ -17,8 +17,8 @@ class TestClustering(unittest.TestCase):
def test_clustering(self): def test_clustering(self):
d = 64 d = 64
n = 1000 n = 1000
np.random.seed(123) rs = np.random.RandomState(123)
x = np.random.random(size=(n, d)).astype('float32') x = rs.uniform(size=(n, d)).astype('float32')
km = faiss.Kmeans(d, 32, niter=10) km = faiss.Kmeans(d, 32, niter=10)
err32 = km.train(x) err32 = km.train(x)
...@@ -37,15 +37,35 @@ class TestClustering(unittest.TestCase): ...@@ -37,15 +37,35 @@ class TestClustering(unittest.TestCase):
def test_nasty_clustering(self): def test_nasty_clustering(self):
d = 2 d = 2
np.random.seed(123) rs = np.random.RandomState(123)
x = np.zeros((100, d), dtype='float32') x = np.zeros((100, d), dtype='float32')
for i in range(5): for i in range(5):
x[i * 20:i * 20 + 20] = np.random.random(size=d) x[i * 20:i * 20 + 20] = rs.uniform(size=d)
# we have 5 distinct points but ask for 10 centroids... # we have 5 distinct points but ask for 10 centroids...
km = faiss.Kmeans(d, 10, niter=10, verbose=True) km = faiss.Kmeans(d, 10, niter=10, verbose=True)
km.train(x) km.train(x)
def test_redo(self):
d = 64
n = 1000
rs = np.random.RandomState(123)
x = rs.uniform(size=(n, d)).astype('float32')
clus = faiss.Clustering(d, 20)
clus.nredo = 1
clus.train(x, faiss.IndexFlatL2(d))
obj1 = faiss.vector_to_array(clus.obj)
clus = faiss.Clustering(d, 20)
clus.nredo = 10
clus.train(x, faiss.IndexFlatL2(d))
obj10 = faiss.vector_to_array(clus.obj)
self.assertGreater(obj1[-1], obj10[-1])
class TestPCA(unittest.TestCase): class TestPCA(unittest.TestCase):
...@@ -87,7 +107,6 @@ class TestProductQuantizer(unittest.TestCase): ...@@ -87,7 +107,6 @@ class TestProductQuantizer(unittest.TestCase):
self.assertGreater(2500, diff) self.assertGreater(2500, diff)
class TestRevSwigPtr(unittest.TestCase): class TestRevSwigPtr(unittest.TestCase):
def test_rev_swig_ptr(self): def test_rev_swig_ptr(self):
...@@ -127,6 +146,19 @@ class TestException(unittest.TestCase): ...@@ -127,6 +146,19 @@ class TestException(unittest.TestCase):
else: else:
assert False, 'exception did not fire???' assert False, 'exception did not fire???'
class TestMapLong2Long(unittest.TestCase):
def test_do_it(self):
keys = np.array([13, 45, 67])
vals = np.array([3, 8, 2])
m = faiss.MapLong2Long()
m.add(keys, vals)
assert np.all(m.search_multiple(keys) == vals)
assert m.search(12343) == -1
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,15 +22,32 @@ def get_dataset(d, nb, nt, nq): ...@@ -22,15 +22,32 @@ def get_dataset(d, nb, nt, nq):
return (xt, xb, xq) return (xt, xb, xq)
def get_dataset_2(d, nb, nt, nq):
"""A dataset that is not completely random but still challenging to
index
"""
d1 = 10 # intrinsic dimension (more or less)
n = nb + nt + nq
rs = np.random.RandomState(1234)
x = rs.normal(size=(n, d1))
x = np.dot(x, rs.rand(d1, d))
# now we have a d1-dim ellipsoid in d-dimensional space
# higher factor (>4) -> higher frequency -> less linear
x = x * (rs.rand(d) * 4 + 0.1)
x = np.sin(x)
x = x.astype('float32')
return x[:nt], x[nt:-nq], x[-nq:]
class EvalIVFPQAccuracy(unittest.TestCase): class EvalIVFPQAccuracy(unittest.TestCase):
def test_IndexIVFPQ(self): def test_IndexIVFPQ(self):
d = 64 d = 32
nb = 1000 nb = 1000
nt = 1500 nt = 1500
nq = 200 nq = 200
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
d = xt.shape[1] d = xt.shape[1]
gt_index = faiss.IndexFlatL2(d) gt_index = faiss.IndexFlatL2(d)
...@@ -38,15 +55,15 @@ class EvalIVFPQAccuracy(unittest.TestCase): ...@@ -38,15 +55,15 @@ class EvalIVFPQAccuracy(unittest.TestCase):
D, gt_nns = gt_index.search(xq, 1) D, gt_nns = gt_index.search(xq, 1)
coarse_quantizer = faiss.IndexFlatL2(d) coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8) index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
index.train(xt) index.train(xt)
index.add(xb) index.add(xb)
index.nprobe = 5 index.nprobe = 4
D, nns = index.search(xq, 10) D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum() n_ok = (nns == gt_nns).sum()
nq = xq.shape[0] nq = xq.shape[0]
self.assertGreater(n_ok, nq * 0.4) self.assertGreater(n_ok, nq * 0.66)
class TestMultiIndexQuantizer(unittest.TestCase): class TestMultiIndexQuantizer(unittest.TestCase):
...@@ -78,16 +95,16 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -78,16 +95,16 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants_ivf(self): def test_4variants_ivf(self):
d = 32 d = 32
nt = 1500 nt = 2500
nq = 200 nq = 400
nb = 10000 nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
# common quantizer # common quantizer
quantizer = faiss.IndexFlatL2(d) quantizer = faiss.IndexFlatL2(d)
ncent = 128 ncent = 64
index_gt = faiss.IndexFlatL2(d) index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb) index_gt.add(xb)
...@@ -114,9 +131,12 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -114,9 +131,12 @@ class TestScalarQuantizer(unittest.TestCase):
D, I = index.search(xq, 10) D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok, nq)
print(nok) self.assertGreaterEqual(nok['flat'], nq * 0.6)
# The tests below are a bit fragile: the ordering between the
# uniform and non-uniform variants is occasionally reversed,
# probably because the small dataset introduces jitter
self.assertGreaterEqual(nok['flat'], nok['QT_8bit']) self.assertGreaterEqual(nok['flat'], nok['QT_8bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
...@@ -124,15 +144,15 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -124,15 +144,15 @@ class TestScalarQuantizer(unittest.TestCase):
def test_4variants(self): def test_4variants(self):
d = 32 d = 32
nt = 1500 nt = 2500
nq = 200 nq = 400
nb = 10000 nb = 5000
(xt, xb, xq) = get_dataset(d, nb, nt, nq) (xt, xb, xq) = get_dataset(d, nb, nt, nq)
index_gt = faiss.IndexFlatL2(d) index_gt = faiss.IndexFlatL2(d)
index_gt.add(xb) index_gt.add(xb)
D, I_ref = index_gt.search(xq, 10) D_ref, I_ref = index_gt.search(xq, 10)
nok = {} nok = {}
...@@ -141,12 +161,12 @@ class TestScalarQuantizer(unittest.TestCase): ...@@ -141,12 +161,12 @@ class TestScalarQuantizer(unittest.TestCase):
index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)
index.train(xt) index.train(xt)
index.add(xb) index.add(xb)
D, I = index.search(xq, 10) D, I = index.search(xq, 10)
nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
print(nok) self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9)
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
......
...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase): ...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else: else:
assert False, 'should have raised an exception' assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
class TestUpdate(unittest.TestCase): class TestUpdate(unittest.TestCase):
......
...@@ -67,6 +67,10 @@ int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a, ...@@ -67,6 +67,10 @@ int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
namespace faiss { namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
double getmillisecs () { double getmillisecs () {
struct timeval tv; struct timeval tv;
gettimeofday (&tv, nullptr); gettimeofday (&tv, nullptr);
...@@ -455,7 +459,7 @@ float fvec_norm_L2sqr_ref (const float * __restrict x, ...@@ -455,7 +459,7 @@ float fvec_norm_L2sqr_ref (const float * __restrict x,
/********************************************************* /*********************************************************
* SSE implementations * SSE and AVX implementations
*/ */
// reads 0 <= d < 4 floats as __m128 // reads 0 <= d < 4 floats as __m128
...@@ -475,7 +479,96 @@ static inline __m128 masked_read (int d, const float *x) ...@@ -475,7 +479,96 @@ static inline __m128 masked_read (int d, const float *x)
// cannot use AVX2 _mm_mask_set1_epi32 // cannot use AVX2 _mm_mask_set1_epi32
} }
#ifdef USE_AVX
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_inner_product (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
d -= 8;
}
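    // reduce the 256-bit accumulator msum1 to 128 bits by summing its two lanes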
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
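    // reduce the 256-bit accumulator msum1 to 128 bits by summing its two lanes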
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
#else
/* SSE-implementation of L2 distance */ /* SSE-implementation of L2 distance */
float fvec_L2sqr (const float * x, float fvec_L2sqr (const float * x,
...@@ -534,6 +627,7 @@ float fvec_inner_product (const float * x, ...@@ -534,6 +627,7 @@ float fvec_inner_product (const float * x,
#endif
float fvec_norm_L2sqr (const float * x, float fvec_norm_L2sqr (const float * x,
size_t d) size_t d)
...@@ -557,69 +651,6 @@ float fvec_norm_L2sqr (const float * x, ...@@ -557,69 +651,6 @@ float fvec_norm_L2sqr (const float * x,
/*********************************************************
* AVX implementations
*
* Disabled for now, it is not faster than SSE on current machines
* see P57425519
*/
#if 0
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res;
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_L2sqr (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
if (d > 0) {
// add the last 1, 2 or 3 values
__m256 mx = masked_read_8 (d, x);
__m256 my = masked_read_8 (d, y);
__m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
#endif
/*************************************************************************** /***************************************************************************
* Matrix/vector ops * Matrix/vector ops
...@@ -1365,16 +1396,18 @@ void pairwise_L2sqr (long d, ...@@ -1365,16 +1396,18 @@ void pairwise_L2sqr (long d,
#define EPS (1 / 1024.) #define EPS (1 / 1024.)
/* For k-means, compute centroids given assignment of vectors to centroids */ /* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
int km_update_centroids (const float * x, int km_update_centroids (const float * x,
float * centroids, float * centroids,
long * assign, long * assign,
size_t d, size_t k, size_t n) size_t d, size_t k, size_t n,
size_t k_frozen)
{ {
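    // the first k_frozen centroids are frozen: skip them during the update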
k -= k_frozen;
centroids += k_frozen * d;
std::vector<size_t> hassign(k); std::vector<size_t> hassign(k);
memset (centroids, 0, sizeof(*centroids) * d * k); memset (centroids, 0, sizeof(*centroids) * d * k);
#pragma omp parallel #pragma omp parallel
{ {
int nt = omp_get_num_threads(); int nt = omp_get_num_threads();
...@@ -1383,12 +1416,12 @@ int km_update_centroids (const float * x, ...@@ -1383,12 +1416,12 @@ int km_update_centroids (const float * x,
size_t c0 = (k * rank) / nt; size_t c0 = (k * rank) / nt;
size_t c1 = (k * (rank + 1)) / nt; size_t c1 = (k * (rank + 1)) / nt;
const float *xi = x; const float *xi = x;
// printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
size_t nacc = 0; size_t nacc = 0;
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
long ci = assign[i]; long ci = assign[i];
assert (ci >= 0 && ci < k); assert (ci >= 0 && ci < k + k_frozen);
ci -= k_frozen;
if (ci >= c0 && ci < c1) { if (ci >= c0 && ci < c1) {
float * c = centroids + ci * d; float * c = centroids + ci * d;
hassign[ci]++; hassign[ci]++;
...@@ -1398,7 +1431,6 @@ int km_update_centroids (const float * x, ...@@ -1398,7 +1431,6 @@ int km_update_centroids (const float * x,
} }
xi += d; xi += d;
} }
// printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
} }
......
...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a, ...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a,
void reflection (const float * u, float * x, size_t n, size_t d, size_t nu); void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
/** For k-means: update stage. Returns nb of split clusters. */ /** For k-means: update stage.
*
* @param x training vectors, size n * d
* @param centroids centroid vectors, size k * d
* @param assign nearest centroid for each training vector, size n
* @param k_frozen do not update the first k_frozen centroids
* @return number of splitting operations performed to fight empty clusters
*/
int km_update_centroids ( int km_update_centroids (
const float * x, const float * x,
float * centroids, float * centroids,
long * assign, long * assign,
size_t d, size_t k, size_t n); size_t d, size_t k, size_t n,
size_t k_frozen);
/** compute the Q of the QR decomposition for m > n /** compute the Q of the QR decomposition for m > n
* @param a size n * m: input matrix and output Q * @param a size n * m: input matrix and output Q
......
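To make the k_frozen semantics concrete, a reference sketch of the update stage in numpy (illustration only: the empty-cluster splitting whose count the C++ function returns is omitted):

import numpy as np

def km_update_centroids_ref(x, centroids, assign, k_frozen=0):
    # centroids[:k_frozen] stay untouched; every other centroid becomes
    # the mean of the training vectors assigned to it
    k, d = centroids.shape
    out = centroids.copy()
    for ci in range(k_frozen, k):
        mask = assign == ci
        if mask.any():
            out[ci] = x[mask].mean(axis=0)
    return out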