Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bug fixes from GitHub issues
k-means with support for frozen (user-provided) centroids
GPU: better tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
...@@ -77,10 +77,10 @@ IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R): ...@@ -77,10 +77,10 @@ IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R):
double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I) double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I)
const { const {
FAISS_THROW_IF_NOT_MSG( FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R), (gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R),
"ground truth not initialized"); "ground truth not initialized");
long n_ok = 0; long n_ok = 0;
#pragma omp parallel for reduction(+: n_ok) #pragma omp parallel for reduction(+: n_ok)
for (idx_t q = 0; q < nq; q++) { for (idx_t q = 0; q < nq; q++) {
n_ok += ranklist_intersection_size ( n_ok += ranklist_intersection_size (
...@@ -345,11 +345,13 @@ void ParameterSpace::initialize (const Index * index) ...@@ -345,11 +345,13 @@ void ParameterSpace::initialize (const Index * index)
} }
if (DC (IndexIVF)) { if (DC (IndexIVF)) {
ParameterRange & pr = add_range("nprobe"); {
for (int i = 0; i < 13; i++) { ParameterRange & pr = add_range("nprobe");
size_t nprobe = 1 << i; for (int i = 0; i < 13; i++) {
if (nprobe >= ix->nlist) break; size_t nprobe = 1 << i;
pr.values.push_back (nprobe); if (nprobe >= ix->nlist) break;
pr.values.push_back (nprobe);
}
} }
} }
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
...@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index) ...@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
} }
} }
if (DC (IndexIVFPQR)) { if (DC (IndexIVFPQR)) {
assert (ix);
ParameterRange & pr = add_range("k_factor"); ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) { for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i); pr.values.push_back (1 << i);
...@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter ( ...@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
// and fall through to also enable it on sub-indexes
} }
if (DC (IndexPreTransform)) { if (DC (IndexPreTransform)) {
index = ix->index; index = ix->index;
} }
if (DC (IndexShards)) {
// call on all sub-indexes
for (auto & shard_index : ix->shard_indexes) {
set_index_parameter (shard_index, name, val);
}
return;
}
if (name == "verbose") { if (name == "verbose") {
index->verbose = int(val); index->verbose = int(val);
// in case it was an IndexPreTransform
} }
if (DC (IndexRefineFlat)) { if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") { if (name == "k_factor_rf") {
...@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter ( ...@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
return; // last verbose that we could find return; // last verbose that we could find
} }
if (name == "nprobe") { if (name == "nprobe") {
DC(IndexIVF); if ( DC(IndexIVF)) {
ix->nprobe = int(val); ix->nprobe = int(val);
} else if (name == "ht") { return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) { if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ; ix->search_type = IndexPQ::ST_PQ;
...@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter ( ...@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
ix->search_type = IndexPQ::ST_polysemous; ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val); ix->polysemous_ht = int(val);
} }
return;
} else if (DC (IndexIVFPQ)) { } else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) { if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0; ix->polysemous_ht = 0;
} else { } else {
ix->polysemous_ht = int(val); ix->polysemous_ht = int(val);
} }
return;
} }
} else if (name == "k_factor") {
DC (IndexIVFPQR);
ix->k_factor = val;
} else if (name == "max_codes") {
DC (IndexIVFPQ);
ix->max_codes = finite(val) ? size_t(val) : 0;
} else {
FAISS_THROW_FMT (
"ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
} }
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVFPQ)) {
ix->max_codes = finite(val) ? size_t(val) : 0;
return;
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
} }
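A minimal Python sketch of what the new IndexShards branch in set_index_parameter enables: a parameter such as nprobe can now be pushed down into every shard in one call. The shard construction, sizes and data below are illustrative, and the standard SWIG bindings built from this revision are assumed.

    import numpy as np
    import faiss

    d, nlist = 32, 64
    xt = np.random.rand(5000, d).astype('float32')

    shards = faiss.IndexShards(d)
    keep_alive = []                       # keep Python references to the sub-objects
    for _ in range(2):
        quantizer = faiss.IndexFlatL2(d)
        sub = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
        sub.train(xt)
        keep_alive.append((quantizer, sub))
        shards.add_shard(sub)

    ps = faiss.ParameterSpace()
    ps.set_index_parameter(shards, 'nprobe', 8)   # now recurses into every shard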
void ParameterSpace::display () const void ParameterSpace::display () const
...@@ -634,6 +654,15 @@ struct VTChain { ...@@ -634,6 +654,15 @@ struct VTChain {
} }
}; };
/// what kind of training does this coarse quantizer require?
char get_trains_alone(const Index *coarse_quantizer) {
return
dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
0;
}
} }
Index *index_factory (int d, const char *description_in, MetricType metric) Index *index_factory (int d, const char *description_in, MetricType metric)
...@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok; tok;
tok = strtok_r (nullptr, " ,", &ptr)) { tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2; int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok); std::string stok(tok);
// to avoid mem leaks with exceptions: // to avoid mem leaks with exceptions:
...@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
} else if (stok == "L2norm") { } else if (stok == "L2norm") {
vt_1 = new NormalizationTransform (d, 2.0); vt_1 = new NormalizationTransform (d, 2.0);
// coarse quantizers
} else if (!coarse_quantizer && } else if (!coarse_quantizer &&
sscanf (tok, "IVF%d", &ncentroids) == 1) { sscanf (tok, "IVF%d", &ncentroids) == 1) {
if (metric == METRIC_L2) { if (metric == METRIC_L2) {
...@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVF *index_ivf = new IndexIVFFlat ( IndexIVF *index_ivf = new IndexIVFFlat (
coarse_quantizer, d, ncentroids, metric); coarse_quantizer, d, ncentroids, metric);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
...@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
new IndexIVFScalarQuantizer ( new IndexIVFScalarQuantizer (
coarse_quantizer, d, ncentroids, qt, metric); coarse_quantizer, d, ncentroids, qt, metric);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
...@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric) ...@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVFPQR *index_ivf = new IndexIVFPQR ( IndexIVFPQR *index_ivf = new IndexIVFPQR (
coarse_quantizer, d, ncentroids, M, 8, M2, 8); coarse_quantizer, d, ncentroids, M, 8, M2, 8);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_1 = index_ivf; index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d", &M) == 1) { } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
std::string soption = option;
// np to disable polysemous training
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) { if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ ( IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8); coarse_quantizer, d, ncentroids, M, 8);
index_ivf->quantizer_trains_alone = index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer) get_trains_alone (coarse_quantizer);
!= nullptr;
index_ivf->metric_type = metric; index_ivf->metric_type = metric;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT; index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release (); del_coarse_quantizer.release ();
index_ivf->own_fields = true; index_ivf->own_fields = true;
index_ivf->do_polysemous_training = true; index_ivf->do_polysemous_training = soption != "np";
index_1 = index_ivf; index_1 = index_ivf;
} else { } else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric); IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = true; index_pq->do_polysemous_training = soption != "np";
index_1 = index_pq; index_1 = index_pq;
} }
} else if (stok == "RFlat") { } else if (stok == "RFlat") {
make_IndexRefineFlat = true; make_IndexRefineFlat = true;
} else { } else {
......
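Sketch of the new "np" suffix on the PQ token in index_factory, which disables polysemous training for the product quantizer (per the do_polysemous_training assignments above). The dimensions and training data are illustrative; the generic train wrapper on the returned Index is assumed to accept numpy arrays as in faiss.py.

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(20000, d).astype('float32')

    # "np" after the number of sub-quantizers disables polysemous training
    index = faiss.index_factory(d, "IVF256,PQ8np")
    index.train(xt)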
...@@ -25,7 +25,7 @@ namespace faiss { ...@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while /** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method * minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in * do_allocation can be overloaded to allocate the result tables in
* the matrix type of a srcipting language like Lua or Python. */ * the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult { struct RangeSearchResult {
size_t nq; ///< nb of queries size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1) size_t *lims; ///< size (nq + 1)
......
...@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters (): ...@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
nredo(1), nredo(1),
verbose(false), spherical(false), verbose(false), spherical(false),
update_index(false), update_index(false),
frozen_centroids(false),
min_points_per_centroid(39), min_points_per_centroid(39),
max_points_per_centroid(256), max_points_per_centroid(256),
seed(1234) seed(1234)
...@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
float * dis = new float[nx]; float * dis = new float[nx];
ScopeDeleter<float> del2(dis); ScopeDeleter<float> del2(dis);
// for redo
float best_err = 1e50; float best_err = 1e50;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0; double t_search_tot = 0;
if (verbose) { if (verbose) {
printf(" Preprocessing in %.2f s\n", printf(" Preprocessing in %.2f s\n",
...@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int redo = 0; redo < nredo; redo++) { for (int redo = 0; redo < nredo; redo++) {
std::vector<float> buf_centroids;
std::vector<float> &cur_centroids =
nredo == 1 ? centroids : buf_centroids;
if (verbose && nredo > 1) { if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo); printf("Outer iteration %d / %d\n", redo, nredo);
} }
if (cur_centroids.size() == 0) {
// initialize centroids with random points from the dataset // initialize remaining centroids with random points from the dataset
cur_centroids.resize (d * k); centroids.resize (d * k);
std::vector<int> perm (nx); std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L); rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
#pragma omp parallel for for (int i = n_input_centroids; i < k ; i++)
for (int i = 0; i < k ; i++) memcpy (&centroids[i * d], x + perm[i] * d,
memcpy (&cur_centroids[i * d], x + perm[i] * d, d * sizeof (float));
d * sizeof (float));
} else { // assume user provides some meaningful initialization
FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
FAISS_THROW_IF_NOT_MSG (nredo == 1,
"will redo with same initialization");
}
if (spherical) if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data()); fvec_renorm_L2 (d, k, centroids.data());
if (!index.is_trained) if (!index.is_trained)
index.train (k, cur_centroids.data()); index.train (k, centroids.data());
FAISS_THROW_IF_NOT (index.ntotal == 0); FAISS_THROW_IF_NOT (index.ntotal == 0);
index.add (k, cur_centroids.data()); index.add (k, centroids.data());
float err = 0; float err = 0;
for (int i = 0; i < niter; i++) { for (int i = 0; i < niter; i++) {
double t0s = getmillisecs(); double t0s = getmillisecs();
...@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
err += dis[j]; err += dis[j];
obj.push_back (err); obj.push_back (err);
int nsplit = km_update_centroids (x, cur_centroids.data(), int nsplit = km_update_centroids (
assign, d, k, nx); x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) { if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): " printf (" Iteration %d (%.2f s, search %.2f s): "
...@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { ...@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
} }
if (spherical) if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data()); fvec_renorm_L2 (d, k, centroids.data());
index.reset (); index.reset ();
if (update_index) if (update_index)
index.train (k, cur_centroids.data()); index.train (k, centroids.data());
assert (index.ntotal == 0); assert (index.ntotal == 0);
index.add (k, cur_centroids.data()); index.add (k, centroids.data());
} }
if (verbose) printf("\n"); if (verbose) printf("\n");
if (nredo > 1) { if (nredo > 1) {
if (err < best_err) { if (err < best_err) {
if (verbose) if (verbose)
printf ("Objective improved: keep new clusters\n"); printf ("Objective improved: keep new clusters\n");
centroids = buf_centroids; best_centroids = centroids;
best_obj = obj;
best_err = err; best_err = err;
} }
index.reset (); index.reset ();
} }
} }
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
}
} }
......
...@@ -28,6 +28,7 @@ struct ClusteringParameters { ...@@ -28,6 +28,7 @@ struct ClusteringParameters {
bool verbose; bool verbose;
bool spherical; ///< do we want normalized centroids? bool spherical; ///< do we want normalized centroids?
bool update_index; ///< update index after each iteration? bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset int max_points_per_centroid; ///< to limit size of dataset
......
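A hedged Python sketch of the new frozen_centroids option: the caller pre-loads some centroids into Clustering::centroids, and with frozen_centroids set they are kept fixed while the remaining ones are initialized randomly and updated. The Clustering train wrapper accepting numpy arrays and the copy_array_to_vector / vector_to_array helpers added elsewhere in this commit are assumed; data and sizes are illustrative.

    import numpy as np
    import faiss

    d, k, n_frozen = 16, 20, 4
    xt = np.random.rand(10000, d).astype('float32')

    clus = faiss.Clustering(d, k)
    clus.frozen_centroids = True          # keep the provided centroids fixed
    clus.verbose = True                   # prints "Using 4 centroids provided as input (frozen)"

    # the first n_frozen centroids are supplied by the caller (illustrative values)
    init = xt[:n_frozen].copy()
    faiss.copy_array_to_vector(init.ravel(), clus.centroids)

    index = faiss.IndexFlatL2(d)
    clus.train(xt, index)
    centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)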
...@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) { ...@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const { void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("Can not compute reconstruct without " FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
"knowing how to do so");
} }
......
...@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, ...@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
nlist (nlist), nlist (nlist),
nprobe (1), nprobe (1),
quantizer (quantizer), quantizer (quantizer),
quantizer_trains_alone (false), quantizer_trains_alone (0),
own_fields (false), own_fields (false),
clustering_index (nullptr),
ids (nlist), ids (nlist),
maintain_direct_map (false) maintain_direct_map (false)
{ {
...@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist, ...@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF (): IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr), nlist (0), nprobe (1), quantizer (nullptr),
quantizer_trains_alone (false), own_fields (false), quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false) maintain_direct_map (false)
{} {}
...@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x) ...@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
if (quantizer->is_trained && (quantizer->ntotal == nlist)) { if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose) if (verbose)
printf ("IVF quantizer does not need training.\n"); printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone) { } else if (quantizer_trains_alone == 1) {
if (verbose) if (verbose)
printf ("IVF quantizer trains alone...\n"); printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x); quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist, FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size"); "nlist not consistent with quantizer size");
} else { } else if (quantizer_trains_alone == 0) {
if (verbose) if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n", printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d); n, d);
Clustering clus (d, nlist, cp); Clustering clus (d, nlist, cp);
quantizer->reset(); quantizer->reset();
clus.train (n, x, *quantizer); if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true; quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
} }
if (verbose) if (verbose)
printf ("Training IVF residual\n"); printf ("Training IVF residual\n");
...@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{ {
FAISS_THROW_IF_NOT (nlist == other.nlist); FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (!other.maintain_direct_map); FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2, FAISS_THROW_IF_NOT_FMT (
"this subset type is not implemented"); subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0; size_t accu_n = 0;
size_t accu_a1 = 0; size_t accu_a1 = 0;
...@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
other.ntotal++; other.ntotal++;
} }
} }
} else if (subset_type == 1) {
for (long i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
ids_out.push_back (id);
codes_out.insert (codes_out.end(),
codes_in.begin() + i * code_size,
codes_in.begin() + (i + 1) * code_size);
other.ntotal++;
}
}
} else if (subset_type == 2) { } else if (subset_type == 2) {
// see what is allocated to a1 and to a2 // see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n; size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal; size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1; size_t i1 = next_accu_a1 - accu_a1;
accu_a1 = next_accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal; size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2; size_t i2 = next_accu_a2 - accu_a2;
accu_a2 = next_accu_a2;
ids_out.insert(ids_out.end(), ids_out.insert(ids_out.end(),
ids_in.begin() + i1, ids_in.begin() + i1,
ids_in.begin() + i2); ids_in.begin() + i2);
...@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, ...@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
codes_in.begin() + i1 * code_size, codes_in.begin() + i1 * code_size,
codes_in.begin() + i2 * code_size); codes_in.begin() + i2 * code_size);
other.ntotal += i2 - i1; other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
} }
accu_n += n; accu_n += n;
} }
......
...@@ -47,10 +47,17 @@ struct IndexIVF: Index { ...@@ -47,10 +47,17 @@ struct IndexIVF: Index {
size_t nprobe; ///< number of probes at query time size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists Index * quantizer; ///< quantizer that maps vectors to inverted lists
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
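A short Python sketch of the new clustering_index field: the coarse k-means runs on the supplied index and the resulting centroids are then added to the quantizer. In practice the override would typically be a GPU flat index; a CPU IndexFlatL2 stands in for it here, and the sizes are illustrative.

    import numpy as np
    import faiss

    d, nlist = 32, 128
    xt = np.random.rand(10000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

    # run the coarse k-means on a separate index; the centroids found
    # by the clustering are then added to `quantizer`
    clustering_index = faiss.IndexFlatL2(d)   # keep a reference alive on the Python side
    index.clustering_index = clustering_index
    index.train(xt)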
...@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const ...@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
for (int j = 0; j < d; j++) { for (int j = 0; j < d; j++) {
r[j] += centroid[j]; r[j] += centroid[j];
} }
} } else {
else {
pq.decode (code_line + ofs * pq.code_size, r); pq.decode (code_line + ofs * pq.code_size, r);
} }
} }
...@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const ...@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{ {
FAISS_THROW_IF_NOT (direct_map.size() == ntotal); FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
int list_no = direct_map[key] >> 32; int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff; int ofs = direct_map[key] & 0xffffffff;
...@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k, ...@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
} }
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ () IndexIVFPQ::IndexIVFPQ ()
......
...@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF { ...@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels, float *distances, idx_t *labels,
bool store_pairs) const override; bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table /// build precomputed table
void precompute_table (); void precompute_table ();
......
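Usage sketch for the new search_and_reconstruct, going through the Python wrapper added further down in this commit (replacement_search_and_reconstruct). The reconstructed array has shape (n, k, d) and holds the PQ-decoded approximation of each result vector; dimensions and data here are illustrative.

    import numpy as np
    import faiss

    d, nlist, M, k = 32, 64, 8, 5
    xt = np.random.rand(10000, d).astype('float32')
    xb = np.random.rand(5000, d).astype('float32')
    xq = np.random.rand(10, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, M, 8)
    index.train(xt)
    index.add(xb)
    index.nprobe = 8

    D, I, R = index.search_and_reconstruct(xq, k)
    print(R.shape)   # (10, 5, 32): decoded approximations of the result vectors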
...@@ -124,8 +124,8 @@ struct Codec4bit { ...@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 { struct SimilarityL2 {
const float *y, *yi; const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/ /******* scalar accumulator *******/
...@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x, ...@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
size_t n) const size_t n) const
{ {
Quantizer *squant = select_quantizer (*this); Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
squant->encode_vector (x + i * d, codes + i * code_size); squant->encode_vector (x + i * d, codes + i * code_size);
delete squant;
} }
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{ {
Quantizer *squant = select_quantizer (*this); Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
squant->decode_vector (codes + i * code_size, x + i * d); squant->decode_vector (codes + i * code_size, x + i * d);
delete squant;
} }
/******************************************************************* /*******************************************************************
...@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search( ...@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
} }
ci += code_size; ci += code_size;
} }
minheap_reorder (k, simi, idxi);
} }
} else { } else {
#pragma omp parallel for #pragma omp parallel for
...@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search( ...@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
} }
ci += code_size; ci += code_size;
} }
maxheap_reorder (k, simi, idxi);
} }
} }
...@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids ...@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
int nt = omp_get_num_threads(); int nt = omp_get_num_threads();
int rank = omp_get_thread_num(); int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) { for (size_t i = 0; i < n; i++) {
long list_no = idx [i]; long list_no = idx [i];
...@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids ...@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
ntotal += nadd; ntotal += nadd;
} }
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index, void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
const float *x, const float *x,
...@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index, ...@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
maxheap_reorder (k, simi, idxi); maxheap_reorder (k, simi, idxi);
} }
} // anonymous namespace
void IndexIVFScalarQuantizer::search_preassigned ( void IndexIVFScalarQuantizer::search_preassigned (
idx_t n, const float *x, idx_t k, idx_t n, const float *x, idx_t k,
const idx_t *idx, const idx_t *idx,
......
...@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so ...@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
cp python/_swigfaiss.so python/swigfaiss.py . cp python/_swigfaiss.so python/swigfaiss.py .
############################# #############################
# Dependencies # Dependencies.
# make dep > x
# then copy/paste from x by hand below
# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done dep:
for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
cpp -MM -std=gnu++0x $$i ; \
done
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
FaissException.o: FaissException.cpp FaissException.h
hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
Heap.o: Heap.cpp Heap.h utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \ IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h AuxIndexStructures.h FaissAssert.h FaissException.h AuxIndexStructures.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \ IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
hamming.h FaissAssert.h FaissException.h IndexFlat.h \ hamming.h FaissAssert.h FaissException.h IndexFlat.h \
AuxIndexStructures.h AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \ IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
Heap.h hamming.h FaissAssert.h FaissException.h Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \ IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \ IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \ Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
FaissException.h IndexFlat.h hamming.h FaissAssert.h FaissException.h \
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \ AuxIndexStructures.h
FaissException.h Heap.h AuxIndexStructures.h Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \ PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \ ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
FaissAssert.h FaissException.h FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \ ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \ Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
IndexFlat.h utils.h IndexFlat.h utils.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \ AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissAssert.h FaissException.h FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \ IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \ IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
Clustering.h PolysemousTraining.h AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
FaissException.o: FaissException.cpp FaissException.h
clean: clean:
......
...@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k, ...@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
} }
} }
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace { namespace {
struct IDTranslatedSelector: IDSelector { struct IDTranslatedSelector: IDSelector {
...@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel) ...@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
} }
FAISS_ASSERT (j == index->ntotal); FAISS_ASSERT (j == index->ntotal);
ntotal = j; ntotal = j;
id_map.resize(ntotal);
return nremove; return nremove;
} }
......
...@@ -51,6 +51,9 @@ struct IndexIDMap : Index { ...@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
/// remove ids adapted to IndexFlat /// remove ids adapted to IndexFlat
long remove_ids(const IDSelector& sel) override; long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
~IndexIDMap() override; ~IndexIDMap() override;
IndexIDMap () {own_fields=false; index=nullptr; } IndexIDMap () {own_fields=false; index=nullptr; }
}; };
......
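Sketch of the new IndexIDMap::range_search, which forwards to the wrapped index and then translates the result labels back to the user-provided ids. The Python range_search wrapper (returning lims, D, I) is assumed; data, ids and the radius are illustrative.

    import numpy as np
    import faiss

    d = 16
    xb = np.random.rand(200, d).astype('float32')
    ids = np.arange(1000, 1200).astype('int64')

    sub_index = faiss.IndexFlatL2(d)      # keep a reference so it is not collected
    index = faiss.IndexIDMap(sub_index)
    index.add_with_ids(xb, ids)

    lims, D, I = index.range_search(xb[:3], 0.5)
    # I contains the user-provided ids (1000..1199), remapped by the new method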
...@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x) ...@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
const float *prev_x = x; const float *prev_x = x;
ScopeDeleter<float> del; ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) { for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) { if (i < chain.size()) {
VectorTransform *ltrans = chain [i]; VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained) if (!ltrans->is_trained) {
ltrans->train(n, prev_x); if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else { } else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x); index->train (n, prev_x);
} }
if (i == last_untrained) break; if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x); float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete prev_x; if (prev_x != x) delete [] prev_x;
prev_x = xt; prev_x = xt;
del.set(xt); del.set(xt);
} }
......
...@@ -521,7 +521,7 @@ def compute_populated_index(preproc): ...@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
co.verbose = True co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0] co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev() vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple( gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co) vres, vdev, indexall, co)
...@@ -630,7 +630,7 @@ def get_populated_index(preproc): ...@@ -630,7 +630,7 @@ def get_populated_index(preproc):
co.usePrecomputed = use_precomputed_tables co.usePrecomputed = use_precomputed_tables
co.indicesOptions = 0 co.indicesOptions = 0
co.verbose = True co.verbose = True
co.shard = True # the replicas will be made "manually" co.shard = True # the replicas will be made "manually"
t0 = time.time() t0 = time.time()
print "CPU index contains %d vectors, move to GPU" % indexall.ntotal print "CPU index contains %d vectors, move to GPU" % indexall.ntotal
if replicas == 1: if replicas == 1:
......
...@@ -121,6 +121,18 @@ def handle_Index(the_class): ...@@ -121,6 +121,18 @@ def handle_Index(the_class):
swig_ptr(labels)) swig_ptr(labels))
return distances, labels return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
def replacement_remove_ids(self, x): def replacement_remove_ids(self, x):
if isinstance(x, IDSelector): if isinstance(x, IDSelector):
sel = x sel = x
...@@ -167,6 +179,8 @@ def handle_Index(the_class): ...@@ -167,6 +179,8 @@ def handle_Index(the_class):
replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'update_vectors', replacement_update_vectors, replace_method(the_class, 'update_vectors', replacement_update_vectors,
ignore_missing=True) ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class): def handle_VectorTransform(the_class):
...@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None): ...@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
return index_cpu_to_gpu_multiple(vres, vdev, index, co) return index_cpu_to_gpu_multiple(vres, vdev, index, co)
def vector_float_to_array(v): def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
a = np.empty(v.size(), dtype='float32') if ngpu == -1:
memcpy(swig_ptr(a), v.data(), 4 * v.size()) ngpu = get_num_gpus()
res = [StandardGpuResources() for i in range(ngpu)]
index2 = index_cpu_to_gpu_multiple_py(res, index, co)
index2.dont_dealloc = res
return index2
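Usage sketch for the new index_cpu_to_all_gpus helper (GPU build assumed). By default it replicates the index over all visible GPUs; with GpuMultipleClonerOptions it can shard instead, using the shard_type field added further down in this commit. The cpu_ivf_index name in the commented part is hypothetical.

    import numpy as np
    import faiss   # GPU build assumed

    d = 64
    xb = np.random.rand(100000, d).astype('float32')

    cpu_index = faiss.IndexFlatL2(d)
    cpu_index.add(xb)

    # replicate over all visible GPUs (ngpu=-1 means "use them all")
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)

    # or shard an IVF index instead of replicating it:
    # co = faiss.GpuMultipleClonerOptions()
    # co.shard = True
    # co.shard_type = 1        # split by id modulo rather than by sequential ranges
    # gpu_index = faiss.index_cpu_to_all_gpus(cpu_ivf_index, co)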
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
return a return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
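A short round-trip example for the generalized vector_to_array / copy_array_to_vector helpers, here applied to the float storage of an IndexFlat; the index and data are illustrative.

    import numpy as np
    import faiss

    index = faiss.IndexFlatL2(8)
    index.add(np.random.rand(100, 8).astype('float32'))

    # pull the flat index storage out as a numpy array
    xb = faiss.vector_to_array(index.xb).reshape(index.ntotal, index.d)

    # round-trip a numpy array through a C++ std::vector<float>
    v = faiss.FloatVector()
    faiss.copy_array_to_vector(xb.ravel(), v)
    assert np.array_equal(faiss.vector_to_array(v), xb.ravel())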
class Kmeans: class Kmeans:
def __init__(self, d, k, niter=25, verbose=False, spherical = False): def __init__(self, d, k, niter=25, verbose=False, spherical = False):
...@@ -364,3 +418,18 @@ def eval_intersection(I1, I2): ...@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):
def normalize_L2(x): def normalize_L2(x):
fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x)) fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
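A small sketch of the MapLong2Long wrappers added above, assuming the MapLong2Long type is exposed by the SWIG module in this build; keys and values are illustrative. search_multiple returns a uint64 array per the wrapper.

    import numpy as np
    import faiss

    m = faiss.MapLong2Long()
    keys = np.arange(10, dtype='int64')
    vals = (keys * 2 + 1).astype('int64')

    m.add(keys, vals)                      # wrapped by replacement_map_add
    found = m.search_multiple(keys)        # uint64 array, one entry per key
    assert np.all(found.astype('int64') == vals)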
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved. // Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuAutoTune.h" #include "GpuAutoTune.h"
#include <typeinfo>
#include "GpuIndex.h" #include "GpuIndex.h"
#include "../FaissAssert.h" #include "../FaissAssert.h"
...@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index) ...@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)
GpuClonerOptions::GpuClonerOptions():
indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(true),
reserveVecs(0),
storeTransposed(false),
verbose(0)
{}
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions { struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
GpuResources *resources; GpuResources *resources;
int device; int device;
...@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu( ...@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
return cl.clone_Index(index); return cl.clone_Index(index);
} }
GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
{}
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners; std::vector<ToGpuCloner> sub_cloners;
...@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
{} {}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
Index *clone_Index(const Index *index) override { Index *clone_Index(const Index *index) override {
long n = sub_cloners.size(); long n = sub_cloners.size();
if (n == 1) if (n == 1)
...@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
dynamic_cast<const faiss::IndexIVFPQ *>(index); dynamic_cast<const faiss::IndexIVFPQ *>(index);
auto index_ivfflat = auto index_ivfflat =
dynamic_cast<const faiss::IndexIVFFlat *>(index); dynamic_cast<const faiss::IndexIVFFlat *>(index);
FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat, FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
"IndexShards implemented only for " "IndexShards implemented only for "
"IndexIVFFlat or IndexIVFPQ"); "IndexIVFFlat or IndexIVFPQ");
std::vector<faiss::Index*> shards(n); std::vector<faiss::Index*> shards(n);
for(long i = 0; i < n; i++) { for(long i = 0; i < n; i++) {
// make a shallow copy // make a shallow copy
long i0 = i * index->ntotal / n;
long i1 = (i + 1) * index->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
if(reserveVecs) if(reserveVecs)
sub_cloners[i].reserveVecs = sub_cloners[i].reserveVecs =
(reserveVecs + n - 1) / n; (reserveVecs + n - 1) / n;
...@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { ...@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
idx2.nprobe = index_ivfpq->nprobe; idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0; idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained; idx2.is_trained = index->is_trained;
index_ivfpq->copy_subset_to(idx2, 2, i0, i1); copy_ivf_shard (index_ivfpq, &idx2, n, i);
FAISS_ASSERT(idx2.ntotal == i1 - i0);
shards[i] = sub_cloners[i].clone_Index(&idx2); shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) { } else if (index_ivfflat) {
faiss::IndexIVFFlat idx2( faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer, index->d, index_ivfflat->quantizer, index->d,
index_ivfflat->nlist, index_ivfflat->metric_type); index_ivfflat->nlist, index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe; idx2.nprobe = index_ivfflat->nprobe;
index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
idx2.nprobe = index_ivfflat->nprobe; idx2.nprobe = index_ivfflat->nprobe;
copy_ivf_shard (index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2); shards[i] = sub_cloners[i].clone_Index(&idx2);
} }
} }
faiss::IndexShards *res = faiss::IndexShards *res =
new faiss::IndexShards(index->d, true, false); new faiss::IndexShards(index->d, true, false);
...@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index) ...@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter ( void GpuParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const Index * index, const std::string & name, double val) const
{ {
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexProxy)) { if (DC (IndexProxy)) {
for (int i = 0; i < ix->count(); i++) for (int i = 0; i < ix->count(); i++)
set_index_parameter (ix->at(i), name, val); set_index_parameter (ix->at(i), name, val);
return; return;
} }
if (DC (faiss::IndexShards)) { if (DC (GpuIndexIVF)) {
for (auto sub_index : ix->shard_indexes) if (name == "nprobe") {
set_index_parameter (sub_index, name, val); ix->setNumProbes (int (val));
return; return;
} }
if (name == "nprobe") {
DC (GpuIndexIVF);
FAISS_ASSERT(ix);
ix->setNumProbes (int (val));
return;
} }
if (name == "use_precomputed_table") { if(DC (GpuIndexIVFPQ)) {
DC (GpuIndexIVFPQ); if (name == "use_precomputed_table") {
FAISS_ASSERT(ix); ix->setPrecomputedCodes(bool (val));
ix->setPrecomputedCodes(bool (val)); return;
return; }
} }
FAISS_ASSERT_MSG (false, "unknown parameter"); // maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val);
} }
......
...@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions() ...@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
} }
GpuMultipleClonerOptions::GpuMultipleClonerOptions() GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false) { : shard(false),
shard_type(1)
{
} }
} } // namespace } } // namespace
...@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions { ...@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
/// Whether to shard the index across GPUs, versus replication /// Whether to shard the index across GPUs, versus replication
/// across GPUs /// across GPUs
bool shard; bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
}; };
} } // namespace } } // namespace
...@@ -26,7 +26,7 @@ struct GpuIndexConfig { ...@@ -26,7 +26,7 @@ struct GpuIndexConfig {
/// GPU device on which the index is resident /// GPU device on which the index is resident
int device; int device;
/// What memory space to use for primary storae. /// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use /// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU. /// more memory than is available on the GPU.
MemorySpace memorySpace; MemorySpace memorySpace;
......
...@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { ...@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
} }
index->quantizer = q; index->quantizer = q;
index->quantizer_trains_alone = false; index->quantizer_trains_alone = 0;
index->own_fields = true; index->own_fields = true;
index->cp = this->cp; index->cp = this->cp;
index->ids.clear(); index->ids.clear();
......
...@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { ...@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
FAISS_ASSERT(index->pq.byte_per_idx == 1); FAISS_ASSERT(index->pq.byte_per_idx == 1);
FAISS_ASSERT(index->by_residual); FAISS_ASSERT(index->by_residual);
FAISS_ASSERT(index->polysemous_ht == 0); FAISS_ASSERT(index->polysemous_ht == 0);
ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;
verifySettings_(); verifySettings_();
......
This diff is collapsed.
This diff is collapsed.
...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources, ...@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can // Do we care about `outDistances`? If not, we can
// take shortcuts. // take shortcuts.
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
/// Calculates brute-force inner product distance between `vectors` /// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen /// and `queries`, returning the k closest results seen
...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources, ...@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries, Tensor<float, 2, true>& queries,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices);
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources, void runIPDistance(GpuResources* resources,
...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources, ...@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm);
int tileSizeOverride = -1);
void runL2Distance(GpuResources* resources, void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors, Tensor<half, 2, true>& vectors,
...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources, ...@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool useHgemm, bool useHgemm,
bool ignoreOutDistances = false, bool ignoreOutDistances = false);
int tileSizeOverride = -1);
#endif #endif
} } // namespace } } // namespace
...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
auto stream = resources_->getDefaultStreamCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice(); auto& mem = resources_->getMemoryManagerCurrentDevice();
...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf( DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize); query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) { if (exactDistance) {
// Convert outDistances back // Convert outDistances back
...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances, outDistances,
outIndices, outIndices,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectors_, vectors_,
...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input, ...@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input, input,
k, k,
outDistances, outDistances,
outIndices, outIndices);
tileSize);
} }
} }
} }
...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance) {
int tileSize) {
FAISS_ASSERT(useFloat16_); FAISS_ASSERT(useFloat16_);
if (l2Distance_) { if (l2Distance_) {
...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_,
// FIXME // FIXME
!exactDistance, !exactDistance);
tileSize);
} else { } else {
runIPDistance(resources_, runIPDistance(resources_,
vectorsHalf_, vectorsHalf_,
...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input, ...@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k, k,
outDistances, outDistances,
outIndices, outIndices,
useFloat16Accumulator_, useFloat16Accumulator_);
tileSize);
} }
} }
#endif #endif
...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { ...@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(), rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(), devDataHalf.getSizeInBytes(),
stream); stream,
true /* reserve exactly */);
#endif #endif
} else { } else {
rawData_.append((char*) data, rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float), (size_t) dim_ * numVecs * sizeof(float),
stream); stream,
true /* reserve exactly */);
} }
num_ += numVecs; num_ += numVecs;
......
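Aside: the extra `true /* reserve exactly */` argument to rawData_.append presumably asks the buffer to allocate exactly the bytes this append needs instead of growing geometrically, which keeps large adds from overshooting device memory. A standalone C++ sketch of that trade-off (illustrative only, not the DeviceVector API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Sketch only: contrast geometric growth (amortized appends, with capacity
    // overshoot) against reserving exactly the bytes each append needs.
    int main() {
      std::vector<char> geometric, exact;
      const std::size_t appendBytes = 1 << 20;   // 1 MiB per append
      for (int i = 0; i < 4; ++i) {
        geometric.resize(geometric.size() + appendBytes);    // grows on its own
        exact.reserve(exact.size() + appendBytes);           // reserve exactly
        exact.resize(exact.size() + appendBytes);
        std::printf("append %d: geometric capacity %zu, exact capacity %zu\n",
                    i, geometric.capacity(), exact.capacity());
      }
      return 0;
    }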
...@@ -61,16 +61,14 @@ class FlatIndex { ...@@ -61,16 +61,14 @@ class FlatIndex {
int k, int k,
Tensor<float, 2, true>& outDistances, Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs, void query(Tensor<half, 2, true>& vecs,
int k, int k,
Tensor<half, 2, true>& outDistances, Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool exactDistance, bool exactDistance);
int tileSize = -1);
#endif #endif
/// Add vectors to ourselves; the pointer passed can be on the host /// Add vectors to ourselves; the pointer passed can be on the host
......
...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs, ...@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView, closestSubQDistanceView,
closestSubQIndexView, closestSubQIndexView,
// We don't care about distances // We don't care about distances
true, true);
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
} }
// Now, we have the nearest sub-q centroid for each slice of the // Now, we have the nearest sub-q centroid for each slice of the
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir> template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void __global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
...@@ -38,9 +35,9 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, ...@@ -38,9 +35,9 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
auto queryId = blockIdx.y; auto queryId = blockIdx.y;
......
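Aside on the kMin/kMax removal above: std::numeric_limits<float>::min() is the smallest positive normalized float, not the most negative value, so it is a poor "smallest" sentinel for a descending selection; the kFloatMin/kFloatMax constants now taken from Limits.cuh presumably carry the intended +/- largest-magnitude values. Standalone illustration:

    #include <cfloat>
    #include <cstdio>
    #include <limits>

    int main() {
      // min() is the smallest positive normalized value, not the most negative one.
      std::printf("numeric_limits<float>::min()    = %g\n",
                  std::numeric_limits<float>::min());      // ~1.17549e-38
      std::printf("numeric_limits<float>::lowest() = %g\n",
                  std::numeric_limits<float>::lowest());   // ~-3.40282e+38
      std::printf("-FLT_MAX                        = %g\n", -FLT_MAX);
      return 0;
    }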
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step // This is warp divergence central, but this is really a final step
// and happening a small number of times // and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets, inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances, ...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu { ...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half) // T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g., // TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2) // float4, half2)
template <typename T, typename TVec, template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared> int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input, __global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true> output) { Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte; T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize); TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId(); TIndex laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize; TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x; TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize]; T rowNorm[RowTileSize];
if (lastRowTile) { if (lastRowTile) {
// We are handling the very end of the input matrix rows // We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) { for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) { if (NormLoop) {
rowNorm[0] = Math<T>::zero(); rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col]; TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val); val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val)); rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero(); rowNorm[row] = Math<T>::zero();
} }
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll #pragma unroll
for (int row = 0; row < RowTileSize; ++row) { for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col]; tmp[row] = input[rowStart + row][col];
...@@ -172,44 +174,44 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -172,44 +174,44 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
} }
} }
template <typename T, typename TVec> template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true>& input, void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true>& output, Tensor<T, 1, true, TIndex>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice(); TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8; constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \ #define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \ do { \
if (normLoop) { \ if (normLoop) { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} else { \ } else { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} \ } \
} while (0) } while (0)
if (input.template canCastResize<TVec>()) { if (input.template canCastResize<TVec>()) {
// Can load using the vectorized type // Can load using the vectorized type
auto inputV = input.template castResize<TVec>(); auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1); auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else { } else {
// Can't load using the vectorized type // Can't load using the vectorized type
int dim = input.getSize(1); auto dim = input.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input, ...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output, Tensor<float, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input, ...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output, Tensor<half, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
} }
#endif #endif
......
...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs"); ...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions"); DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors"); DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32"); DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed"); DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed"); DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use"); DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu; using namespace faiss::gpu;
...@@ -72,7 +75,10 @@ int main(int argc, char** argv) { ...@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config; GpuIndexFlatConfig config;
config.device = dev; config.device = dev;
config.useFloat16 = FLAGS_use_float16; config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed; config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>( auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config)); new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
...@@ -90,9 +96,9 @@ int main(int argc, char** argv) { ...@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
float cpuTime = 0.0f; if (FLAGS_cpu) {
float cpuTime = 0.0f;
{
CpuTimer timer; CpuTimer timer;
index->search(numQueries, index->search(numQueries,
cpuQuery.data(), cpuQuery.data(),
...@@ -101,10 +107,9 @@ int main(int argc, char** argv) { ...@@ -101,10 +107,9 @@ int main(int argc, char** argv) {
cpuIndices.data()); cpuIndices.data());
cpuTime = timer.elapsedMilliseconds(); cpuTime = timer.elapsedMilliseconds();
printf("CPU time %.3f ms\n", cpuTime);
} }
printf("CPU time %.3f ms\n", cpuTime);
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
...@@ -131,14 +136,14 @@ int main(int argc, char** argv) { ...@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop()); CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime); printf("GPU time %.3f ms\n", gpuTime);
compareLists(cpuDistances.data(), cpuIndices.data(), if (FLAGS_cpu) {
gpuDistances.data(), gpuIndices.data(), compareLists(cpuDistances.data(), cpuIndices.data(),
numQueries, FLAGS_k, gpuDistances.data(), gpuIndices.data(),
"", true, FLAGS_diff, false); numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize()); CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0; return 0;
} }
...@@ -21,29 +21,47 @@ ...@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f; constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f; constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2, struct TestFlatOptions {
bool useFloat16, TestFlatOptions()
bool useTransposed, : useL2(true),
int kOverride = -1) { useFloat16(false),
int numVecs = faiss::gpu::randVal(1000, 20000); useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800); int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512); int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k, // Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a // the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16 // fairly small `k` for float16
int k = useFloat16 ? int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) : std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs); std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) { if (opt.kOverride > 0) {
k = kOverride; k = opt.kOverride;
} }
faiss::IndexFlatIP cpuIndexIP(dim); faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim); faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex = faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP; opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have // Construct on a random device to test multi-device, if we have
// multiple devices // multiple devices
...@@ -55,14 +73,14 @@ void testFlat(bool useL2, ...@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config; faiss::gpu::GpuIndexFlatConfig config;
config.device = device; config.device = device;
config.useFloat16 = useFloat16; config.useFloat16 = opt.useFloat16;
config.storeTransposed = useTransposed; config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex = faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP; (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim); std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
...@@ -70,37 +88,53 @@ void testFlat(bool useL2, ...@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data()); gpuIndex->add(numVecs, vecs.data());
std::stringstream str; std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim << " dim " << dim
<< " useFloat16 " << useFloat16 << " useFloat16 " << opt.useFloat16
<< " transposed " << useTransposed << " transposed " << opt.useTransposed
<< " numQuery " << numQuery << " numQuery " << numQuery
<< " k " << k; << " k " << k;
// To some extent, we depend upon the relative error for the test // To some extent, we depend upon the relative error for the test
// for float16 // for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are // FIXME: the fp16 bounds are
// useless when math (the accumulator) is // useless when math (the accumulator) is
// in fp16. Figure out another way to test // in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f, opt.useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f); opt.useFloat16 ? 0.65f : 0.015f);
} }
TEST(TestGpuIndexFlat, IP_Float32) { TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) { ...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) { ...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
......
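The new L2_Tiling test (one million vectors) targets the "better tiling for large flat datasets" item in the commit message: a brute-force query cannot always materialize the full query-by-database distance matrix, so it is produced tile by tile and the k best results are merged across tiles. Illustrative arithmetic only; the actual tile sizes are chosen internally:

    #include <cstdio>

    // Sketch only: why brute-force distance computation gets tiled. The full
    // query x database distance matrix may not fit in temporary GPU memory,
    // so it is produced in tiles and the k best results are merged across tiles.
    int main() {
      const long long numQueries = 10000;
      const long long numVecs = 1000000;
      const long long bytesFull = numQueries * numVecs * 4;   // float32
      const long long tileQueries = 256, tileVecs = 65536;
      const long long bytesTile = tileQueries * tileVecs * 4;
      std::printf("full distance matrix: %.1f GB\n", bytesFull / 1e9);
      std::printf("one %lld x %lld tile: %.1f MB\n",
                  tileQueries, tileVecs, bytesTile / 1e6);
      return 0;
    }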
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h" #include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h" #include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { ...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false); copyToTest(false, false);
} }
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
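The Float32_negative test keeps the database in the negative orthant and the queries in the positive one, so every inner product is non-positive and the top hit is the least negative score, exercising max-selection over negative values. A tiny CPU illustration:

    #include <cstdio>
    #include <vector>

    // Sketch only: with all-negative database vectors and an all-positive query,
    // every inner product is <= 0; the best match is the least negative score.
    static float ip(const std::vector<float>& a, const std::vector<float>& b) {
      float s = 0.0f;
      for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
      return s;
    }

    int main() {
      std::vector<float> query = {0.5f, 0.25f};
      std::vector<float> d0 = {-0.1f, -0.1f};   // least negative IP -> top hit
      std::vector<float> d1 = {-2.0f, -3.0f};
      std::printf("ip(q, d0) = %f, ip(q, d1) = %f\n", ip(query, d0), ip(query, d1));
      return 0;
    }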
// //
// NaN tests // NaN tests
// //
......
...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) { ...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v; return v;
} }
void compareIndices(faiss::Index& refIndex, void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
const std::string& configMsg, const std::string& configMsg,
float maxRelativeError, float maxRelativeError,
float pctMaxDiff1, float pctMaxDiff1,
float pctMaxDiffN) { float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare // Compare
std::vector<float> refDistance(numQuery * k, 0); std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(), refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data()); k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0); std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(), testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data()); k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(), faiss::gpu::compareLists(refDistance.data(),
...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex, ...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN); maxRelativeError, pctMaxDiff1, pctMaxDiffN);
} }
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T> template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) { inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j]; return p[i * dim2 + j];
......
...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) { ...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1] /// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim); std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity /// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex, void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
......
...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None): ...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor) assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k) assert I.size() == (n, k)
assert I.is_contiguous() assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr() xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr() Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr() Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr), index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr), k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr)) faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I return D, I
......
...@@ -37,9 +37,9 @@ BLOCK_SELECT_DECL(float, false, 512); ...@@ -37,9 +37,9 @@ BLOCK_SELECT_DECL(float, false, 512);
BLOCK_SELECT_DECL(float, false, 1024); BLOCK_SELECT_DECL(float, false, 1024);
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outK, Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV, Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) { bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024); FAISS_ASSERT(k <= 1024);
if (dir) { if (dir) {
...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in, ...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace } } // namespace
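For what the new runBlockSelectPair entry points compute, a standalone CPU reference may help: unlike the plain select, which reports the positions of the k best keys, the pair variant carries a caller-supplied value along with each key. Sketch only; the GPU kernel uses the warp/block heap shown below rather than a sort:

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // Sketch only: keep the k best keys per row and their associated values.
    void selectPairRef(const std::vector<float>& inK, const std::vector<int>& inV,
                       int k, bool largest,
                       std::vector<float>& outK, std::vector<int>& outV) {
      std::vector<int> order(inK.size());
      std::iota(order.begin(), order.end(), 0);
      std::partial_sort(order.begin(), order.begin() + k, order.end(),
                        [&](int a, int b) {
                          return largest ? inK[a] > inK[b] : inK[a] < inK[b];
                        });
      outK.assign(k, 0.0f);
      outV.assign(k, -1);
      for (int i = 0; i < k; ++i) {
        outK[i] = inK[order[i]];
        outV[i] = inV[order[i]];
      }
    }

    int main() {
      std::vector<float> keys = {0.3f, 0.9f, 0.1f, 0.5f};
      std::vector<int> vals = {7, 3, 11, 5};
      std::vector<float> bestK;
      std::vector<int> bestV;
      selectPairRef(keys, vals, 2, /*largest=*/false, bestK, bestV);
      std::printf("keys: %f %f  values: %d %d\n",
                  bestK[0], bestK[1], bestV[0], bestV[1]);
      return 0;
    }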
...@@ -39,9 +39,9 @@ BLOCK_SELECT_DECL(half, false, 512); ...@@ -39,9 +39,9 @@ BLOCK_SELECT_DECL(half, false, 512);
BLOCK_SELECT_DECL(half, false, 1024); BLOCK_SELECT_DECL(half, false, 1024);
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outK, Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV, Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) { bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024); FAISS_ASSERT(k <= 1024);
if (dir) { if (dir) {
...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in, ...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif #endif
} } // namespace } } // namespace
...@@ -32,7 +32,7 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -32,7 +32,7 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
__shared__ IndexType smemV[kNumWarps * NumWarpQ]; __shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>, BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k); heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available // Grid is exactly sized to rows available
...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
} }
} }
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle last remainder fraction of a warp of elements
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys, Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys, Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif #endif
} } // namespace } } // namespace
...@@ -12,37 +12,37 @@ ...@@ -12,37 +12,37 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
this->operator=(std::move(t)); this->operator=(std::move(t));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) { if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
} }
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t)); std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner; this->state_ = t.state_; t.state_ = AllocState::NotOwner;
...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { ...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation // destructor will return the reservation
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
const IndexT sizes[Dim], const IndexT sizes[Dim],
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory); reservation_ = std::move(memory);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim], const IndexT strides[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) { cudaStream_t stream) {
if (this->data_) { if (this->data_) {
// Region must be contiguous // Region must be contiguous
......
...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu { ...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor(); __host__ ~DeviceTensor();
/// Move constructor /// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); __host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment /// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it /// Constructs a tensor of the given size, allocating memory for it
/// locally /// locally
...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally /// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary /// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager. /// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m, __host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Call to zero out memory /// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream); zero(cudaStream_t stream);
private: private:
......
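The Contig template parameter becomes InnerContig throughout the tensor classes; the name presumably reflects that only the innermost dimension is required to have stride 1 (outer rows may be padded), as opposed to fully contiguous, where every stride is the product of the trailing sizes. A standalone illustration of the two layouts:

    #include <cstdio>

    // Sketch only: element (r, c) offsets for a rows x cols matrix.
    //   fully contiguous : strides {cols, 1}
    //   inner-contiguous : strides {rowPitch, 1} with rowPitch >= cols
    // An inner-contiguous tensor only guarantees stride 1 on the innermost
    // dimension, so padded rows like this remain legal.
    int main() {
      const int cols = 6, rowPitch = 8, r = 2, c = 3;
      std::printf("(%d,%d): fully contiguous offset %d, inner-contiguous offset %d\n",
                  r, c, r * cols + c, r * rowPitch + c);
      return 0;
    }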
...@@ -43,7 +43,7 @@ void synchronizeAllDevices() { ...@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
} }
} }
cudaDeviceProp& getDeviceProperties(int device) { const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex; static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties; static std::unordered_map<int, cudaDeviceProp> properties;
...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) { ...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second; return it->second;
} }
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) { int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock; return getDeviceProperties(device).maxThreadsPerBlock;
} }
......
...@@ -31,7 +31,10 @@ int getNumDevices(); ...@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices(); void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device /// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device); const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU /// Returns the maximum number of threads available for the given GPU
/// device /// device
......
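getCurrentDeviceProperties simply returns the cached entry for the active device; the properties are kept in a static map so repeated runtime queries are avoided. A standalone CUDA host sketch of the underlying query being cached:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Sketch only: the raw CUDA runtime query that the cached helpers wrap.
    int main() {
      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) {
        std::printf("no CUDA device available\n");
        return 0;
      }
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
        std::printf("failed to query device %d\n", device);
        return 1;
      }
      std::printf("device %d: %s, maxThreadsPerBlock %d\n",
                  device, prop.name, prop.maxThreadsPerBlock);
      return 0;
    }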
...@@ -10,18 +10,18 @@ ...@@ -10,18 +10,18 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() : HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_; delete[] this->data_;
...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { ...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim]) : const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) : cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) { state_(AllocState::Owner) {
// Only contiguous arrays handled for now // Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous()); FAISS_ASSERT(t.isContiguous());
...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( ...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
} }
/// Call to zero out memory /// Call to zero out memory
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous // Region must be contiguous
FAISS_ASSERT(this->isContiguous()); FAISS_ASSERT(this->isContiguous());
...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { ...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ T __host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements(); auto size = this->numElements();
FAISS_ASSERT(size == t.numElements()); FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0); FAISS_ASSERT(size > 0);
if (Contig) { if (InnerContig) {
auto a = this->data(); auto a = this->data();
auto b = t.data(); auto b = t.data();
......
...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu { ...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it /// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to /// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream. /// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Call to zero out memory /// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero(); __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors /// Returns the maximum difference seen between two tensors
__host__ T __host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const; maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal? /// Are the two tensors exactly equal?
__host__ bool __host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0); return (maxDiff(t) == (T) 0);
} }
......
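Note: the HostTensor hunks above rename the Contig template parameter to InnerContig. A hedged usage sketch of the copy constructor declared in this header, which allocates host memory and copies a fully contiguous GPU tensor back on the given stream; the include path and the caller-supplied tensor t are assumptions, not shown in this commit:

// Sketch: pull a 2-D float tensor from GPU memory to the host and read it.
// Assumes t is a contiguous faiss::gpu::Tensor<float, 2, true> on the GPU
// and stream is the CUDA stream its contents were produced on.
#include <faiss/gpu/utils/HostTensor.cuh>   // assumed include path
#include <cuda_runtime.h>

void inspectFirstElement(faiss::gpu::Tensor<float, 2, true>& t,
                         cudaStream_t stream) {
  faiss::gpu::HostTensor<float, 2, true> h(t, stream);  // allocates + copies
  cudaStreamSynchronize(stream);                        // wait for the copy
  float first = h.data()[0];                            // host-side access
  (void) first;
}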
...@@ -24,11 +24,12 @@ struct Limits { ...@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half // constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max(); constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <> template <>
struct Limits<float> { struct Limits<float> {
static __device__ __host__ inline float getMin() { static __device__ __host__ inline float getMin() {
return -kFloatMax; return kFloatMin;
} }
static __device__ __host__ inline float getMax() { static __device__ __host__ inline float getMax() {
return kFloatMax; return kFloatMax;
...@@ -55,8 +56,8 @@ struct Limits<half> { ...@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max(); constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <> template <>
struct Limits<int> { struct Limits<int> {
......
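Note: the Limits hunks above spell the float lower bound as std::numeric_limits<float>::lowest() instead of negating max(). For floating-point types the two are equal, but lowest() states the intent directly and avoids the classic min() trap: for float, min() is the smallest positive normal value, not the most negative one. A small host-only illustration (standard C++, nothing Faiss-specific):

#include <cassert>
#include <limits>

int main() {
  // float: lowest() == -max(), while min() is the smallest positive normal.
  static_assert(std::numeric_limits<float>::lowest() ==
                -std::numeric_limits<float>::max(), "float lowest == -max");
  assert(std::numeric_limits<float>::min() > 0.0f);

  // int: lowest() and min() are the same value, so either spelling works.
  static_assert(std::numeric_limits<int>::lowest() ==
                std::numeric_limits<int>::min(), "int lowest == min");
  return 0;
}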
...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK); FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN); FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in // Now, we have to represent the matrix multiplication in
// column-major layout // column-major layout
T* pA = transC ? a.data() : b.data(); T* pA = transC ? a.data() : b.data();
...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1); int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1); int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getSize(1) : a.getSize(1); int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getSize(1); int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2); int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2); int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getSize(2) : a.getSize(2); int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getSize(2); int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)}); HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)}); HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2); size_t aOffset = a.getStride(0);
size_t bOffset = b.getSize(1) * b.getSize(2); size_t bOffset = b.getStride(0);
size_t cOffset = c.getSize(1) * c.getSize(2); size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) { for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
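Note: the matrix-multiply hunks above stop assuming densely packed row-major tensors: the cuBLAS leading dimensions (lda/ldb/ldc) and the batch offsets now come from getStride() rather than getSize(), so views with padded rows multiply correctly. A reduced sketch of the same idea with plain cublasSgemm on a row-major A whose rows are padded; the sizes and helper name are illustrative, not Faiss API. Treating a row-major matrix X as the column-major matrix X^T lets one gemm call compute C^T = B^T * A^T, and the leading dimension passed for each operand is its row stride, not its logical row length:

// Sketch: C (2x2) = A (2x3, rows padded to stride 4) * B (3x2), row-major on GPU.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <vector>

void paddedGemm(cublasHandle_t handle) {
  const int m = 2, k = 3, n = 2;
  const int strideA = 4, strideB = 2, strideC = 2;   // row strides; A is padded

  std::vector<float> hA = {1, 2, 3, 0,               // row 0 + one padding slot
                           4, 5, 6, 0};              // row 1 + one padding slot
  std::vector<float> hB = {1, 0,
                           0, 1,
                           1, 1};
  float *dA, *dB, *dC;
  cudaMalloc(&dA, hA.size() * sizeof(float));
  cudaMalloc(&dB, hB.size() * sizeof(float));
  cudaMalloc(&dC, m * strideC * sizeof(float));
  cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);

  const float alpha = 1.0f, beta = 0.0f;
  // Column-major view: C^T (n x m) = B^T (n x k) * A^T (k x m).
  cublasStatus_t st = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                  n, m, k,
                                  &alpha,
                                  dB, strideB,       // ld = row stride of B
                                  dA, strideA,       // ld = row stride of A (padded)
                                  &beta,
                                  dC, strideC);      // ld = row stride of C
  assert(st == CUBLAS_STATUS_SUCCESS);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
}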
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int> template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor { class NoTypeTensor {
public: public:
NoTypeTensor() NoTypeTensor()
...@@ -25,7 +25,7 @@ class NoTypeTensor { ...@@ -25,7 +25,7 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t) NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()), : mem_(t.data()),
typeSize_(sizeof(T)) { typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
...@@ -87,13 +87,14 @@ class NoTypeTensor { ...@@ -87,13 +87,14 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() { Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_); FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_); return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
} }
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) { NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_; char* newPtr = (char*) mem_;
if (start > 0) { if (start > 0) {
...@@ -110,7 +111,7 @@ class NoTypeTensor { ...@@ -110,7 +111,7 @@ class NoTypeTensor {
} }
} }
return NoTypeTensor<Dim, Contig, IndexT>( return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_); newPtr, typeSize_, newSize, stride_);
} }
......
...@@ -19,26 +19,26 @@ ...@@ -19,26 +19,26 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T> template <typename T, typename IndexT>
struct TensorInfo { struct TensorInfo {
static constexpr int kMaxDims = 8; static constexpr int kMaxDims = 8;
T* data; T* data;
int sizes[kMaxDims]; IndexT sizes[kMaxDims];
int strides[kMaxDims]; IndexT strides[kMaxDims];
int dims; int dims;
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset { struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
unsigned int offset = 0; IndexT offset = 0;
#pragma unroll #pragma unroll
for (int i = Dim - 1; i >= 0; --i) { for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i]; IndexT curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i]; IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset; offset += curDimOffset;
...@@ -51,21 +51,21 @@ struct TensorInfoOffset { ...@@ -51,21 +51,21 @@ struct TensorInfoOffset {
} }
}; };
template <typename T> template <typename T, typename IndexT>
struct TensorInfoOffset<T, -1> { struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info, __device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
unsigned int linearId) { IndexT linearId) {
return linearId; return linearId;
} }
}; };
template <typename T, int Dim> template <typename T, typename IndexT, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info; TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i); info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = t.getStride(i); info.strides[i] = (IndexT) t.getStride(i);
} }
info.data = t.data(); info.data = t.data();
...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) { ...@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info; return info;
} }
template <typename T, int DimInput, int DimOutput> template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input, __global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T> output, TensorInfo<T, IndexT> output,
unsigned int totalSize) { IndexT totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
if (linearThreadId >= totalSize) { i += gridDim.x + blockDim.x) {
return; auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
} auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]); output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else #else
output.data[outputOffset] = input.data[inputOffset]; output.data[outputOffset] = input.data[inputOffset];
#endif #endif
}
} }
/// Performs an out-of-place transposition between any two dimensions. /// Performs an out-of-place transposition between any two dimensions.
...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2, int dim1, int dim2,
Tensor<T, Dim, true>& out, Tensor<T, Dim, true>& out,
cudaStream_t stream) { cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions"); static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2); FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim); FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in, ...@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]); FAISS_ASSERT(out.getSize(i) == outSize[i]);
} }
auto inInfo = getTensorInfo<T, Dim>(in); size_t totalSize = in.numElements();
auto outInfo = getTensorInfo<T, Dim>(out); size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
int totalSize = in.numElements(); transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize); std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
auto grid = dim3(utils::divUp(totalSize, numThreads)); std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
auto block = dim3(numThreads);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>( auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
inInfo, outInfo, totalSize);
transposeAny<T, unsigned long, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR(); CUDA_TEST_ERROR();
} }
......
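Note: transposeAny is rewritten above from one element per thread (with an early return) into a grid-stride loop, so runTransposeAny can cap the grid at 4096 blocks and still cover arbitrarily large tensors, dispatching on 32- vs 64-bit index math by element count. For reference, a minimal stand-alone grid-stride copy kernel written the canonical way; note the per-iteration stride is the total thread count, gridDim.x * blockDim.x:

// Sketch: canonical grid-stride loop. Each thread starts at its global id and
// advances by the total number of launched threads until the array is covered.
#include <cuda_runtime.h>
#include <cstddef>

__global__ void copyKernel(const float* in, float* out, size_t n) {
  size_t stride = (size_t) gridDim.x * blockDim.x;       // total launched threads
  for (size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
       i < n;
       i += stride) {
    out[i] = in[i];
  }
}

// Launch sketch: cap the grid so huge inputs do not require huge grids,
// mirroring the 4096-block cap used in runTransposeAny above.
void runCopy(const float* in, float* out, size_t n, cudaStream_t stream) {
  int block = 256;
  size_t grid = (n + block - 1) / block;
  if (grid > 4096) grid = 4096;
  copyKernel<<<(unsigned) grid, block, 0, stream>>>(in, out, n);
}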
...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { ...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
} }
static void write_ivf_header (const IndexIVF * ivf, FILE *f, static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) { bool include_ids = true) {
write_index_header (ivf, f); write_index_header (ivf, f);
...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) { ...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained); READVECTOR (ivsc->trained);
} }
ProductQuantizer * read_ProductQuantizer (const char*fname) { ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r"); FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname); FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) { ...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
} }
idx = idxmap; idx = idxmap;
} else { } else {
fprintf (stderr, "Index type 0x%08x not supported\n", h); FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
abort (); idx = nullptr;
} }
return idx; return idx;
} }
......
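Note: the last hunk above makes read_index throw (via FAISS_THROW_FMT) on an unsupported index header instead of printing to stderr and calling abort(), so callers can recover. A hedged sketch of catching it from C++; faiss::FaissException derives from std::exception, but the include paths and the read_index(const char*) overload used here are assumptions about the surrounding library layout:

// Sketch: load an index file and survive an unsupported or garbled header.
#include <faiss/Index.h>           // assumed include path
#include <faiss/index_io.h>        // assumed include path for read_index
#include <faiss/FaissException.h>  // assumed include path for FaissException
#include <cstdio>
#include <memory>

std::unique_ptr<faiss::Index> tryReadIndex(const char* fname) {
  try {
    return std::unique_ptr<faiss::Index>(faiss::read_index(fname));
  } catch (const faiss::FaissException& e) {
    std::fprintf(stderr, "could not load %s: %s\n", fname, e.what());
    return nullptr;                // caller decides how to proceed
  }
}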
...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase): ...@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else: else:
assert False, 'should have raised an exception' assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
class TestUpdate(unittest.TestCase): class TestUpdate(unittest.TestCase):
......