Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bugfixes from GitHub issues
kmeans with some frozen centroids
better GPU tiling for large flat datasets
default AVX for vector ops
parent 71335194
@@ -345,6 +345,7 @@ void ParameterSpace::initialize (const Index * index)
    }
    if (DC (IndexIVF)) {
+       {
        ParameterRange & pr = add_range("nprobe");
        for (int i = 0; i < 13; i++) {
            size_t nprobe = 1 << i;
@@ -352,6 +353,7 @@ void ParameterSpace::initialize (const Index * index)
            pr.values.push_back (nprobe);
        }
    }
+   }
    if (DC (IndexPQ)) {
        ParameterRange & pr = add_range("ht");
        init_pq_ParameterRange (ix->pq, pr);
@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
        }
    }
    if (DC (IndexIVFPQR)) {
-       assert (ix);
        ParameterRange & pr = add_range("k_factor");
        for (int i = 0; i <= 6; i++) {
            pr.values.push_back (1 << i);
@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
    if (name == "verbose") {
        index->verbose = int(val);
+       // and fall through to also enable it on sub-indexes
    }
    if (DC (IndexPreTransform)) {
        index = ix->index;
    }
+   if (DC (IndexShards)) {
+       // call on all sub-indexes
+       for (auto & shard_index : ix->shard_indexes) {
+           set_index_parameter (shard_index, name, val);
+       }
+       return;
+   }
    if (name == "verbose") {
        index->verbose = int(val);
+       // in case it was an IndexPreTransform
    }
    if (DC (IndexRefineFlat)) {
        if (name == "k_factor_rf") {
@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
        return; // last verbose that we could find
    }
    if (name == "nprobe") {
-       DC(IndexIVF);
-       ix->nprobe = int(val);
-   } else if (name == "ht") {
+       if ( DC(IndexIVF)) {
+           ix->nprobe = int(val);
+           return;
+       }
+   }
+   if (name == "ht") {
        if (DC (IndexPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->search_type = IndexPQ::ST_PQ;
@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
                ix->search_type = IndexPQ::ST_polysemous;
                ix->polysemous_ht = int(val);
            }
+           return;
        } else if (DC (IndexIVFPQ)) {
            if (val >= ix->pq.code_size * 8) {
                ix->polysemous_ht = 0;
            } else {
                ix->polysemous_ht = int(val);
            }
+           return;
+       }
    }
-   } else if (name == "k_factor") {
-       DC (IndexIVFPQR);
-       ix->k_factor = val;
-   } else if (name == "max_codes") {
-       DC (IndexIVFPQ);
-       ix->max_codes = finite(val) ? size_t(val) : 0;
-   } else {
-       FAISS_THROW_FMT (
-           "ParameterSpace::set_index_parameter:"
-           "could not set parameter %s",
-           name.c_str());
-   }
+   if (name == "k_factor") {
+       if (DC (IndexIVFPQR)) {
+           ix->k_factor = val;
+           return;
+       }
+   }
+   if (name == "max_codes") {
+       if (DC (IndexIVFPQ)) {
+           ix->max_codes = finite(val) ? size_t(val) : 0;
+           return;
+       }
+   }
+   FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
+                    "could not set parameter %s",
+                    name.c_str());
}
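For context, a minimal Python sketch of how these parameters are typically driven through ParameterSpace; the factory string, array sizes and parameter values are illustrative, not part of this commit:

import numpy as np
import faiss

d = 64
xb = np.random.rand(10000, d).astype('float32')

index = faiss.index_factory(d, "IVF256,PQ16")
index.train(xb)
index.add(xb)

ps = faiss.ParameterSpace()
ps.initialize(index)                        # enumerates nprobe, ht, ... for this index type
ps.set_index_parameter(index, "nprobe", 16) # handled by the IndexIVF branch above
ps.set_index_parameter(index, "ht", 64)     # polysemous Hamming threshold of the PQ codes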
void ParameterSpace::display () const
@@ -634,6 +654,15 @@ struct VTChain {
    }
};

+/// what kind of training does this coarse quantizer require?
+char get_trains_alone(const Index *coarse_quantizer) {
+    return
+        dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
+        0;
+}
+
}
Index *index_factory (int d, const char *description_in, MetricType metric)
@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
         tok;
         tok = strtok_r (nullptr, " ,", &ptr)) {
        int d_out, opq_M, nbit, M, M2;
+       char option[100];
        std::string stok(tok);

        // to avoid mem leaks with exceptions:
@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        } else if (stok == "L2norm") {
            vt_1 = new NormalizationTransform (d, 2.0);

            // coarse quantizers
        } else if (!coarse_quantizer &&
                   sscanf (tok, "IVF%d", &ncentroids) == 1) {
            if (metric == METRIC_L2) {
@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVF *index_ivf = new IndexIVFFlat (
                coarse_quantizer, d, ncentroids, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                new IndexIVFScalarQuantizer (
                    coarse_quantizer, d, ncentroids, qt, metric);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
            IndexIVFPQR *index_ivf = new IndexIVFPQR (
                coarse_quantizer, d, ncentroids, M, 8, M2, 8);
            index_ivf->quantizer_trains_alone =
-               dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-               != nullptr;
+               get_trains_alone (coarse_quantizer);
            del_coarse_quantizer.release ();
            index_ivf->own_fields = true;
            index_1 = index_ivf;
-       } else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
+       } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
+           std::string soption = option;
+           // np to disable polysemous training
+           FAISS_THROW_IF_NOT(soption == "" || soption == "np");
            if (coarse_quantizer) {
                IndexIVFPQ *index_ivf = new IndexIVFPQ (
                    coarse_quantizer, d, ncentroids, M, 8);
                index_ivf->quantizer_trains_alone =
-                   dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-                   != nullptr;
+                   get_trains_alone (coarse_quantizer);
                index_ivf->metric_type = metric;
                index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
                del_coarse_quantizer.release ();
                index_ivf->own_fields = true;
-               index_ivf->do_polysemous_training = true;
+               index_ivf->do_polysemous_training = soption != "np";
                index_1 = index_ivf;
            } else {
                IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
-               index_pq->do_polysemous_training = true;
+               index_pq->do_polysemous_training = soption != "np";
                index_1 = index_pq;
            }
        } else if (stok == "RFlat") {
            make_IndexRefineFlat = true;
        } else {
......
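A quick sketch of the factory strings this function parses, including the new "np" suffix that disables polysemous training; the dimension and the exact strings below are illustrative:

import faiss

d = 128
# IVF with a flat coarse quantizer and 32-byte PQ codes;
# polysemous training is on by default for the PQ codes
index1 = faiss.index_factory(d, "IVF4096,PQ32")

# appending "np" to the PQ token turns polysemous training off
index2 = faiss.index_factory(d, "PQ32np")

# OPQ pre-transform followed by an IVFPQ index
index3 = faiss.index_factory(d, "OPQ32,IVF4096,PQ32")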
@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
 * minimizing the number of mem copies in the result. The method
 * do_allocation can be overloaded to allocate the result tables in
- * the matrix type of a srcipting language like Lua or Python. */
+ * the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
    size_t nq;    ///< nb of queries
    size_t *lims; ///< size (nq + 1)
......
@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
    nredo(1),
    verbose(false), spherical(false),
    update_index(false),
+   frozen_centroids(false),
    min_points_per_centroid(39),
    max_points_per_centroid(256),
    seed(1234)
@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    float * dis = new float[nx];
    ScopeDeleter<float> del2(dis);

+   // for redo
    float best_err = 1e50;
+   std::vector<float> best_obj;
+   std::vector<float> best_centroids;
+
+   // support input centroids
+   FAISS_THROW_IF_NOT_MSG (
+       centroids.size() % d == 0,
+       "size of provided input centroids not a multiple of dimension");
+   size_t n_input_centroids = centroids.size() / d;
+
+   if (verbose && n_input_centroids > 0) {
+       printf ("  Using %zd centroids provided as input (%sfrozen)\n",
+               n_input_centroids, frozen_centroids ? "" : "not ");
+   }
+
    double t_search_tot = 0;
    if (verbose) {
        printf("  Preprocessing in %.2f s\n",
@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    for (int redo = 0; redo < nredo; redo++) {

-       std::vector<float> buf_centroids;
-       std::vector<float> &cur_centroids =
-           nredo == 1 ? centroids : buf_centroids;
-
        if (verbose && nredo > 1) {
            printf("Outer iteration %d / %d\n", redo, nredo);
        }

-       if (cur_centroids.size() == 0) {
-           // initialize centroids with random points from the dataset
-           cur_centroids.resize (d * k);
-           std::vector<int> perm (nx);
-           rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
-#pragma omp parallel for
-           for (int i = 0; i < k ; i++)
-               memcpy (&cur_centroids[i * d], x + perm[i] * d,
-                       d * sizeof (float));
-       } else { // assume user provides some meaningful initialization
-           FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
-           FAISS_THROW_IF_NOT_MSG (nredo == 1,
-                                   "will redo with same initialization");
-       }
+       // initialize remaining centroids with random points from the dataset
+       centroids.resize (d * k);
+       std::vector<int> perm (nx);
+       rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
+       for (int i = n_input_centroids; i < k ; i++)
+           memcpy (&centroids[i * d], x + perm[i] * d,
+                   d * sizeof (float));

        if (spherical)
-           fvec_renorm_L2 (d, k, cur_centroids.data());
+           fvec_renorm_L2 (d, k, centroids.data());

        if (!index.is_trained)
-           index.train (k, cur_centroids.data());
+           index.train (k, centroids.data());

        FAISS_THROW_IF_NOT (index.ntotal == 0);
-       index.add (k, cur_centroids.data());
+       index.add (k, centroids.data());
        float err = 0;
        for (int i = 0; i < niter; i++) {
            double t0s = getmillisecs();
@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                err += dis[j];
            obj.push_back (err);

-           int nsplit = km_update_centroids (x, cur_centroids.data(),
-                                             assign, d, k, nx);
+           int nsplit = km_update_centroids (
+                 x, centroids.data(),
+                 assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
            if (verbose) {
                printf ("  Iteration %d (%.2f s, search %.2f s): "
@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
            }

            if (spherical)
-               fvec_renorm_L2 (d, k, cur_centroids.data());
+               fvec_renorm_L2 (d, k, centroids.data());

            index.reset ();
            if (update_index)
-               index.train (k, cur_centroids.data());
+               index.train (k, centroids.data());

            assert (index.ntotal == 0);
-           index.add (k, cur_centroids.data());
+           index.add (k, centroids.data());
        }
        if (verbose) printf("\n");
        if (nredo > 1) {
            if (err < best_err) {
                if (verbose)
                    printf ("Objective improved: keep new clusters\n");
-               centroids = buf_centroids;
+               best_centroids = centroids;
+               best_obj = obj;
                best_err = err;
            }
            index.reset ();
        }
    }
+   if (nredo > 1) {
+       centroids = best_centroids;
+       obj = best_obj;
+   }
}
......
@@ -28,6 +28,7 @@ struct ClusteringParameters {
    bool verbose;
    bool spherical;        ///< do we want normalized centroids?
    bool update_index;     ///< update index after each iteration?
+   bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations

    int min_points_per_centroid; ///< otherwise you get a warning
    int max_points_per_centroid; ///< to limit size of dataset
......
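To illustrate the new option, a hedged Python sketch of k-means where the first centroids are provided as input and kept frozen; the array names and sizes are made up for the example:

import numpy as np
import faiss

d, k, nt = 64, 100, 20000
rs = np.random.RandomState(123)
xt = rs.rand(nt, d).astype('float32')
fixed = xt[:10].copy()                     # 10 centroids we want to keep as-is

clus = faiss.Clustering(d, k)
clus.verbose = True
clus.frozen_centroids = True               # new flag: do not move the input centroids
faiss.copy_array_to_vector(fixed.ravel(), clus.centroids)  # pre-seed input centroids

assigner = faiss.IndexFlatL2(d)
clus.train(nt, faiss.swig_ptr(xt), assigner)
centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)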
@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
-  FAISS_THROW_MSG ("Can not compute reconstruct without "
-                   "knowing how to do so");
+  FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
    nlist (nlist),
    nprobe (1),
    quantizer (quantizer),
-   quantizer_trains_alone (false),
+   quantizer_trains_alone (0),
    own_fields (false),
+   clustering_index (nullptr),
    ids (nlist),
    maintain_direct_map (false)
{
@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
    nlist (0), nprobe (1), quantizer (nullptr),
-   quantizer_trains_alone (false), own_fields (false),
+   quantizer_trains_alone (0), own_fields (false),
+   clustering_index (nullptr),
    maintain_direct_map (false)
{}
@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
    if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
        if (verbose)
            printf ("IVF quantizer does not need training.\n");
-   } else if (quantizer_trains_alone) {
+   } else if (quantizer_trains_alone == 1) {
        if (verbose)
            printf ("IVF quantizer trains alone...\n");
        quantizer->train (n, x);
+       quantizer->verbose = verbose;
        FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
                                "nlist not consistent with quantizer size");
-   } else {
+   } else if (quantizer_trains_alone == 0) {
        if (verbose)
            printf ("Training IVF quantizer on %ld vectors in %dD\n",
                    n, d);

        Clustering clus (d, nlist, cp);
        quantizer->reset();
+       if (clustering_index) {
+           clus.train (n, x, *clustering_index);
+           quantizer->add (nlist, clus.centroids.data());
+       } else {
            clus.train (n, x, *quantizer);
+       }
        quantizer->is_trained = true;
+   } else if (quantizer_trains_alone == 2) {
+       if (verbose)
+           printf (
+               "Training L2 quantizer on %ld vectors in %dD%s\n",
+               n, d,
+               clustering_index ? "(user provided index)" : "");
+       FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
+       Clustering clus (d, nlist, cp);
+       if (!clustering_index) {
+           IndexFlatL2 assigner (d);
+           clus.train(n, x, assigner);
+       } else {
+           clus.train(n, x, *clustering_index);
+       }
+       if (verbose)
+           printf ("Adding centroids to quantizer\n");
+       quantizer->add (nlist, clus.centroids.data());
    }
    if (verbose)
        printf ("Training IVF residual\n");
@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
    FAISS_THROW_IF_NOT (nlist == other.nlist);
    FAISS_THROW_IF_NOT (!other.maintain_direct_map);
-   FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
-                           "this subset type is not implemented");
+   FAISS_THROW_IF_NOT_FMT (
+       subset_type == 0 || subset_type == 1 || subset_type == 2,
+       "subset type %d not implemented", subset_type);

    size_t accu_n = 0;
    size_t accu_a1 = 0;
@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                    other.ntotal++;
                }
            }
+       } else if (subset_type == 1) {
+           for (long i = 0; i < n; i++) {
+               idx_t id = ids_in[i];
+               if (id % a1 == a2) {
+                   ids_out.push_back (id);
+                   codes_out.insert (codes_out.end(),
+                                     codes_in.begin() + i * code_size,
+                                     codes_in.begin() + (i + 1) * code_size);
+                   other.ntotal++;
+               }
+           }
        } else if (subset_type == 2) {
            // see what is allocated to a1 and to a2
            size_t next_accu_n = accu_n + n;
            size_t next_accu_a1 = next_accu_n * a1 / ntotal;
            size_t i1 = next_accu_a1 - accu_a1;
-           accu_a1 = next_accu_a1;
            size_t next_accu_a2 = next_accu_n * a2 / ntotal;
            size_t i2 = next_accu_a2 - accu_a2;
-           accu_a2 = next_accu_a2;
            ids_out.insert(ids_out.end(),
                           ids_in.begin() + i1,
                           ids_in.begin() + i2);
@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                             codes_in.begin() + i1 * code_size,
                             codes_in.begin() + i2 * code_size);
            other.ntotal += i2 - i1;
+           accu_a1 = next_accu_a1;
+           accu_a2 = next_accu_a2;
        }
        accu_n += n;
    }
......
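The three subset types can be summarized as: 0 keeps ids in [a1, a2), 1 (added here) keeps ids with id % a1 == a2, and 2 takes a proportional slice of each inverted list. A hedged Python sketch of the new modulo split; the sizes and the choice of IndexIVFFlat are illustrative:

import numpy as np
import faiss

d = 16
xb = np.random.rand(20000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
src = faiss.IndexIVFFlat(quantizer, d, 64)
src.train(xb)
src.add(xb)                          # sequential ids 0 .. ntotal-1

nshard = 4
shards = []
for a2 in range(nshard):
    dst = faiss.IndexIVFFlat(quantizer, d, 64)
    dst.is_trained = True            # shares the already-trained quantizer
    src.copy_subset_to(dst, 1, nshard, a2)   # subset_type 1: id % nshard == a2
    shards.append(dst)

print([s.ntotal for s in shards])    # roughly 5000 vectors each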
@@ -47,10 +47,17 @@ struct IndexIVF: Index {
    size_t nprobe;            ///< number of probes at query time

    Index * quantizer;        ///< quantizer that maps vectors to inverted lists
-   bool quantizer_trains_alone;   ///< just pass over the trainset to quantizer
+   /**
+    * = 0: use the quantizer as index in a kmeans training
+    * = 1: just pass on the training set to the train() of the quantizer
+    * = 2: kmeans training on a flat index + add the centroids to the quantizer
+    */
+   char quantizer_trains_alone;
    bool own_fields;          ///< whether object owns the quantizer

    ClusteringParameters cp;  ///< to override default clustering params
+   Index *clustering_index;  ///< to override index used during clustering

    std::vector < std::vector<long> > ids;  ///< Inverted lists for indexes
......
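For reference, a hedged sketch of how the new clustering_index field can be used to run the coarse k-means through a substitute index (for example a GPU flat index in a GPU build) while the IVF keeps its own quantizer; the data and sizes are illustrative:

import numpy as np
import faiss

d = 64
xt = np.random.rand(100000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 1024)

# route the k-means assignment through this index instead of the quantizer;
# with a GPU build this could be e.g. faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
clustering_substitute = faiss.IndexFlatL2(d)
index.clustering_index = clustering_substitute

index.train(xt)   # quantizer_trains_alone == 0: clustering goes through clustering_index
index.add(xt)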
@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
                for (int j = 0; j < d; j++) {
                    r[j] += centroid[j];
                }
-           }
-           else {
+           } else {
                pq.decode (code_line + ofs * pq.code_size, r);
            }
        }
@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
    FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
    int list_no = direct_map[key] >> 32;
    int ofs = direct_map[key] & 0xffffffff;
@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ ()
......
@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
                            float *distances, idx_t *labels,
                            bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
    /// build precomputed table
    void precompute_table ();
......
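The Python wrapper added further down in this commit exposes this as a numpy-friendly call; a small hedged example (sizes are illustrative):

import numpy as np
import faiss

d, nb, nq, k = 32, 10000, 5, 4
rs = np.random.RandomState(0)
xb = rs.rand(nb, d).astype('float32')
xq = rs.rand(nq, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, 64, 8, 8)
index.train(xb)
index.add(xb)
index.nprobe = 8

# distances, labels and the PQ-decoded approximations of the result vectors
D, I, R = index.search_and_reconstruct(xq, k)
print(D.shape, I.shape, R.shape)   # (5, 4), (5, 4), (5, 4, 32)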
@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/
@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
-   delete squant;
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
+   ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
-   delete squant;
}

/*******************************************************************
@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           minheap_reorder (k, simi, idxi);
        }
    } else {
#pragma omp parallel for
@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
                }
                ci += code_size;
            }
+           maxheap_reorder (k, simi, idxi);
        }
    }
@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
    int nt = omp_get_num_threads();
    int rank = omp_get_thread_num();

+   // each thread takes care of a subset of lists
    for (size_t i = 0; i < n; i++) {
        long list_no = idx [i];
@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
    ntotal += nadd;
}

+namespace {

void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
        maxheap_reorder (k, simi, idxi);
    }
}

+} // anonymous namespace

void IndexIVFScalarQuantizer::search_preassigned (
    idx_t n, const float *x, idx_t k,
    const idx_t *idx,
......
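With the heap reordering added above, results from the scalar-quantizer indexes come back sorted by distance; a hedged usage sketch (sizes illustrative):

import numpy as np
import faiss

d = 64
xb = np.random.rand(10000, d).astype('float32')
xq = np.random.rand(10, d).astype('float32')

index = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)
index.train(xb)        # learns per-dimension ranges for the 8-bit codec
index.add(xb)
D, I = index.search(xq, 5)
assert (np.diff(D, axis=1) >= 0).all()   # distances are sorted within each row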
@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
	cp python/_swigfaiss.so python/swigfaiss.py .

#############################
-# Dependencies
-# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
+# Dependencies.
+# make dep > x
+# then copy/paste from x by hand below
+
+dep:
+	for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
+	   cpp -MM -std=gnu++0x $$i ; \
+	done

hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
 FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
 FaissAssert.h FaissException.h AuxIndexStructures.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
 hamming.h FaissAssert.h FaissException.h IndexFlat.h \
 AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
 Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
 Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
 Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
 IndexFlat.h hamming.h FaissAssert.h FaissException.h \
 AuxIndexStructures.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
 FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
 Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
 Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
 IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
 ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
 IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
 ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
 FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
 FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
 Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
 IndexFlat.h utils.h
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
 FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
 IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
 IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
 IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
 FaissException.h
FaissException.o: FaissException.cpp FaissException.h

clean:
......
@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
    }
}
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace {

struct IDTranslatedSelector: IDSelector {
@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
    }
    FAISS_ASSERT (j == index->ntotal);
    ntotal = j;
+   id_map.resize(ntotal);
    return nremove;
}
......
@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
    /// remove ids adapted to IndexFlat
    long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
    ~IndexIDMap() override;
    IndexIDMap () {own_fields=false; index=nullptr; }
};
......
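A small hedged example of the new IndexIDMap::range_search through the Python wrapper; the radius and sizes are illustrative:

import numpy as np
import faiss

d = 8
xb = np.random.rand(1000, d).astype('float32')
ids = np.arange(1000).astype('int64') + 100000   # arbitrary user-provided ids

sub = faiss.IndexFlatL2(d)
index = faiss.IndexIDMap(sub)
index.add_with_ids(xb, ids)

lims, D, I = index.range_search(xb[:5], 0.7)     # labels come back translated to user ids
for q in range(5):
    print(q, I[lims[q]:lims[q + 1]])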
@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
    const float *prev_x = x;
    ScopeDeleter<float> del;

+   if (verbose) {
+       printf("IndexPreTransform::train: training chain 0 to %d\n",
+              last_untrained);
+   }
+
    for (int i = 0; i <= last_untrained; i++) {
        if (i < chain.size()) {
            VectorTransform *ltrans = chain [i];
-           if (!ltrans->is_trained)
-               ltrans->train(n, prev_x);
+           if (!ltrans->is_trained) {
+               if (verbose) {
+                   printf("   Training chain component %d/%zd\n",
+                          i, chain.size());
+                   if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
+                       opqm->verbose = true;
+                   }
+               }
+               ltrans->train (n, prev_x);
+           }
        } else {
+           if (verbose) {
+               printf("   Training sub-index\n");
+           }
            index->train (n, prev_x);
        }
        if (i == last_untrained) break;
+       if (verbose) {
+           printf("   Applying transform %d/%zd\n",
+                  i, chain.size());
+       }

        float * xt = chain[i]->apply (n, prev_x);
-       if (prev_x != x) delete prev_x;
+       if (prev_x != x) delete [] prev_x;
        prev_x = xt;
        del.set(xt);
    }
......
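With the verbose plumbing above, training progress of each chain component can be followed from Python; a hedged sketch (factory string and sizes illustrative):

import numpy as np
import faiss

d = 64
xt = np.random.rand(20000, d).astype('float32')

index = faiss.index_factory(d, "OPQ16,IVF256,PQ16")
index.verbose = True     # IndexPreTransform::train now reports each chain component
index.train(xt)          # the OPQ matrix also switches on its own verbose flag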
@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
    co.verbose = True
    co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
    co.shard = True
+   assert co.shard_type in (0, 1, 2)
    vres, vdev = make_vres_vdev()
    gpu_index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall, co)
......
@@ -121,6 +121,18 @@ def handle_Index(the_class):
                      swig_ptr(labels))
        return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
    def replacement_remove_ids(self, x):
        if isinstance(x, IDSelector):
            sel = x
@@ -167,6 +179,8 @@ def handle_Index(the_class):
    replace_method(the_class, 'range_search', replacement_range_search)
    replace_method(the_class, 'update_vectors', replacement_update_vectors,
                   ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class):
@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
    return index_cpu_to_gpu_multiple(vres, vdev, index, co)

-def vector_float_to_array(v):
-    a = np.empty(v.size(), dtype='float32')
-    memcpy(swig_ptr(a), v.data(), 4 * v.size())
+def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
+    if ngpu == -1:
+        ngpu = get_num_gpus()
+    res = [StandardGpuResources() for i in range(ngpu)]
+    index2 = index_cpu_to_gpu_multiple_py(res, index, co)
+    index2.dont_dealloc = res
+    return index2
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
    return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
class Kmeans:

    def __init__(self, d, k, niter=25, verbose=False, spherical = False):
@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):

def normalize_L2(x):
    fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
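For reference, a hedged sketch of the new array/vector helpers and the all-GPUs clone shortcut; the GPU call is commented out since it needs a GPU build:

import numpy as np
import faiss

d = 32
xb = np.random.rand(1000, d).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)

# any swig std::vector proxy can now be converted, not only FloatVector
stored = faiss.vector_to_array(index.xb).reshape(-1, d)
assert np.allclose(stored, xb)

# and numpy arrays can be copied into a vector proxy
v = faiss.FloatVector()
faiss.copy_array_to_vector(xb[0], v)
assert v.size() == d

# one-call clone to all visible GPUs (requires the GPU build)
# gpu_index = faiss.index_cpu_to_all_gpus(index)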
@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuAutoTune.h"
+#include <typeinfo>
#include "GpuIndex.h"
#include "../FaissAssert.h"
@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)

-GpuClonerOptions::GpuClonerOptions():
-    indicesOptions(INDICES_64_BIT),
-    useFloat16CoarseQuantizer(false),
-    useFloat16(false),
-    usePrecomputed(true),
-    reserveVecs(0),
-    storeTransposed(false),
-    verbose(0)
-{}

struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
    GpuResources *resources;
    int device;
@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
    return cl.clone_Index(index);
}

-GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
-{}

struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    std::vector<ToGpuCloner> sub_cloners;
@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
    {}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
    Index *clone_Index(const Index *index) override {
        long n = sub_cloners.size();
        if (n == 1)
@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                dynamic_cast<const faiss::IndexIVFPQ *>(index);
            auto index_ivfflat =
                dynamic_cast<const faiss::IndexIVFFlat *>(index);
-           FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
+           FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
                              "IndexShards implemented only for "
                              "IndexIVFFlat or IndexIVFPQ");
            std::vector<faiss::Index*> shards(n);

            for(long i = 0; i < n; i++) {
                // make a shallow copy
-               long i0 = i * index->ntotal / n;
-               long i1 = (i + 1) * index->ntotal / n;
-               if(verbose)
-                   printf("IndexShards shard %ld indices %ld:%ld\n",
-                          i, i0, i1);
                if(reserveVecs)
                    sub_cloners[i].reserveVecs =
                        (reserveVecs + n - 1) / n;
@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
                    idx2.nprobe = index_ivfpq->nprobe;
                    idx2.use_precomputed_table = 0;
                    idx2.is_trained = index->is_trained;
-                   index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
-                   FAISS_ASSERT(idx2.ntotal == i1 - i0);
+                   copy_ivf_shard (index_ivfpq, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                } else if (index_ivfflat) {
                    faiss::IndexIVFFlat idx2(
                        index_ivfflat->quantizer, index->d,
                        index_ivfflat->nlist, index_ivfflat->metric_type);
                    idx2.nprobe = index_ivfflat->nprobe;
-                   index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
                    idx2.nprobe = index_ivfflat->nprobe;
+                   copy_ivf_shard (index_ivfflat, &idx2, n, i);
                    shards[i] = sub_cloners[i].clone_Index(&idx2);
                }
            }
            faiss::IndexShards *res =
                new faiss::IndexShards(index->d, true, false);
@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
        Index * index, const std::string & name, double val) const
{
-   if (DC (IndexPreTransform)) {
-       index = ix->index;
-   }
    if (DC (IndexProxy)) {
        for (int i = 0; i < ix->count(); i++)
            set_index_parameter (ix->at(i), name, val);
        return;
    }
-   if (DC (faiss::IndexShards)) {
-       for (auto sub_index : ix->shard_indexes)
-           set_index_parameter (sub_index, name, val);
-       return;
-   }
+   if (DC (GpuIndexIVF)) {
        if (name == "nprobe") {
-           DC (GpuIndexIVF);
-           FAISS_ASSERT(ix);
            ix->setNumProbes (int (val));
            return;
        }
+   }
+   if(DC (GpuIndexIVFPQ)) {
        if (name == "use_precomputed_table") {
-           DC (GpuIndexIVFPQ);
-           FAISS_ASSERT(ix);
            ix->setPrecomputedCodes(bool (val));
            return;
        }
+   }

-   FAISS_ASSERT_MSG (false, "unknown parameter");
+   // maybe normal index parameters apply?
+   ParameterSpace::set_index_parameter (index, name, val);
}
......
@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}

GpuMultipleClonerOptions::GpuMultipleClonerOptions()
-   : shard(false) {
+   : shard(false),
+     shard_type(1)
+{
}

} } // namespace
@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
  /// Whether to shard the index across GPUs, versus replication
  /// across GPUs
  bool shard;

+ /// IndexIVF::copy_subset_to subset type
+ int shard_type;
};

} } // namespace
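A hedged sketch of how the new shard_type is meant to be used from Python when spreading an IVF index over several GPUs; this requires the GPU build and the sizes are illustrative:

import numpy as np
import faiss

d = 64
xb = np.random.rand(100000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
cpu_index = faiss.IndexIVFFlat(quantizer, d, 1024)
cpu_index.train(xb)
cpu_index.add(xb)

co = faiss.GpuMultipleClonerOptions()
co.shard = True       # split the database over the GPUs instead of replicating it
co.shard_type = 1     # copy_subset_to subset type: 1 = split by id modulo ngpu
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co)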
@@ -26,7 +26,7 @@ struct GpuIndexConfig {
  /// GPU device on which the index is resident
  int device;

- /// What memory space to use for primary storae.
+ /// What memory space to use for primary storage.
  /// On Pascal and above (CC 6+) architectures, allows GPUs to use
  /// more memory than is available on the GPU.
  MemorySpace memorySpace;
......
@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
  }

  index->quantizer = q;
- index->quantizer_trains_alone = false;
+ index->quantizer_trains_alone = 0;
  index->own_fields = true;
  index->cp = this->cp;
  index->ids.clear();
......
@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
  FAISS_ASSERT(index->pq.byte_per_idx == 1);
  FAISS_ASSERT(index->by_residual);
  FAISS_ASSERT(index->polysemous_ht == 0);
- ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;

  verifySettings_();
......
This diff is collapsed.
This diff is collapsed.
@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
                   Tensor<int, 2, true>& outIndices,
                   // Do we care about `outDistances`? If not, we can
                   // take shortcuts.
-                  bool ignoreOutDistances = false,
-                  // Hint to use a different sized tile for
-                  // multi-streaming the queries. If <= 0, we use the
-                  // default
-                  int tileSizeOverride = -1);
+                  bool ignoreOutDistances = false);

/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
                   Tensor<float, 2, true>& queries,
                   int k,
                   Tensor<float, 2, true>& outDistances,
-                  Tensor<int, 2, true>& outIndices,
-                  // Hint to use a different sized tile for
-                  // multi-streaming the queries. If <= 0, we use the
-                  // default
-                  int tileSizeOverride = -1);
+                  Tensor<int, 2, true>& outIndices);

#ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources,
@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
                   int k,
                   Tensor<half, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
-                  bool useHgemm,
-                  int tileSizeOverride = -1);
+                  bool useHgemm);

void runL2Distance(GpuResources* resources,
                   Tensor<half, 2, true>& vectors,
@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
                   Tensor<half, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   bool useHgemm,
-                  bool ignoreOutDistances = false,
-                  int tileSizeOverride = -1);
+                  bool ignoreOutDistances = false);

#endif

} } // namespace
@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
                 int k,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
-                bool exactDistance,
-                int tileSize) {
+                bool exactDistance) {
  auto stream = resources_->getDefaultStreamCurrentDevice();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
    DeviceTensor<half, 2, true> outDistancesHalf(
      mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);

-   query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
+   query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);

    if (exactDistance) {
      // Convert outDistances back
@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
                    outDistances,
                    outIndices,
                    // FIXME
-                   !exactDistance,
-                   tileSize);
+                   !exactDistance);
    } else {
      runIPDistance(resources_,
                    vectors_,
@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
                    input,
                    k,
                    outDistances,
-                   outIndices,
-                   tileSize);
+                   outIndices);
    }
  }
}
@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
                 int k,
                 Tensor<half, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
-                bool exactDistance,
-                int tileSize) {
+                bool exactDistance) {
  FAISS_ASSERT(useFloat16_);

  if (l2Distance_) {
@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
                  outIndices,
                  useFloat16Accumulator_,
                  // FIXME
-                 !exactDistance,
-                 tileSize);
+                 !exactDistance);
  } else {
    runIPDistance(resources_,
                  vectorsHalf_,
@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
                  k,
                  outDistances,
                  outIndices,
-                 useFloat16Accumulator_,
-                 tileSize);
+                 useFloat16Accumulator_);
  }
}

#endif
@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
    rawData_.append((char*) devDataHalf.data(),
                    devDataHalf.getSizeInBytes(),
-                   stream);
+                   stream,
+                   true /* reserve exactly */);
#endif
  } else {
    rawData_.append((char*) data,
                    (size_t) dim_ * numVecs * sizeof(float),
-                   stream);
+                   stream,
+                   true /* reserve exactly */);
  }

  num_ += numVecs;
......
@@ -61,16 +61,14 @@ class FlatIndex {
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
-            bool exactDistance,
-            int tileSize = -1);
+            bool exactDistance);

#ifdef FAISS_USE_FLOAT16
  void query(Tensor<half, 2, true>& vecs,
             int k,
             Tensor<half, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
-            bool exactDistance,
-            int tileSize = -1);
+            bool exactDistance);
#endif

  /// Add vectors to ourselves; the pointer passed can be on the host
......
@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                        closestSubQDistanceView,
                        closestSubQIndexView,
                        // We don't care about distances
-                       true,
-                       // Much larger tile size, since these vectors are a
-                       // lot smaller than query vectors
-                       1024);
+                       true);
    }

    // Now, we have the nearest sub-q centroid for each slice of the
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir> template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void __global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets, ...@@ -38,7 +35,7 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "IVFUtils.cuh" #include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh" #include "../utils/Select.cuh"
#include "../utils/StaticUtils.h" #include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh" #include "../utils/Tensor.cuh"
#include <limits>
// //
// This kernel is split into a separate compilation unit to cut down // This kernel is split into a separate compilation unit to cut down
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step // This is warp divergence central, but this is really a final step
// and happening a small number of times // and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets, inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances, ...@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ]; __shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ]; __shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax; constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>, BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock> NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k); heap(kInit, -1, smemK, smemV, k);
......
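Both select kernels now seed their heaps from shared kFloatMax/kFloatMin constants instead of per-file copies built with std::numeric_limits. Note that std::numeric_limits<float>::min() is the smallest positive normal float, not the most negative value, so a max-selection heap seeded with it could never admit negative similarities; the inner-product test over all-negative data added later in this change exercises exactly that case. A sketch of the shared constants, assuming Limits.cuh defines them along these lines:

#include <limits>

// Compile-time sentinels for the selection heaps: the largest and the
// lowest (most negative) representable float values.
constexpr float kFloatMax = std::numeric_limits<float>::max();     // ~ 3.4e38
constexpr float kFloatMin = std::numeric_limits<float>::lowest();  // ~ -3.4e38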
...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu { ...@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half) // T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g., // TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2) // float4, half2)
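// TIndex: the integer type used for sizes and offsets (int, or long when
// the tensor is too large for 32-bit indexing)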
template <typename T, typename TVec, template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared> int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input, __global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true> output) { Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte; T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize); TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId(); TIndex laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize; TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x; TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize]; T rowNorm[RowTileSize];
if (lastRowTile) { if (lastRowTile) {
// We are handling the very end of the input matrix rows // We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) { for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) { if (NormLoop) {
rowNorm[0] = Math<T>::zero(); rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col]; TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val); val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val)); rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero(); rowNorm[row] = Math<T>::zero();
} }
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll #pragma unroll
for (int row = 0; row < RowTileSize; ++row) { for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col]; tmp[row] = input[rowStart + row][col];
...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input, ...@@ -172,32 +174,32 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
} }
} }
template <typename T, typename TVec> template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true>& input, void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true>& output, Tensor<T, 1, true, TIndex>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice(); TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8; constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \ #define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \ do { \
if (normLoop) { \ if (normLoop) { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} else { \ } else { \
if (normSquared) { \ if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} else { \ } else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \ l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \ <<<grid, block, smem, stream>>>(INPUT, output); \
} \ } \
} \ } \
...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -207,9 +209,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
// Can load using the vectorized type // Can load using the vectorized type
auto inputV = input.template castResize<TVec>(); auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1); auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input, ...@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else { } else {
// Can't load using the vectorized type // Can't load using the vectorized type
int dim = input.getSize(1); auto dim = input.getSize(1);
bool normLoop = dim > maxThreads; bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads); auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads); auto block = dim3(numThreads);
...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input, ...@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output, Tensor<float, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
} }
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input, ...@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output, Tensor<half, 1, true>& output,
bool normSquared, bool normSquared,
cudaStream_t stream) { cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream); if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
} }
#endif #endif
......
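The int/long dispatch above falls back to 64-bit indexing only when 32-bit offsets could overflow: 1,000,000 vectors of dimension 800 give 8e8 elements and still fit in an int, while 5,000,000 vectors of dimension 512 give 2.56e9 and do not. A standalone sketch of an equivalent check, assuming canUseIndexType<int>() amounts to roughly this for a contiguous 2-D tensor:

#include <climits>
#include <cstdint>

// True when every linear offset of a rows x cols matrix fits in a signed
// 32-bit int, i.e. when the cheaper int-indexed kernels can be used.
bool canUseInt32Indexing(std::int64_t rows, std::int64_t cols) {
  return rows * cols <= static_cast<std::int64_t>(INT_MAX);
}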
...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs"); ...@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions"); DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors"); DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies"); DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32"); DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed"); DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed"); DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use"); DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use"); DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu; using namespace faiss::gpu;
...@@ -72,7 +75,10 @@ int main(int argc, char** argv) { ...@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config; GpuIndexFlatConfig config;
config.device = dev; config.device = dev;
config.useFloat16 = FLAGS_use_float16; config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed; config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>( auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config)); new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
...@@ -90,9 +96,9 @@ int main(int argc, char** argv) { ...@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
if (FLAGS_cpu) {
float cpuTime = 0.0f; float cpuTime = 0.0f;
{
CpuTimer timer; CpuTimer timer;
index->search(numQueries, index->search(numQueries,
cpuQuery.data(), cpuQuery.data(),
...@@ -101,9 +107,8 @@ int main(int argc, char** argv) { ...@@ -101,9 +107,8 @@ int main(int argc, char** argv) {
cpuIndices.data()); cpuIndices.data());
cpuTime = timer.elapsedMilliseconds(); cpuTime = timer.elapsedMilliseconds();
}
printf("CPU time %.3f ms\n", cpuTime); printf("CPU time %.3f ms\n", cpuTime);
}
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k}); HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k}); HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
...@@ -131,14 +136,14 @@ int main(int argc, char** argv) { ...@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop()); CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime); printf("GPU time %.3f ms\n", gpuTime);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(), compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(), gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k, numQueries, FLAGS_k,
"", true, FLAGS_diff, false); "", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize()); CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0; return 0;
} }
...@@ -21,29 +21,47 @@ ...@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f; constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f; constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2, struct TestFlatOptions {
bool useFloat16, TestFlatOptions()
bool useTransposed, : useL2(true),
int kOverride = -1) { useFloat16(false),
int numVecs = faiss::gpu::randVal(1000, 20000); useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800); int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512); int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k, // Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a // the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16 // fairly small `k` for float16
int k = useFloat16 ? int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) : std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs); std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) { if (opt.kOverride > 0) {
k = kOverride; k = opt.kOverride;
} }
faiss::IndexFlatIP cpuIndexIP(dim); faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim); faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex = faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP; opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have // Construct on a random device to test multi-device, if we have
// multiple devices // multiple devices
...@@ -55,14 +73,14 @@ void testFlat(bool useL2, ...@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config; faiss::gpu::GpuIndexFlatConfig config;
config.device = device; config.device = device;
config.useFloat16 = useFloat16; config.useFloat16 = opt.useFloat16;
config.storeTransposed = useTransposed; config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex = faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP; (faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim); std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
...@@ -70,37 +88,53 @@ void testFlat(bool useL2, ...@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data()); gpuIndex->add(numVecs, vecs.data());
std::stringstream str; std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim << " dim " << dim
<< " useFloat16 " << useFloat16 << " useFloat16 " << opt.useFloat16
<< " transposed " << useTransposed << " transposed " << opt.useTransposed
<< " numQuery " << numQuery << " numQuery " << numQuery
<< " k " << k; << " k " << k;
// To some extent, we depend upon the relative error for the test // To some extent, we depend upon the relative error for the test
// for float16 // for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are // FIXME: the fp16 bounds are
// useless when math (the accumulator) is // useless when math (the accumulator) is
// in fp16. Figure out another way to test // in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f, opt.useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f); opt.useFloat16 ? 0.65f : 0.015f);
} }
TEST(TestGpuIndexFlat, IP_Float32) { TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) { ...@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false); TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) { ...@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) { for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed(); faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1); TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// Test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
} }
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h" #include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h" #include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h" #include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { ...@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false); copyToTest(false, false);
} }
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f = std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
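// Every query/database inner product is therefore non-positive, so the
// search has to rank purely negative similarities correctly.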
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
// //
// NaN tests // NaN tests
// //
......
...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) { ...@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v; return v;
} }
void compareIndices(faiss::Index& refIndex, void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
const std::string& configMsg, const std::string& configMsg,
float maxRelativeError, float maxRelativeError,
float pctMaxDiff1, float pctMaxDiff1,
float pctMaxDiffN) { float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare // Compare
std::vector<float> refDistance(numQuery * k, 0); std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(), refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data()); k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0); std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1); std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(), testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data()); k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(), faiss::gpu::compareLists(refDistance.data(),
...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex, ...@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN); maxRelativeError, pctMaxDiff1, pctMaxDiffN);
} }
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
template <typename T> template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) { inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j]; return p[i * dim2 + j];
......
...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) { ...@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1] /// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim); std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity /// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex, void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex, faiss::Index& testIndex,
int numQuery, int dim, int k, int numQuery, int dim, int k,
......
...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None): ...@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor) assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k) assert I.size() == (n, k)
assert I.is_contiguous() assert I.is_contiguous()
torch.cuda.synchronize()
xptr = x.storage().data_ptr() xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr() Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr() Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr), index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr), k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr)) faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I return D, I
......
...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in, ...@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace } } // namespace
...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in, ...@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
} }
} }
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif #endif
} } // namespace } } // namespace
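The k thresholds in both runBlockSelectPair variants mirror the warp-queue sizes the kernels are instantiated for, so a requested k is effectively rounded up to the next supported queue length (k = 100 runs the 128-wide instantiation). A standalone sketch of that rounding, separate from the actual BLOCK_SELECT_PAIR_CALL machinery:

#include <initializer_list>

// Round a requested k up to the nearest instantiated warp-queue size;
// returns -1 for k > 1024, which these kernels do not support.
int chooseWarpQueueSize(int k) {
  for (int q : {1, 32, 64, 128, 256, 512, 1024}) {
    if (k <= q) {
      return q;
    }
  }
  return -1;
}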
...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in, ...@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
} }
} }
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle the trailing elements that do not fill a whole warp
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
void runBlockSelect(Tensor<float, 2, true>& in, void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys, Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16 #ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in, void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys, Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices, Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream); bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif #endif
} } // namespace } } // namespace
...@@ -12,37 +12,37 @@ ...@@ -12,37 +12,37 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) : DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(MemorySpace::Device) { space_(MemorySpace::Device) {
this->operator=(std::move(t)); this->operator=(std::move(t));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) { if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
} }
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t)); std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner; this->state_ = t.state_; t.state_ = AllocState::NotOwner;
...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=( ...@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_)); CUDA_VERIFY(cudaFree(this->data_));
...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() { ...@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation // destructor will return the reservation
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0)); FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
const IndexT sizes[Dim], const IndexT sizes[Dim],
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
} }
// memory reservation constructor // memory reservation constructor
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory); reservation_ = std::move(memory);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes, std::initializer_list<IndexT> sizes,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim], const IndexT strides[Dim],
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner), state_(AllocState::NotOwner),
space_(space) { space_(space) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner), state_(AllocState::Owner),
space_(space) { space_(space) {
...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m, DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space) : MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation), state_(AllocState::Reservation),
space_(space) { space_(space) {
...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor( ...@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream); this->copyFrom(t, stream);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero( DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) { cudaStream_t stream) {
if (this->data_) { if (this->data_) {
// Region must be contiguous // Region must be contiguous
......
...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu { ...@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor(); __host__ ~DeviceTensor();
/// Move constructor /// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); __host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment /// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t); operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it /// Constructs a tensor of the given size, allocating memory for it
/// locally /// locally
...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally /// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary /// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager. /// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m, __host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream, cudaStream_t stream,
MemorySpace space = MemorySpace::Device); MemorySpace space = MemorySpace::Device);
/// Call to zero out memory /// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream); zero(cudaStream_t stream);
private: private:
......
...@@ -43,7 +43,7 @@ void synchronizeAllDevices() { ...@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
} }
} }
cudaDeviceProp& getDeviceProperties(int device) { const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex; static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties; static std::unordered_map<int, cudaDeviceProp> properties;
...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) { ...@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second; return it->second;
} }
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) { int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock; return getDeviceProperties(device).maxThreadsPerBlock;
} }
......
...@@ -31,7 +31,10 @@ int getNumDevices(); ...@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices(); void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device /// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device); const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU /// Returns the maximum number of threads available for the given GPU
/// device /// device
......
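A hypothetical usage sketch of the new accessor; the function and the cudaDeviceProp fields exist as shown above, but the include path and the printing are illustrative only:

#include <cstdio>
#include <cuda_runtime.h>
#include "DeviceUtils.h"  // assumed include path for the declarations above

// Print a few limits of the device the calling thread is currently using.
void printCurrentDeviceLimits() {
  const cudaDeviceProp& prop = faiss::gpu::getCurrentDeviceProperties();
  std::printf("%s: %d threads/block, %zu bytes of shared memory per block\n",
              prop.name, prop.maxThreadsPerBlock, prop.sharedMemPerBlock);
}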
...@@ -10,18 +10,18 @@ ...@@ -10,18 +10,18 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() : HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) { if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_; delete[] this->data_;
...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() { ...@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
} }
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) { state_(AllocState::Owner) {
this->data_ = new T[this->numElements()]; this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr); FAISS_ASSERT(this->data_ != nullptr);
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim]) : const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
std::initializer_list<IndexT> sizes) : std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data, DataPtrType data,
const IndexT sizes[Dim], const IndexT sizes[Dim],
const IndexT strides[Dim]) : const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) { state_(AllocState::NotOwner) {
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ __host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) : cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()), Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) { state_(AllocState::Owner) {
// Only contiguous arrays handled for now // Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous()); FAISS_ASSERT(t.isContiguous());
...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor( ...@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
} }
/// Call to zero out memory /// Call to zero out memory
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous // Region must be contiguous
FAISS_ASSERT(this->isContiguous()); FAISS_ASSERT(this->isContiguous());
...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() { ...@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this; return *this;
} }
template <typename T, int Dim, bool Contig, template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits> typename IndexT, template <typename U> class PtrTraits>
__host__ T __host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff( HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements(); auto size = this->numElements();
FAISS_ASSERT(size == t.numElements()); FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0); FAISS_ASSERT(size > 0);
if (Contig) { if (InnerContig) {
auto a = this->data(); auto a = this->data();
auto b = t.data(); auto b = t.data();
......
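For orientation: the Contig → InnerContig rename through this file means only the innermost dimension must have unit stride; outer dimensions may be padded. A minimal sketch of the layout that allows, using the (data, sizes, strides) Tensor constructor forwarded above (buffer, sizes and strides are made-up values, not part of this commit):

    #include <vector>
    #include "Tensor.cuh"   // faiss/gpu/utils/Tensor.cuh; adjust to your source layout

    void innerContigExample() {
      // 4 logical rows of 5 floats, each row padded to a pitch of 8 floats:
      // stride(1) == 1 (inner-contiguous) but stride(0) == 8 != 5, so the
      // tensor is not fully contiguous, yet InnerContig = true can describe it.
      std::vector<float> buf(4 * 8, 0.0f);
      int sizes[2]   = {4, 5};
      int strides[2] = {8, 1};
      faiss::gpu::Tensor<float, 2, true /* InnerContig */, int> t(buf.data(), sizes, strides);
      (void) t;
    }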
...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu { ...@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T, template <typename T,
int Dim, int Dim,
bool Contig = false, bool InnerContig = false,
typename IndexT = int, typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits> template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public: public:
typedef IndexT IndexType; typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType; typedef typename PtrTraits<T>::PtrType DataPtrType;
...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> { ...@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it /// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to /// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream. /// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t, __host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream); cudaStream_t stream);
/// Call to zero out memory /// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero(); __host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors /// Returns the maximum difference seen between two tensors
__host__ T __host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const; maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal? /// Are the two tensors exactly equal?
__host__ bool __host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const { equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0); return (maxDiff(t) == (T) 0);
} }
......
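A hedged usage sketch of the HostTensor interface above: copy two device tensors to the host on a stream, then compare them. The device tensors and the stream are assumed to exist already; the explicit synchronize is a conservative addition, not something this header requires.

    #include <cuda_runtime.h>
    #include "HostTensor.cuh"   // faiss/gpu/utils/HostTensor.cuh; adjust to your layout

    float compareOnHost(faiss::gpu::Tensor<float, 2, true>& devA,
                        faiss::gpu::Tensor<float, 2, true>& devB,
                        cudaStream_t stream) {
      faiss::gpu::HostTensor<float, 2, true> hostA(devA, stream);  // allocates and copies
      faiss::gpu::HostTensor<float, 2, true> hostB(devB, stream);
      cudaStreamSynchronize(stream);       // make sure the copies have landed
      bool same = hostA.equal(hostB);      // true iff maxDiff(t) == (T) 0
      (void) same;
      return hostA.maxDiff(hostB);         // largest element-wise difference seen
    }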
...@@ -24,11 +24,12 @@ struct Limits { ...@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half // constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity // FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max(); constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
template <> template <>
struct Limits<float> { struct Limits<float> {
static __device__ __host__ inline float getMin() { static __device__ __host__ inline float getMin() {
return -kFloatMax; return kFloatMin;
} }
static __device__ __host__ inline float getMax() { static __device__ __host__ inline float getMax() {
return kFloatMax; return kFloatMax;
...@@ -55,8 +56,8 @@ struct Limits<half> { ...@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16 #endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max(); constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <> template <>
struct Limits<int> { struct Limits<int> {
......
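For context on the constants above (standard C++ behavior, not part of the diff): std::numeric_limits<T>::lowest() is the most negative finite value, which for IEEE-754 float equals -max(), so kFloatMin is exactly the value the old -kFloatMax produced; min() would have been wrong for float because it is the smallest positive normalized value. For int, lowest() and min() coincide, so the kIntMin change is a consistency cleanup.

    #include <cassert>
    #include <limits>

    int main() {
      // lowest() == -max() for IEEE-754 float, so kFloatMin matches the old -kFloatMax.
      static_assert(std::numeric_limits<float>::lowest()
                        == -std::numeric_limits<float>::max(),
                    "lowest() is the most negative finite float");
      // min() is NOT the most negative float; it is the smallest positive normal value.
      assert(std::numeric_limits<float>::min() > 0.0f);
      return 0;
    }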
...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK); FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN); FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in // Now, we have to represent the matrix multiplication in
// column-major layout // column-major layout
T* pA = transC ? a.data() : b.data(); T* pA = transC ? a.data() : b.data();
...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC, ...@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1); int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1); int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getSize(1) : a.getSize(1); int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getSize(1); int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2); int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2); int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getSize(2) : a.getSize(2); int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getSize(2); int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC, ...@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)}); HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)}); HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2); size_t aOffset = a.getStride(0);
size_t bOffset = b.getSize(1) * b.getSize(2); size_t bOffset = b.getStride(0);
size_t cOffset = c.getSize(1) * c.getSize(2); size_t cOffset = c.getStride(0);
for (int i = 0; i < a.getSize(0); ++i) { for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset; hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
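For intuition about the lda/ldb/ldc change above (a generic illustration, not FAISS code): a row-major [rows x cols] matrix whose rows are pitch elements apart is exactly a column-major [cols x rows] matrix with leading dimension pitch. Taking the leading dimension from getStride() instead of getSize() therefore lets row-padded, inner-contiguous tensors be passed to cuBLAS, and the new getStride(1) == 1 asserts rule out the one layout this trick cannot express.

    // Element (i, j) of a column-major matrix with leading dimension ld:
    inline float colMajorAt(const float* a, int ld, int i, int j) {
      return a[j * ld + i];
    }
    // Element (r, c) of a row-major matrix whose rows are `pitch` elements apart:
    inline float rowMajorAt(const float* a, int pitch, int r, int c) {
      return a[r * pitch + c];
    }
    // For all r, c: rowMajorAt(a, pitch, r, c) == colMajorAt(a, pitch, c, r).
    // The row-major [rows x cols] buffer therefore *is* a column-major
    // [cols x rows] matrix with ld == pitch, which is why the leading
    // dimensions above now come from the row strides.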
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace faiss { namespace gpu { namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int> template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor { class NoTypeTensor {
public: public:
NoTypeTensor() NoTypeTensor()
...@@ -25,7 +25,7 @@ class NoTypeTensor { ...@@ -25,7 +25,7 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t) NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()), : mem_(t.data()),
typeSize_(sizeof(T)) { typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) { for (int i = 0; i < Dim; ++i) {
...@@ -87,13 +87,14 @@ class NoTypeTensor { ...@@ -87,13 +87,14 @@ class NoTypeTensor {
} }
template <typename T> template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() { Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_); FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_); return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
} }
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) { NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_; char* newPtr = (char*) mem_;
if (start > 0) { if (start > 0) {
...@@ -110,7 +111,7 @@ class NoTypeTensor { ...@@ -110,7 +111,7 @@ class NoTypeTensor {
} }
} }
return NoTypeTensor<Dim, Contig, IndexT>( return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_); newPtr, typeSize_, newSize, stride_);
} }
......
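A hedged usage sketch of NoTypeTensor as changed above (the codes tensor and the rowStart/numRows arguments are placeholders): wrap a typed tensor while erasing its element type, narrow to a range of the outermost dimension without copying, and later recover a typed view with toTensor<T>().

    faiss::gpu::NoTypeTensor<2, true>
    sliceCodes(faiss::gpu::Tensor<unsigned char, 2, true>& codes,
               int rowStart, int numRows) {
      faiss::gpu::NoTypeTensor<2, true> all(codes);    // keeps sizes/strides, drops the type
      return all.narrowOutermost(rowStart, numRows);   // rows [rowStart, rowStart + numRows)
    }
    // When the element type is known again: slice.toTensor<unsigned char>().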
...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { ...@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
} }
static void write_ivf_header (const IndexIVF * ivf, FILE *f, static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) { bool include_ids = true) {
write_index_header (ivf, f); write_index_header (ivf, f);
...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) { ...@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained); READVECTOR (ivsc->trained);
} }
ProductQuantizer * read_ProductQuantizer (const char*fname) { ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r"); FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading", fname); FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading", fname);
...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) { ...@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
} }
idx = idxmap; idx = idxmap;
} else { } else {
fprintf (stderr, "Index type 0x%08x not supported\n", h); FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
abort (); idx = nullptr;
} }
return idx; return idx;
} }
......
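With the abort() replaced by FAISS_THROW_FMT above, an unrecognized index header is now recoverable by the caller. A hedged sketch of what that enables (the file name is a placeholder and include paths depend on how FAISS is installed):

    #include <cstdio>
    #include <faiss/index_io.h>
    #include <faiss/FaissException.h>

    int main() {
      try {
        faiss::Index* idx = faiss::read_index("my_index.faissindex");
        delete idx;
      } catch (const faiss::FaissException& e) {
        // Previously an unsupported index type printed to stderr and aborted;
        // now it throws, so callers can report the error and keep running.
        std::fprintf(stderr, "could not read index: %s\n", e.what());
      }
      return 0;
    }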
...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a, ...@@ -307,12 +307,20 @@ int fvec_madd_and_argmin (size_t n, const float *a,
void reflection (const float * u, float * x, size_t n, size_t d, size_t nu); void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
/** For k-means: update stage. Returns nb of split clusters. */ /** For k-means: update stage.
*
* @param x training vectors, size n * d
* @param centroids centroid vectors, size k * d
* @param assign nearest centroid for each training vector, size n
 * @param k_frozen do not update the first k_frozen centroids
 * @return nb of splitting operations performed to counter empty clusters
*/
int km_update_centroids ( int km_update_centroids (
const float * x, const float * x,
float * centroids, float * centroids,
long * assign, long * assign,
size_t d, size_t k, size_t n); size_t d, size_t k, size_t n,
size_t k_frozen);
/** compute the Q of the QR decomposition for m > n /** compute the Q of the QR decomposition for m > n
* @param a size n * m: input matrix and output Q * @param a size n * m: input matrix and output Q
......
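A hedged sketch of calling the updated function (all sizes and buffers are made up; in real use the assignments come from a nearest-centroid search): the first k_frozen centroids are left untouched while the rest are re-estimated.

    #include <vector>
    #include <faiss/utils.h>   // declares km_update_centroids; adjust to your layout

    int demoUpdate() {
      size_t d = 8, k = 10, n = 1000, k_frozen = 2;   // made-up sizes
      std::vector<float> x(n * d);                    // training vectors (filled elsewhere)
      std::vector<float> centroids(k * d);            // centroids [0, k_frozen) stay fixed
      std::vector<long>  assign(n);                   // nearest-centroid id per vector
      // Returns the nb of clusters that were split to repopulate empty ones.
      return faiss::km_update_centroids(
          x.data(), centroids.data(), assign.data(), d, k, n, k_frozen);
    }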