Commit 250a3d3f authored by matthijs

sync with FB version 2017-11-22

various bugfixes from GitHub issues
k-means with some frozen centroids
GPU: better tiling for large flat datasets
AVX enabled by default for vector ops
parent 71335194
......@@ -77,10 +77,10 @@ IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R):
double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I)
const {
FAISS_THROW_IF_NOT_MSG(
FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R),
"ground truth not initialized");
long n_ok = 0;
long n_ok = 0;
#pragma omp parallel for reduction(+: n_ok)
for (idx_t q = 0; q < nq; q++) {
n_ok += ranklist_intersection_size (
......@@ -345,11 +345,13 @@ void ParameterSpace::initialize (const Index * index)
}
if (DC (IndexIVF)) {
ParameterRange & pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
if (nprobe >= ix->nlist) break;
pr.values.push_back (nprobe);
{
ParameterRange & pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
if (nprobe >= ix->nlist) break;
pr.values.push_back (nprobe);
}
}
}
if (DC (IndexPQ)) {
......@@ -371,7 +373,6 @@ void ParameterSpace::initialize (const Index * index)
}
}
if (DC (IndexIVFPQR)) {
assert (ix);
ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i);
......@@ -427,12 +428,21 @@ void ParameterSpace::set_index_parameter (
if (name == "verbose") {
index->verbose = int(val);
// and fall through to also enable it on sub-indexes
}
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexShards)) {
// call on all sub-indexes
for (auto & shard_index : ix->shard_indexes) {
set_index_parameter (shard_index, name, val);
}
return;
}
if (name == "verbose") {
index->verbose = int(val);
// in case it was an IndexPreTransform
}
if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") {
......@@ -449,9 +459,12 @@ void ParameterSpace::set_index_parameter (
return; // last verbose that we could find
}
if (name == "nprobe") {
DC(IndexIVF);
ix->nprobe = int(val);
} else if (name == "ht") {
if ( DC(IndexIVF)) {
ix->nprobe = int(val);
return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ;
......@@ -459,25 +472,32 @@ void ParameterSpace::set_index_parameter (
ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val);
}
return;
} else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0;
} else {
ix->polysemous_ht = int(val);
}
return;
}
} else if (name == "k_factor") {
DC (IndexIVFPQR);
ix->k_factor = val;
} else if (name == "max_codes") {
DC (IndexIVFPQ);
ix->max_codes = finite(val) ? size_t(val) : 0;
} else {
FAISS_THROW_FMT (
"ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVFPQ)) {
ix->max_codes = finite(val) ? size_t(val) : 0;
return;
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
void ParameterSpace::display () const
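The restructured set_index_parameter above dispatches on the parameter name and returns as soon as a matching index type handles it, instead of chaining else-ifs. For context, a minimal Python sketch of driving these parameters through the auto-tuning layer; the factory string and data sizes are arbitrary placeholders:

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(10000, d).astype('float32')
    xb = np.random.rand(10000, d).astype('float32')
    xq = np.random.rand(100, d).astype('float32')

    index = faiss.index_factory(d, "IVF256,PQ8")
    index.train(xt)
    index.add(xb)

    ps = faiss.ParameterSpace()
    ps.initialize(index)                         # enumerates nprobe, ht, ...
    ps.set_index_parameter(index, "nprobe", 16)  # handled by the IndexIVF branch
    ps.set_index_parameter(index, "ht", 20)      # polysemous Hamming threshold
    D, I = index.search(xq, 10)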
......@@ -634,6 +654,15 @@ struct VTChain {
}
};
/// what kind of training does this coarse quantizer require?
char get_trains_alone(const Index *coarse_quantizer) {
return
dynamic_cast<const MultiIndexQuantizer*>(coarse_quantizer) ? 1 :
0;
}
}
Index *index_factory (int d, const char *description_in, MetricType metric)
......@@ -656,6 +685,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
tok;
tok = strtok_r (nullptr, " ,", &ptr)) {
int d_out, opq_M, nbit, M, M2;
char option[100];
std::string stok(tok);
// to avoid mem leaks with exceptions:
......@@ -686,7 +716,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
} else if (stok == "L2norm") {
vt_1 = new NormalizationTransform (d, 2.0);
// coarse quantizers
} else if (!coarse_quantizer &&
sscanf (tok, "IVF%d", &ncentroids) == 1) {
if (metric == METRIC_L2) {
......@@ -709,8 +739,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVF *index_ivf = new IndexIVFFlat (
coarse_quantizer, d, ncentroids, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
......@@ -728,8 +757,7 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
new IndexIVFScalarQuantizer (
coarse_quantizer, d, ncentroids, qt, metric);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
......@@ -744,29 +772,31 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
IndexIVFPQR *index_ivf = new IndexIVFPQR (
coarse_quantizer, d, ncentroids, M, 8, M2, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_1 = index_ivf;
} else if (!index && sscanf (tok, "PQ%d", &M) == 1) {
} else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
std::string soption = option;
// np to disable polysemous training
FAISS_THROW_IF_NOT(soption == "" || soption == "np");
if (coarse_quantizer) {
IndexIVFPQ *index_ivf = new IndexIVFPQ (
coarse_quantizer, d, ncentroids, M, 8);
index_ivf->quantizer_trains_alone =
dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
!= nullptr;
get_trains_alone (coarse_quantizer);
index_ivf->metric_type = metric;
index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
del_coarse_quantizer.release ();
index_ivf->own_fields = true;
index_ivf->do_polysemous_training = true;
index_ivf->do_polysemous_training = soption != "np";
index_1 = index_ivf;
} else {
IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
index_pq->do_polysemous_training = true;
index_pq->do_polysemous_training = soption != "np";
index_1 = index_pq;
}
} else if (stok == "RFlat") {
make_IndexRefineFlat = true;
} else {
......
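The PQ%d%10s change above lets the factory string carry an "np" suffix that disables polysemous training on the resulting PQ / IVFPQ index. A hedged sketch; the training data is random and only meant to exercise the parser:

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(20000, d).astype('float32')

    index_pq = faiss.index_factory(d, "PQ16np")            # IndexPQ, no polysemous training
    index_pq.train(xt)

    index_ivfpq = faiss.index_factory(d, "IVF256,PQ16np")  # IndexIVFPQ, same option
    print(index_ivfpq.do_polysemous_training)              # -> False
    index_ivfpq.train(xt)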
......@@ -25,7 +25,7 @@ namespace faiss {
/** The objective is to have a simple result structure while
* minimizing the number of mem copies in the result. The method
* do_allocation can be overloaded to allocate the result tables in
* the matrix type of a srcipting language like Lua or Python. */
* the matrix type of a scripting language like Lua or Python. */
struct RangeSearchResult {
size_t nq; ///< nb of queries
size_t *lims; ///< size (nq + 1)
......
......@@ -29,6 +29,7 @@ ClusteringParameters::ClusteringParameters ():
nredo(1),
verbose(false), spherical(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234)
......@@ -110,7 +111,24 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
float * dis = new float[nx];
ScopeDeleter<float> del2(dis);
// for redo
float best_err = 1e50;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n",
......@@ -120,39 +138,28 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
for (int redo = 0; redo < nredo; redo++) {
std::vector<float> buf_centroids;
std::vector<float> &cur_centroids =
nredo == 1 ? centroids : buf_centroids;
if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
if (cur_centroids.size() == 0) {
// initialize centroids with random points from the dataset
cur_centroids.resize (d * k);
std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
#pragma omp parallel for
for (int i = 0; i < k ; i++)
memcpy (&cur_centroids[i * d], x + perm[i] * d,
d * sizeof (float));
} else { // assume user provides some meaningful initialization
FAISS_THROW_IF_NOT (cur_centroids.size() == d * k);
FAISS_THROW_IF_NOT_MSG (nredo == 1,
"will redo with same initialization");
}
// initialize remaining centroids with random points from the dataset
centroids.resize (d * k);
std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
for (int i = n_input_centroids; i < k ; i++)
memcpy (&centroids[i * d], x + perm[i] * d,
d * sizeof (float));
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
if (!index.is_trained)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
FAISS_THROW_IF_NOT (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
float err = 0;
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
......@@ -164,8 +171,9 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
err += dis[j];
obj.push_back (err);
int nsplit = km_update_centroids (x, cur_centroids.data(),
assign, d, k, nx);
int nsplit = km_update_centroids (
x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): "
......@@ -178,26 +186,31 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
}
if (spherical)
fvec_renorm_L2 (d, k, cur_centroids.data());
fvec_renorm_L2 (d, k, centroids.data());
index.reset ();
if (update_index)
index.train (k, cur_centroids.data());
index.train (k, centroids.data());
assert (index.ntotal == 0);
index.add (k, cur_centroids.data());
index.add (k, centroids.data());
}
if (verbose) printf("\n");
if (nredo > 1) {
if (err < best_err) {
if (verbose)
printf ("Objective improved: keep new clusters\n");
centroids = buf_centroids;
best_centroids = centroids;
best_obj = obj;
best_err = err;
}
index.reset ();
}
}
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
}
}
......
......@@ -28,6 +28,7 @@ struct ClusteringParameters {
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
......
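A hedged Python sketch of the new frozen-centroids option, using the copy_array_to_vector helper added to faiss.py later in this commit. Shapes and counts are arbitrary, and it assumes the numpy-aware Clustering.train wrapper from faiss.py:

    import numpy as np
    import faiss

    d, k = 32, 100
    x = np.random.rand(20000, d).astype('float32')
    fixed = np.random.rand(10, d).astype('float32')   # 10 centroids to keep as-is

    clus = faiss.Clustering(d, k)
    clus.niter = 20
    clus.verbose = True
    clus.frozen_centroids = True
    # provide the input centroids; the remaining 90 are initialized randomly
    faiss.copy_array_to_vector(fixed.ravel(), clus.centroids)

    index = faiss.IndexFlatL2(d)                      # assignment index for k-means
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)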
......@@ -41,8 +41,7 @@ long Index::remove_ids(const IDSelector& /*sel*/) {
void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("Can not compute reconstruct without "
"knowing how to do so");
FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
......
......@@ -34,8 +34,9 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
nlist (nlist),
nprobe (1),
quantizer (quantizer),
quantizer_trains_alone (false),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr),
ids (nlist),
maintain_direct_map (false)
{
......@@ -56,7 +57,8 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
IndexIVF::IndexIVF ():
nlist (0), nprobe (1), quantizer (nullptr),
quantizer_trains_alone (false), own_fields (false),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr),
maintain_direct_map (false)
{}
......@@ -157,22 +159,44 @@ void IndexIVF::train (idx_t n, const float *x)
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone) {
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else {
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training IVF quantizer on %ld vectors in %dD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
clus.train (n, x, *quantizer);
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %dD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
if (verbose)
printf ("Training IVF residual\n");
......@@ -250,8 +274,9 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
{
FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_MSG (subset_type == 0 || subset_type == 2,
"this subset type is not implemented");
FAISS_THROW_IF_NOT_FMT (
subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0;
size_t accu_a1 = 0;
......@@ -275,15 +300,24 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
other.ntotal++;
}
}
} else if (subset_type == 1) {
for (long i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
ids_out.push_back (id);
codes_out.insert (codes_out.end(),
codes_in.begin() + i * code_size,
codes_in.begin() + (i + 1) * code_size);
other.ntotal++;
}
}
} else if (subset_type == 2) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1;
accu_a1 = next_accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2;
accu_a2 = next_accu_a2;
ids_out.insert(ids_out.end(),
ids_in.begin() + i1,
ids_in.begin() + i2);
......@@ -291,6 +325,8 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
codes_in.begin() + i1 * code_size,
codes_in.begin() + i2 * code_size);
other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
}
accu_n += n;
}
......
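subset_type 1 above selects the vectors whose id satisfies id % a1 == a2, which is what the GPU cloner's new shard_type 1 relies on. A hedged CPU-side sketch, assuming copy_subset_to is reachable through the Python wrapper:

    import numpy as np
    import faiss

    d, nlist = 32, 64
    xt = np.random.rand(10000, d).astype('float32')
    xb = np.random.rand(20000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xt)
    index.add(xb)                   # sequential ids 0 .. 19999

    # split by id modulo 2: shard j receives the ids with id % 2 == j
    shards = []
    for j in range(2):
        shard = faiss.IndexIVFFlat(quantizer, d, nlist)   # shares the trained quantizer
        index.copy_subset_to(shard, 1, 2, j)
        shards.append(shard)

    assert sum(s.ntotal for s in shards) == index.ntotal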
......@@ -47,10 +47,17 @@ struct IndexIVF: Index {
size_t nprobe; ///< number of probes at query time
Index * quantizer; ///< quantizer that maps vectors to inverted lists
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
......
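The new clustering_index field lets the coarse k-means run its assignment step on a different index than the quantizer that finally holds the centroids, and quantizer_trains_alone becomes the three-way switch documented above. A hedged sketch of the clustering_index path from Python; a GPU flat index is just one possible choice for the assignment index:

    import numpy as np
    import faiss

    d, nlist = 64, 256
    xt = np.random.rand(50000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)

    # k-means assignments are computed on clustering_index; only the final
    # centroids end up in `quantizer`.  In a GPU build this could be e.g.
    # faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)).
    assignment_index = faiss.IndexFlatL2(d)
    index.clustering_index = assignment_index
    index.train(xt)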
......@@ -291,8 +291,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
for (int j = 0; j < d; j++) {
r[j] += centroid[j];
}
}
else {
} else {
pq.decode (code_line + ofs * pq.code_size, r);
}
}
......@@ -303,6 +302,7 @@ void IndexIVFPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
void IndexIVFPQ::reconstruct (idx_t key, float * recons) const
{
FAISS_THROW_IF_NOT (direct_map.size() == ntotal);
int list_no = direct_map[key] >> 32;
int ofs = direct_map[key] & 0xffffffff;
......@@ -1029,6 +1029,51 @@ void IndexIVFPQ::search_preassigned (idx_t nx, const float *qx, idx_t k,
}
void IndexIVFPQ::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed)
{
long * idx = new long [n * nprobe];
ScopeDeleter<long> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true);
for (long i = 0; i < n; i++) {
for (long j = 0; j < k; j++) {
long ij = i * k + j;
idx_t res = labels[ij];
float *recons = reconstructed + d * (ij);
if (res < 0) {
// fill with NaNs
memset(recons, -1, sizeof(*recons) * d);
} else {
int list_no = res >> 32;
int ofs = res & 0xffffffff;
labels[ij] = ids[list_no][ofs];
quantizer->reconstruct (list_no, recons);
const uint8_t * code = &(codes[list_no][ofs * pq.code_size]);
for (size_t m = 0; m < pq.M; m++) {
float * out = recons + m * pq.dsub;
const float * cent = pq.get_centroids (m, code[m]);
for (size_t l = 0; l < pq.dsub; l++) {
out[l] += cent[l];
}
}
}
}
}
}
IndexIVFPQ::IndexIVFPQ ()
......
......@@ -114,6 +114,15 @@ struct IndexIVFPQ: IndexIVF {
float *distances, idx_t *labels,
bool store_pairs) const override;
/** Same as the search function, but also reconstruct approximate
* vectors for the search results
*
* @param reconstructed size (n, k, d)
**/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *reconstructed);
/// build precomputed table
void precompute_table ();
......
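search_and_reconstruct is also hooked into the Python wrapper further down in this commit, so it can be called with numpy arrays. A hedged sketch; sizes are arbitrary and the method currently exists on IndexIVFPQ only:

    import numpy as np
    import faiss

    d, nlist, M = 64, 128, 8
    xt = np.random.rand(20000, d).astype('float32')
    xb = np.random.rand(10000, d).astype('float32')
    xq = np.random.rand(5, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, M, 8)
    index.train(xt)
    index.add(xb)
    index.nprobe = 8

    k = 4
    D, I, R = index.search_and_reconstruct(xq, k)   # R has shape (5, k, d)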
......@@ -124,8 +124,8 @@ struct Codec4bit {
struct SimilarityL2 {
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
explicit SimilarityL2 (const float * y): y(y) {}
/******* scalar accumulator *******/
......@@ -676,19 +676,19 @@ void ScalarQuantizer::compute_codes (const float * x,
size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->encode_vector (x + i * d, codes + i * code_size);
delete squant;
}
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
Quantizer *squant = select_quantizer (*this);
ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
squant->decode_vector (codes + i * code_size, x + i * d);
delete squant;
}
/*******************************************************************
......@@ -754,6 +754,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
minheap_reorder (k, simi, idxi);
}
} else {
#pragma omp parallel for
......@@ -774,7 +775,7 @@ void IndexScalarQuantizer::search(
}
ci += code_size;
}
maxheap_reorder (k, simi, idxi);
}
}
......@@ -855,6 +856,7 @@ void IndexIVFScalarQuantizer::add_with_ids
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
long list_no = idx [i];
......@@ -879,6 +881,7 @@ void IndexIVFScalarQuantizer::add_with_ids
ntotal += nadd;
}
namespace {
void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
const float *x,
......@@ -958,6 +961,8 @@ void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
maxheap_reorder (k, simi, idxi);
}
} // anonymous namespace
void IndexIVFScalarQuantizer::search_preassigned (
idx_t n, const float *x, idx_t k,
const idx_t *idx,
......
......@@ -87,54 +87,59 @@ _swigfaiss.so: python/_swigfaiss.so
cp python/_swigfaiss.so python/swigfaiss.py .
#############################
# Dependencies
# Dependencies.
# make dep > x
# then copy/paste from x by hand below
# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
dep:
for i in $(patsubst %.o,%.cpp,$(LIBOBJ)) ; do \
cpp -MM -std=gnu++0x $$i ; \
done
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
FaissException.o: FaissException.cpp FaissException.h
hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
Heap.o: Heap.cpp Heap.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h AuxIndexStructures.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
hamming.h FaissAssert.h FaissException.h IndexFlat.h \
AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
Heap.h hamming.h FaissAssert.h FaissException.h
IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
IndexFlat.h hamming.h FaissAssert.h FaissException.h \
AuxIndexStructures.h
Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
FaissAssert.h FaissException.h IndexFlat.h
Heap.o: Heap.cpp Heap.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
ProductQuantizer.h Clustering.h Index.h Heap.h utils.h hamming.h \
FaissAssert.h FaissException.h
MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
FaissException.h Heap.h AuxIndexStructures.h
Index.o: Index.cpp IndexFlat.h Index.h FaissAssert.h FaissException.h
ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
IndexFlat.h utils.h
utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
FaissAssert.h FaissException.h
VectorTransform.o: VectorTransform.cpp VectorTransform.h Index.h utils.h \
Heap.h FaissAssert.h FaissException.h IndexPQ.h ProductQuantizer.h \
Clustering.h PolysemousTraining.h
AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
FaissException.h
FaissException.o: FaissException.cpp FaissException.h
clean:
......
......@@ -76,6 +76,17 @@ void IndexIDMap::search (idx_t n, const float *x, idx_t k,
}
}
void IndexIDMap::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace {
struct IDTranslatedSelector: IDSelector {
......@@ -109,6 +120,7 @@ long IndexIDMap::remove_ids (const IDSelector & sel)
}
FAISS_ASSERT (j == index->ntotal);
ntotal = j;
id_map.resize(ntotal);
return nremove;
}
......
......@@ -51,6 +51,9 @@ struct IndexIDMap : Index {
/// remove ids adapted to IndexFlat
long remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const override;
~IndexIDMap() override;
IndexIDMap () {own_fields=false; index=nullptr; }
};
......
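With range_search now forwarded by IndexIDMap, and the labels mapped back to the user-supplied ids, the usual numpy wrapper applies. A hedged sketch:

    import numpy as np
    import faiss

    d = 16
    xb = np.random.rand(1000, d).astype('float32')
    ids = (np.arange(1000) + 100000).astype('int64')   # arbitrary user-supplied ids

    base = faiss.IndexFlatL2(d)
    index = faiss.IndexIDMap(base)
    index.add_with_ids(xb, ids)

    xq = xb[:3].copy()
    lims, D, I = index.range_search(xq, 0.5)   # I contains the remapped ids
    neighbors_of_q0 = I[lims[0]:lims[1]]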
......@@ -804,18 +804,38 @@ void IndexPreTransform::train (idx_t n, const float *x)
const float *prev_x = x;
ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) {
VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained)
ltrans->train(n, prev_x);
if (!ltrans->is_trained) {
if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x);
}
if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete prev_x;
if (prev_x != x) delete [] prev_x;
prev_x = xt;
del.set(xt);
}
......
......@@ -521,7 +521,7 @@ def compute_populated_index(preproc):
co.verbose = True
co.reserveVecs = max_add if max_add > 0 else xb.shape[0]
co.shard = True
assert co.shard_type in (0, 1, 2)
vres, vdev = make_vres_vdev()
gpu_index = faiss.index_cpu_to_gpu_multiple(
vres, vdev, indexall, co)
......@@ -630,7 +630,7 @@ def get_populated_index(preproc):
co.usePrecomputed = use_precomputed_tables
co.indicesOptions = 0
co.verbose = True
co.shard = True # the replicas will be made "manually"
co.shard = True # the replicas will be made "manually"
t0 = time.time()
print "CPU index contains %d vectors, move to GPU" % indexall.ntotal
if replicas == 1:
......
......@@ -121,6 +121,18 @@ def handle_Index(the_class):
swig_ptr(labels))
return distances, labels
def replacement_search_and_reconstruct(self, x, k):
n, d = x.shape
assert d == self.d
distances = np.empty((n, k), dtype=np.float32)
labels = np.empty((n, k), dtype=np.int64)
recons = np.empty((n, k, d), dtype=np.float32)
self.search_and_reconstruct_c(n, swig_ptr(x),
k, swig_ptr(distances),
swig_ptr(labels),
swig_ptr(recons))
return distances, labels, recons
def replacement_remove_ids(self, x):
if isinstance(x, IDSelector):
sel = x
......@@ -167,6 +179,8 @@ def handle_Index(the_class):
replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'update_vectors', replacement_update_vectors,
ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
def handle_VectorTransform(the_class):
......@@ -258,12 +272,52 @@ def index_cpu_to_gpu_multiple_py(resources, index, co=None):
return index_cpu_to_gpu_multiple(vres, vdev, index, co)
def vector_float_to_array(v):
a = np.empty(v.size(), dtype='float32')
memcpy(swig_ptr(a), v.data(), 4 * v.size())
def index_cpu_to_all_gpus(index, co=None, ngpu=-1):
if ngpu == -1:
ngpu = get_num_gpus()
res = [StandardGpuResources() for i in range(ngpu)]
index2 = index_cpu_to_gpu_multiple_py(res, index, co)
index2.dont_dealloc = res
return index2
# mapping from vector names in swigfaiss.swig and the numpy dtype names
vector_name_map = {
'Float': 'float32',
'Byte': 'uint8',
'Uint64': 'uint64',
'Long': 'int64',
'Int': 'int32',
'Double': 'float64'
}
def vector_to_array(v):
""" convert a C++ vector to a numpy array """
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
a = np.empty(v.size(), dtype=dtype)
memcpy(swig_ptr(a), v.data(), a.nbytes)
return a
def vector_float_to_array(v):
return vector_to_array(v)
def copy_array_to_vector(a, v):
""" copy a numpy array to a vector """
n, = a.shape
classname = v.__class__.__name__
assert classname.endswith('Vector')
dtype = np.dtype(vector_name_map[classname[:-6]])
assert dtype == a.dtype, (
'cannot copy a %s array to a %s (should be %s)' % (
a.dtype, classname, dtype))
v.resize(n)
memcpy(v.data(), swig_ptr(a), a.nbytes)
class Kmeans:
def __init__(self, d, k, niter=25, verbose=False, spherical = False):
......@@ -364,3 +418,18 @@ def eval_intersection(I1, I2):
def normalize_L2(x):
fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
def replacement_map_add(self, keys, vals):
n, = keys.shape
assert (n,) == keys.shape
self.add_c(n, swig_ptr(keys), swig_ptr(vals))
def replacement_map_search_multiple(self, keys):
n, = keys.shape
vals = np.empty(n, dtype='uint64')
self.search_multiple_c(n, swig_ptr(keys), swig_ptr(vals))
return vals
replace_method(MapLong2Long, 'add', replacement_map_add)
replace_method(MapLong2Long, 'search_multiple', replacement_map_search_multiple)
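The generalized vector<->numpy helpers and the MapLong2Long wrappers added above can be combined, for example to inspect trained centroids or remap ids. A hedged sketch:

    import numpy as np
    import faiss

    d, nlist = 32, 64
    xt = np.random.rand(10000, d).astype('float32')

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xt)

    # pull the coarse centroids out of the flat quantizer as a numpy array
    centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d)

    # the long->long map with its numpy-aware add / search_multiple
    m = faiss.MapLong2Long()
    keys = np.arange(5).astype('int64')
    vals = (keys * 10).astype('int64')
    m.add(keys, vals)
    print(m.search_multiple(keys))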
......@@ -8,6 +8,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuAutoTune.h"
#include <typeinfo>
#include "GpuIndex.h"
#include "../FaissAssert.h"
......@@ -97,17 +98,6 @@ faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index)
GpuClonerOptions::GpuClonerOptions():
indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(true),
reserveVecs(0),
storeTransposed(false),
verbose(0)
{}
struct ToGpuCloner: faiss::Cloner, GpuClonerOptions {
GpuResources *resources;
int device;
......@@ -185,9 +175,6 @@ faiss::Index * index_cpu_to_gpu(
return cl.clone_Index(index);
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions(): shard(false)
{}
struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners;
......@@ -211,6 +198,28 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
{}
void copy_ivf_shard (const IndexIVF *index_ivf, IndexIVF *idx2,
long n, long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if(verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n",
i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT ("shard_type %d not implemented", shard_type);
}
}
Index *clone_Index(const Index *index) override {
long n = sub_cloners.size();
if (n == 1)
......@@ -231,19 +240,13 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
dynamic_cast<const faiss::IndexIVFPQ *>(index);
auto index_ivfflat =
dynamic_cast<const faiss::IndexIVFFlat *>(index);
FAISS_ASSERT_MSG (index_ivfpq || index_ivfflat,
FAISS_THROW_IF_NOT_MSG (index_ivfpq || index_ivfflat,
"IndexShards implemented only for "
"IndexIVFFlat or IndexIVFPQ");
std::vector<faiss::Index*> shards(n);
for(long i = 0; i < n; i++) {
// make a shallow copy
long i0 = i * index->ntotal / n;
long i1 = (i + 1) * index->ntotal / n;
if(verbose)
printf("IndexShards shard %ld indices %ld:%ld\n",
i, i0, i1);
if(reserveVecs)
sub_cloners[i].reserveVecs =
(reserveVecs + n - 1) / n;
......@@ -258,18 +261,19 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions {
idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained;
index_ivfpq->copy_subset_to(idx2, 2, i0, i1);
FAISS_ASSERT(idx2.ntotal == i1 - i0);
copy_ivf_shard (index_ivfpq, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) {
faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer, index->d,
index_ivfflat->nlist, index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe;
index_ivfflat->copy_subset_to(idx2, 2, i0, i1);
idx2.nprobe = index_ivfflat->nprobe;
copy_ivf_shard (index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
}
}
faiss::IndexShards *res =
new faiss::IndexShards(index->d, true, false);
......@@ -372,33 +376,26 @@ void GpuParameterSpace::initialize (const Index * index)
void GpuParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const
{
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexProxy)) {
for (int i = 0; i < ix->count(); i++)
set_index_parameter (ix->at(i), name, val);
return;
}
if (DC (faiss::IndexShards)) {
for (auto sub_index : ix->shard_indexes)
set_index_parameter (sub_index, name, val);
return;
}
if (name == "nprobe") {
DC (GpuIndexIVF);
FAISS_ASSERT(ix);
ix->setNumProbes (int (val));
return;
if (DC (GpuIndexIVF)) {
if (name == "nprobe") {
ix->setNumProbes (int (val));
return;
}
}
if (name == "use_precomputed_table") {
DC (GpuIndexIVFPQ);
FAISS_ASSERT(ix);
ix->setPrecomputedCodes(bool (val));
return;
if(DC (GpuIndexIVFPQ)) {
if (name == "use_precomputed_table") {
ix->setPrecomputedCodes(bool (val));
return;
}
}
FAISS_ASSERT_MSG (false, "unknown parameter");
// maybe normal index parameters apply?
ParameterSpace::set_index_parameter (index, name, val);
}
......
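GpuParameterSpace now delegates names it does not recognize to the CPU ParameterSpace instead of asserting. A hedged sketch of the common case, assuming a GPU build:

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(50000, d).astype('float32')
    xb = np.random.rand(100000, d).astype('float32')

    cpu_index = faiss.index_factory(d, "IVF1024,Flat")
    cpu_index.train(xt)
    cpu_index.add(xb)

    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)

    ps = faiss.GpuParameterSpace()
    ps.set_index_parameter(gpu_index, "nprobe", 32)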
......@@ -22,7 +22,9 @@ GpuClonerOptions::GpuClonerOptions()
}
GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false) {
: shard(false),
shard_type(1)
{
}
} } // namespace
......@@ -47,6 +47,9 @@ struct GpuMultipleClonerOptions : public GpuClonerOptions {
/// Whether to shard the index across GPUs, versus replication
/// across GPUs
bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
};
} } // namespace
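A hedged sketch of the new shard_type option via the Python helpers; a multi-GPU build is assumed and the sizes are arbitrary:

    import numpy as np
    import faiss

    d = 64
    xt = np.random.rand(100000, d).astype('float32')
    xb = np.random.rand(200000, d).astype('float32')

    cpu_index = faiss.index_factory(d, "IVF1024,PQ16")
    cpu_index.train(xt)
    cpu_index.add(xb)

    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    co.shard_type = 1     # 1: split ids by modulo (new default), 2: sequential ranges

    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co)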
......@@ -26,7 +26,7 @@ struct GpuIndexConfig {
/// GPU device on which the index is resident
int device;
/// What memory space to use for primary storae.
/// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU.
MemorySpace memorySpace;
......
......@@ -184,7 +184,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
}
index->quantizer = q;
index->quantizer_trains_alone = false;
index->quantizer_trains_alone = 0;
index->own_fields = true;
index->cp = this->cp;
index->ids.clear();
......
......@@ -96,7 +96,6 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
FAISS_ASSERT(index->pq.byte_per_idx == 1);
FAISS_ASSERT(index->by_residual);
FAISS_ASSERT(index->polysemous_ht == 0);
ivfpqConfig_.usePrecomputedTables = (bool) index->use_precomputed_table;
verifySettings_();
......
......@@ -31,11 +31,7 @@ void runL2Distance(GpuResources* resources,
Tensor<int, 2, true>& outIndices,
// Do we care about `outDistances`? If not, we can
// take shortcuts.
bool ignoreOutDistances = false,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
......@@ -45,11 +41,7 @@ void runIPDistance(GpuResources* resources,
Tensor<float, 2, true>& queries,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
// Hint to use a different sized tile for
// multi-streaming the queries. If <= 0, we use the
// default
int tileSizeOverride = -1);
Tensor<int, 2, true>& outIndices);
#ifdef FAISS_USE_FLOAT16
void runIPDistance(GpuResources* resources,
......@@ -59,8 +51,7 @@ void runIPDistance(GpuResources* resources,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
int tileSizeOverride = -1);
bool useHgemm);
void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& vectors,
......@@ -71,8 +62,7 @@ void runL2Distance(GpuResources* resources,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool useHgemm,
bool ignoreOutDistances = false,
int tileSizeOverride = -1);
bool ignoreOutDistances = false);
#endif
} } // namespace
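The tileSizeOverride hints are removed from the distance kernels above; query tiling for large flat datasets is now sized automatically. Nothing changes on the caller side; a hedged sketch of the brute-force GPU path this affects, assuming a GPU build:

    import numpy as np
    import faiss

    d = 96
    nb, nq = 1000000, 4096
    xb = np.random.rand(nb, d).astype('float32')
    xq = np.random.rand(nq, d).astype('float32')

    res = faiss.StandardGpuResources()
    index = faiss.GpuIndexFlatL2(res, d)
    index.add(xb)
    D, I = index.search(xq, 10)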
......@@ -114,8 +114,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
......@@ -127,7 +126,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
DeviceTensor<half, 2, true> outDistancesHalf(
mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance, tileSize);
query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
if (exactDistance) {
// Convert outDistances back
......@@ -145,8 +144,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
outDistances,
outIndices,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectors_,
......@@ -154,8 +152,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
input,
k,
outDistances,
outIndices,
tileSize);
outIndices);
}
}
}
......@@ -166,8 +163,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize) {
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
if (l2Distance_) {
......@@ -181,8 +177,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
outIndices,
useFloat16Accumulator_,
// FIXME
!exactDistance,
tileSize);
!exactDistance);
} else {
runIPDistance(resources_,
vectorsHalf_,
......@@ -191,8 +186,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
k,
outDistances,
outIndices,
useFloat16Accumulator_,
tileSize);
useFloat16Accumulator_);
}
}
#endif
......@@ -217,12 +211,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream);
stream,
true /* reserve exactly */);
#endif
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream);
stream,
true /* reserve exactly */);
}
num_ += numVecs;
......
......@@ -61,16 +61,14 @@ class FlatIndex {
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
int k,
Tensor<half, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance,
int tileSize = -1);
bool exactDistance);
#endif
/// Add vectors to ourselves; the pointer passed can be on the host
......
......@@ -195,10 +195,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
closestSubQDistanceView,
closestSubQIndexView,
// We don't care about distances
true,
// Much larger tile size, since these vectors are a
// lot smaller than query vectors
1024);
true);
}
// Now, we have the nearest sub-q centroid for each slice of the
......
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
......@@ -38,9 +35,9 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
auto queryId = blockIdx.y;
......
......@@ -10,10 +10,10 @@
#include "IVFUtils.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Limits.cuh"
#include "../utils/Select.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
#include <limits>
//
// This kernel is split into a separate compilation unit to cut down
......@@ -22,9 +22,6 @@
namespace faiss { namespace gpu {
constexpr auto kMax = std::numeric_limits<float>::max();
constexpr auto kMin = std::numeric_limits<float>::min();
// This is warp divergence central, but this is really a final step
// and happening a small number of times
inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
......@@ -71,7 +68,7 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
__shared__ float smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
constexpr auto kInit = Dir ? kMin : kMax;
constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
BlockSelect<float, int, Dir, Comparator<float>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(kInit, -1, smemK, smemV, k);
......
......@@ -31,28 +31,29 @@ namespace faiss { namespace gpu {
// T: the type we are doing the math in (e.g., float, half)
// TVec: the potentially vectorized type we are loading in (e.g.,
// float4, half2)
template <typename T, typename TVec,
template <typename T, typename TVec, typename TIndex,
int RowTileSize, bool NormLoop, bool NormSquared>
__global__ void l2Norm(Tensor<TVec, 2, true> input,
Tensor<T, 1, true> output) {
__global__ void l2Norm(Tensor<TVec, 2, true, TIndex> input,
Tensor<T, 1, true, TIndex> output) {
extern __shared__ char smemByte[]; // #warps * RowTileSize elements
T* smem = (T*) smemByte;
int numWarps = utils::divUp(blockDim.x, kWarpSize);
int laneId = getLaneId();
int warpId = threadIdx.x / kWarpSize;
TIndex numWarps = utils::divUp(blockDim.x, kWarpSize);
TIndex laneId = getLaneId();
TIndex warpId = threadIdx.x / kWarpSize;
bool lastRowTile = (blockIdx.x == (gridDim.x - 1));
int rowStart = RowTileSize * blockIdx.x;
TIndex rowStart = RowTileSize * blockIdx.x;
T rowNorm[RowTileSize];
if (lastRowTile) {
// We are handling the very end of the input matrix rows
for (int row = 0; row < input.getSize(0) - rowStart; ++row) {
for (TIndex row = 0; row < input.getSize(0) - rowStart; ++row) {
if (NormLoop) {
rowNorm[0] = Math<T>::zero();
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
TVec val = input[rowStart + row][col];
val = Math<TVec>::mul(val, val);
rowNorm[0] = Math<T>::add(rowNorm[0], Math<TVec>::reduceAdd(val));
......@@ -82,7 +83,8 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
rowNorm[row] = Math<T>::zero();
}
for (int col = threadIdx.x; col < input.getSize(1); col += blockDim.x) {
for (TIndex col = threadIdx.x;
col < input.getSize(1); col += blockDim.x) {
#pragma unroll
for (int row = 0; row < RowTileSize; ++row) {
tmp[row] = input[rowStart + row][col];
......@@ -172,44 +174,44 @@ __global__ void l2Norm(Tensor<TVec, 2, true> input,
}
}
template <typename T, typename TVec>
void runL2Norm(Tensor<T, 2, true>& input,
Tensor<T, 1, true>& output,
template <typename T, typename TVec, typename TIndex>
void runL2Norm(Tensor<T, 2, true, TIndex>& input,
Tensor<T, 1, true, TIndex>& output,
bool normSquared,
cudaStream_t stream) {
FAISS_ASSERT(input.getSize(0) == output.getSize(0));
int maxThreads = getMaxThreadsCurrentDevice();
TIndex maxThreads = (TIndex) getMaxThreadsCurrentDevice();
constexpr int rowTileSize = 8;
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \
if (normLoop) { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} else { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} \
#define RUN_L2(TYPE_T, TYPE_TVEC, INPUT) \
do { \
if (normLoop) { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, true, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} else { \
if (normSquared) { \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, true> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} else { \
l2Norm<TYPE_T, TYPE_TVEC, TIndex, rowTileSize, false, false> \
<<<grid, block, smem, stream>>>(INPUT, output); \
} \
} \
} while (0)
if (input.template canCastResize<TVec>()) {
// Can load using the vectorized type
auto inputV = input.template castResize<TVec>();
int dim = inputV.getSize(1);
auto dim = inputV.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -220,9 +222,9 @@ void runL2Norm(Tensor<T, 2, true>& input,
} else {
// Can't load using the vectorized type
int dim = input.getSize(1);
auto dim = input.getSize(1);
bool normLoop = dim > maxThreads;
int numThreads = min(dim, maxThreads);
auto numThreads = min(dim, maxThreads);
auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize));
auto block = dim3(numThreads);
......@@ -241,7 +243,13 @@ void runL2Norm(Tensor<float, 2, true>& input,
Tensor<float, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<float, float4>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<float, float4, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<float, float4, long>(inputCast, outputCast, normSquared, stream);
}
}
#ifdef FAISS_USE_FLOAT16
......@@ -249,7 +257,13 @@ void runL2Norm(Tensor<half, 2, true>& input,
Tensor<half, 1, true>& output,
bool normSquared,
cudaStream_t stream) {
runL2Norm<half, half2>(input, output, normSquared, stream);
if (input.canUseIndexType<int>()) {
runL2Norm<half, half2, int>(input, output, normSquared, stream);
} else {
auto inputCast = input.castIndexType<long>();
auto outputCast = output.castIndexType<long>();
runL2Norm<half, half2, long>(inputCast, outputCast, normSquared, stream);
}
}
#endif
......
......@@ -29,11 +29,14 @@ DEFINE_int32(num, 128, "# of vecs");
DEFINE_int32(dim, 128, "# of dimensions");
DEFINE_int32(num_queries, 3, "number of query vectors");
DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
DEFINE_bool(use_float16, false, "use encodings in float16 instead of float32");
DEFINE_bool(use_float16, false, "use encodings in float16");
DEFINE_bool(use_float16_math, false, "perform math in float16");
DEFINE_bool(transposed, false, "store vectors transposed");
DEFINE_int64(seed, -1, "specify random seed");
DEFINE_int32(num_gpus, 1, "number of gpus to use");
DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
using namespace faiss::gpu;
......@@ -72,7 +75,10 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config;
config.device = dev;
config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;
auto p = std::unique_ptr<faiss::gpu::GpuIndexFlatL2>(
new faiss::gpu::GpuIndexFlatL2(res, index.get(), config));
......@@ -90,9 +96,9 @@ int main(int argc, char** argv) {
HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
float cpuTime = 0.0f;
if (FLAGS_cpu) {
float cpuTime = 0.0f;
{
CpuTimer timer;
index->search(numQueries,
cpuQuery.data(),
......@@ -101,10 +107,9 @@ int main(int argc, char** argv) {
cpuIndices.data());
cpuTime = timer.elapsedMilliseconds();
printf("CPU time %.3f ms\n", cpuTime);
}
printf("CPU time %.3f ms\n", cpuTime);
HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
......@@ -131,14 +136,14 @@ int main(int argc, char** argv) {
CUDA_VERIFY(cudaProfilerStop());
printf("GPU time %.3f ms\n", gpuTime);
compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
if (FLAGS_cpu) {
compareLists(cpuDistances.data(), cpuIndices.data(),
gpuDistances.data(), gpuIndices.data(),
numQueries, FLAGS_k,
"", true, FLAGS_diff, false);
}
CUDA_VERIFY(cudaDeviceSynchronize());
// printf("\ncudaMalloc usage %zd\n",
// resources.getMemoryManager().getHighWaterCudaMalloc());
return 0;
}
......@@ -21,29 +21,47 @@
constexpr float kF16MaxRelErr = 0.07f;
constexpr float kF32MaxRelErr = 6e-3f;
void testFlat(bool useL2,
bool useFloat16,
bool useTransposed,
int kOverride = -1) {
int numVecs = faiss::gpu::randVal(1000, 20000);
struct TestFlatOptions {
TestFlatOptions()
: useL2(true),
useFloat16(false),
useTransposed(false),
numVecsOverride(-1),
numQueriesOverride(-1),
kOverride(-1) {
}
bool useL2;
bool useFloat16;
bool useTransposed;
int numVecsOverride;
int numQueriesOverride;
int kOverride;
};
void testFlat(const TestFlatOptions& opt) {
int numVecs = opt.numVecsOverride > 0 ?
opt.numVecsOverride : faiss::gpu::randVal(1000, 20000);
int dim = faiss::gpu::randVal(50, 800);
int numQuery = faiss::gpu::randVal(1, 512);
int numQuery = opt.numQueriesOverride > 0 ?
opt.numQueriesOverride : faiss::gpu::randVal(1, 512);
// Due to loss of precision in a float16 accumulator, for large k,
// the number of differences is pretty huge. Restrict ourselves to a
// fairly small `k` for float16
int k = useFloat16 ?
int k = opt.useFloat16 ?
std::min(faiss::gpu::randVal(1, 50), numVecs) :
std::min(faiss::gpu::randVal(1, 1024), numVecs);
if (kOverride > 0) {
k = kOverride;
if (opt.kOverride > 0) {
k = opt.kOverride;
}
faiss::IndexFlatIP cpuIndexIP(dim);
faiss::IndexFlatL2 cpuIndexL2(dim);
faiss::IndexFlat* cpuIndex =
useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : (faiss::IndexFlat*) &cpuIndexIP;
opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 :
(faiss::IndexFlat*) &cpuIndexIP;
// Construct on a random device to test multi-device, if we have
// multiple devices
......@@ -55,14 +73,14 @@ void testFlat(bool useL2,
faiss::gpu::GpuIndexFlatConfig config;
config.device = device;
config.useFloat16 = useFloat16;
config.storeTransposed = useTransposed;
config.useFloat16 = opt.useFloat16;
config.storeTransposed = opt.useTransposed;
faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config);
faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
faiss::gpu::GpuIndexFlat* gpuIndex =
useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 :
(faiss::gpu::GpuIndexFlat*) &gpuIndexIP;
std::vector<float> vecs = faiss::gpu::randVecs(numVecs, dim);
......@@ -70,37 +88,53 @@ void testFlat(bool useL2,
gpuIndex->add(numVecs, vecs.data());
std::stringstream str;
str << (useL2 ? "L2" : "IP") << " numVecs " << numVecs
str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs
<< " dim " << dim
<< " useFloat16 " << useFloat16
<< " transposed " << useTransposed
<< " useFloat16 " << opt.useFloat16
<< " transposed " << opt.useTransposed
<< " numQuery " << numQuery
<< " k " << k;
// To some extent, we depend upon the relative error for the test
// for float16
faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(),
useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
useFloat16 ? 0.99f : 0.1f,
useFloat16 ? 0.65f : 0.015f);
opt.useFloat16 ? 0.99f : 0.1f,
opt.useFloat16 ? 0.65f : 0.015f);
}
TEST(TestGpuIndexFlat, IP_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, false, false);
testFlat(false, false, true);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float32) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false);
testFlat(true, false, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -108,24 +142,46 @@ TEST(TestGpuIndexFlat, L2_Float32) {
TEST(TestGpuIndexFlat, L2_Float32_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, false, false, 1);
testFlat(true, false, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, IP_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(false, true, false);
testFlat(false, true, false);
TestFlatOptions opt;
opt.useL2 = false;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
TEST(TestGpuIndexFlat, L2_Float16) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false);
testFlat(true, true, true);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......@@ -133,8 +189,33 @@ TEST(TestGpuIndexFlat, L2_Float16) {
TEST(TestGpuIndexFlat, L2_Float16_K1) {
for (int tries = 0; tries < 5; ++tries) {
faiss::gpu::newTestSeed();
testFlat(true, true, false, 1);
testFlat(true, true, true, 1);
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = true;
opt.useTransposed = false;
opt.kOverride = 1;
testFlat(opt);
}
}
// test tiling along a huge vector set
TEST(TestGpuIndexFlat, L2_Tiling) {
for (int tries = 0; tries < 3; ++tries) {
faiss::gpu::newTestSeed();
TestFlatOptions opt;
opt.useL2 = true;
opt.useFloat16 = false;
opt.useTransposed = false;
opt.numVecsOverride = 1000000;
opt.numQueriesOverride = 8;
testFlat(opt);
opt.useTransposed = true;
testFlat(opt);
}
}
......
......@@ -14,6 +14,7 @@
#include "../StandardGpuResources.h"
#include "../utils/DeviceUtils.h"
#include "../test/TestUtils.h"
#include <cmath>
#include <gtest/gtest.h>
#include <glog/logging.h>
#include <sstream>
......@@ -390,6 +391,68 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
copyToTest(false, false);
}
TEST(TestGpuIndexIVFFlat, Float32_negative) {
faiss::gpu::newTestSeed();
Options opt;
auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
// Put all vecs on negative side
for (auto& f : trainVecs) {
f = std::abs(f) * -1.0f;
}
for (auto& f : addVecs) {
f *= std::abs(f) * -1.0f;
}
faiss::IndexFlatIP quantizerIP(opt.dim);
faiss::Index* quantizer = (faiss::Index*) &quantizerIP;
faiss::IndexIVFFlat cpuIndex(quantizer,
opt.dim, opt.numCentroids,
faiss::METRIC_INNER_PRODUCT);
cpuIndex.train(opt.numTrain, trainVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.nprobe = opt.nprobe;
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFFlatConfig config;
config.device = opt.device;
config.indicesOptions = opt.indicesOpt;
faiss::gpu::GpuIndexIVFFlat gpuIndex(&res,
cpuIndex.d,
cpuIndex.nlist,
cpuIndex.metric_type,
config);
gpuIndex.copyFrom(&cpuIndex);
gpuIndex.setNumProbes(opt.nprobe);
// Construct a positive test set
auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
// Put all vecs on positive side
for (auto& f : queryVecs) {
f = std::abs(f);
}
bool compFloat16 = false;
faiss::gpu::compareIndices(queryVecs,
cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
compFloat16 ? kF16MaxRelErr : kF32MaxRelErr,
// FIXME: the fp16 bounds are
// useless when math (the accumulator) is
// in fp16. Figure out another way to test
compFloat16 ? 0.99f : 0.1f,
compFloat16 ? 0.65f : 0.015f);
}
//
// NaN tests
//
......
......@@ -64,24 +64,23 @@ std::vector<float> randVecs(size_t num, size_t dim) {
return v;
}
void compareIndices(faiss::Index& refIndex,
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queries = faiss::gpu::randVecs(numQuery, dim);
// Compare
std::vector<float> refDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
refIndex.search(numQuery, queries.data(),
refIndex.search(numQuery, queryVecs.data(),
k, refDistance.data(), refIndices.data());
std::vector<float> testDistance(numQuery * k, 0);
std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
testIndex.search(numQuery, queries.data(),
testIndex.search(numQuery, queryVecs.data(),
k, testDistance.data(), testIndices.data());
faiss::gpu::compareLists(refDistance.data(),
......@@ -94,6 +93,25 @@ void compareIndices(faiss::Index& refIndex,
maxRelativeError, pctMaxDiff1, pctMaxDiffN);
}
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError,
float pctMaxDiff1,
float pctMaxDiffN) {
auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
compareIndices(queryVecs,
refIndex,
testIndex,
numQuery, dim, k,
configMsg,
maxRelativeError,
pctMaxDiff1,
pctMaxDiffN);
}
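// Minimal usage sketch (hypothetical names and values): tests that need a
// controlled query distribution can build their own queries and call the
// overload above directly, e.g.
//   auto queries = faiss::gpu::randVecs(numQuery, dim);
//   for (auto& f : queries) { f = std::abs(f); }
//   compareIndices(queries, cpuIndex, gpuIndex,
//                  numQuery, dim, k, "my config", 6e-5f, 0.1f, 0.015f);
// while existing callers keep using the random-query wrapper.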
template <typename T>
inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
return p[i * dim2 + j];
......
......@@ -56,7 +56,19 @@ T randSelect(std::initializer_list<T> vals) {
/// Generates a collection of random vectors in the range [0, 1]
std::vector<float> randVecs(size_t num, size_t dim);
/// Compare two indices via query for similarity
/// Compare two indices via query for similarity, with a user-specified set of
/// query vectors
void compareIndices(const std::vector<float>& queryVecs,
faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
const std::string& configMsg,
float maxRelativeError = 6e-5f,
float pctMaxDiff1 = 0.1f,
float pctMaxDiffN = 0.005f);
/// Compare two indices via query for similarity, generating random query
/// vectors
void compareIndices(faiss::Index& refIndex,
faiss::Index& testIndex,
int numQuery, int dim, int k,
......
......@@ -38,14 +38,14 @@ def search_index_pytorch(index, x, k, D=None, I=None):
assert I.__class__ in (torch.LongTensor, torch.cuda.LongTensor)
assert I.size() == (n, k)
assert I.is_contiguous()
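# synchronize so that any pending GPU work that produced x, D and I has
# completed before faiss reads the raw storage pointers below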
torch.cuda.synchronize()
xptr = x.storage().data_ptr()
Iptr = I.storage().data_ptr()
Dptr = D.storage().data_ptr()
index.search_c(n, faiss.cast_integer_to_float_ptr(xptr),
k, faiss.cast_integer_to_float_ptr(Dptr),
faiss.cast_integer_to_long_ptr(Iptr))
torch.cuda.synchronize()
return D, I
......
......@@ -37,9 +37,9 @@ BLOCK_SELECT_DECL(float, false, 512);
BLOCK_SELECT_DECL(float, false, 1024);
void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
......@@ -77,4 +77,46 @@ void runBlockSelect(Tensor<float, 2, true>& in,
}
}
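// Same k-based dispatch as runBlockSelect above, but for the pair variant:
// the per-element indices are supplied by the caller in inV instead of being
// derived from the column position.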
void runBlockSelectPair(Tensor<float, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<float, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(float, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(float, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(float, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(float, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(float, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(float, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(float, false, 1024);
}
}
}
} } // namespace
......@@ -39,9 +39,9 @@ BLOCK_SELECT_DECL(half, false, 512);
BLOCK_SELECT_DECL(half, false, 1024);
void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
......@@ -79,6 +79,48 @@ void runBlockSelect(Tensor<half, 2, true>& in,
}
}
void runBlockSelectPair(Tensor<half, 2, true>& inK,
Tensor<int, 2, true>& inV,
Tensor<half, 2, true>& outK,
Tensor<int, 2, true>& outV,
bool dir, int k, cudaStream_t stream) {
FAISS_ASSERT(k <= 1024);
if (dir) {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, true, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, true, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, true, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, true, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, true, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, true, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, true, 1024);
}
} else {
if (k == 1) {
BLOCK_SELECT_PAIR_CALL(half, false, 1);
} else if (k <= 32) {
BLOCK_SELECT_PAIR_CALL(half, false, 32);
} else if (k <= 64) {
BLOCK_SELECT_PAIR_CALL(half, false, 64);
} else if (k <= 128) {
BLOCK_SELECT_PAIR_CALL(half, false, 128);
} else if (k <= 256) {
BLOCK_SELECT_PAIR_CALL(half, false, 256);
} else if (k <= 512) {
BLOCK_SELECT_PAIR_CALL(half, false, 512);
} else if (k <= 1024) {
BLOCK_SELECT_PAIR_CALL(half, false, 1024);
}
}
}
#endif
} } // namespace
......@@ -32,7 +32,7 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
......@@ -62,16 +62,79 @@ __global__ void blockSelect(Tensor<K, 2, true> in,
}
}
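// blockSelectPair: like blockSelect above, but the index written out for each
// key is read from the caller-provided inV tensor rather than being the column
// position; one thread block still handles one row.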
template <typename K,
typename IndexType,
bool Dir,
int NumWarpQ,
int NumThreadQ,
int ThreadsPerBlock>
__global__ void blockSelectPair(Tensor<K, 2, true> inK,
Tensor<IndexType, 2, true> inV,
Tensor<K, 2, true> outK,
Tensor<IndexType, 2, true> outV,
K initK,
IndexType initV,
int k) {
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ K smemK[kNumWarps * NumWarpQ];
__shared__ IndexType smemV[kNumWarps * NumWarpQ];
BlockSelect<K, IndexType, Dir, Comparator<K>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, initV, smemK, smemV, k);
// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;
K* inKStart = inK[row][i].data();
IndexType* inVStart = inV[row][i].data();
// Whole warps must participate in the selection
int limit = utils::roundDown(inK.getSize(1), kWarpSize);
for (; i < limit; i += ThreadsPerBlock) {
heap.add(*inKStart, *inVStart);
inKStart += ThreadsPerBlock;
inVStart += ThreadsPerBlock;
}
// Handle last remainder fraction of a warp of elements
if (i < inK.getSize(1)) {
heap.addThreadQ(*inKStart, *inVStart);
}
heap.reduce();
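// write the final k selected (key, index) pairs for this row from shared memory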
for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
outK[row][i] = smemK[i];
outV[row][i] = smemV[i];
}
}
void runBlockSelect(Tensor<float, 2, true>& in,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<float, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16
void runBlockSelect(Tensor<half, 2, true>& in,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
Tensor<int, 2, true>& inIndices,
Tensor<half, 2, true>& outKeys,
Tensor<int, 2, true>& outIndices,
bool dir, int k, cudaStream_t stream);
#endif
} } // namespace
......@@ -12,37 +12,37 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner),
space_(MemorySpace::Device) {
this->operator=(std::move(t));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t) {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
if (this->state_ == AllocState::Owner) {
CUDA_VERIFY(cudaFree(this->data_));
}
this->Tensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
this->Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
std::move(t));
this->state_ = t.state_; t.state_ = AllocState::NotOwner;
......@@ -52,10 +52,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::operator=(
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~DeviceTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
CUDA_VERIFY(cudaFree(this->data_));
......@@ -66,13 +66,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::~DeviceTensor() {
// destructor will return the reservation
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -80,13 +80,13 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
FAISS_ASSERT(this->data_ || (this->getSizeInBytes() == 0));
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner),
space_(space) {
......@@ -95,15 +95,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
const IndexT sizes[Dim],
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -116,15 +116,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
}
// memory reservation constructor
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
std::initializer_list<IndexT> sizes,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Reservation),
space_(space) {
......@@ -136,51 +136,51 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
reservation_ = std::move(memory);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim],
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner),
space_(space) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner),
space_(space) {
......@@ -189,15 +189,15 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::DeviceTensor(
DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Reservation),
space_(space) {
......@@ -211,10 +211,10 @@ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::DeviceTensor(
this->copyFrom(t, stream);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>::zero(
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero(
cudaStream_t stream) {
if (this->data_) {
// Region must be contiguous
......
......@@ -18,10 +18,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class DeviceTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -33,11 +33,11 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
__host__ ~DeviceTensor();
/// Move constructor
__host__ DeviceTensor(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Move assignment
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&& t);
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
operator=(DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);
/// Constructs a tensor of the given size, allocating memory for it
/// locally
......@@ -76,19 +76,19 @@ class DeviceTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, allocating memory for it locally
__host__ DeviceTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ DeviceTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Copies a tensor into ourselves, reserving a temporary
/// memory reservation via a memory manager.
__host__ DeviceTensor(DeviceMemory& m,
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream,
MemorySpace space = MemorySpace::Device);
/// Call to zero out memory
__host__ DeviceTensor<T, Dim, Contig, IndexT, PtrTraits>&
__host__ DeviceTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
zero(cudaStream_t stream);
private:
......
......@@ -43,7 +43,7 @@ void synchronizeAllDevices() {
}
}
cudaDeviceProp& getDeviceProperties(int device) {
const cudaDeviceProp& getDeviceProperties(int device) {
static std::mutex mutex;
static std::unordered_map<int, cudaDeviceProp> properties;
......@@ -61,6 +61,10 @@ cudaDeviceProp& getDeviceProperties(int device) {
return it->second;
}
const cudaDeviceProp& getCurrentDeviceProperties() {
return getDeviceProperties(getCurrentDevice());
}
int getMaxThreads(int device) {
return getDeviceProperties(device).maxThreadsPerBlock;
}
......
......@@ -31,7 +31,10 @@ int getNumDevices();
void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device);
const cudaDeviceProp& getDeviceProperties(int device);
/// Returns the cached cudaDeviceProp for the current device
const cudaDeviceProp& getCurrentDeviceProperties();
/// Returns the maximum number of threads available for the given GPU
/// device
......
......@@ -10,18 +10,18 @@
namespace faiss { namespace gpu {
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(),
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor() :
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::~HostTensor() {
if (state_ == AllocState::Owner) {
FAISS_ASSERT(this->data_ != nullptr);
delete[] this->data_;
......@@ -29,67 +29,67 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::~HostTensor() {
}
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, sizes),
state_(AllocState::Owner) {
this->data_ = new T[this->numElements()];
FAISS_ASSERT(this->data_ != nullptr);
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
std::initializer_list<IndexT> sizes) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
DataPtrType data,
const IndexT sizes[Dim],
const IndexT strides[Dim]) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(data, sizes, strides),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(data, sizes, strides),
state_(AllocState::NotOwner) {
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::HostTensor(
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream) :
Tensor<T, Dim, Contig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
Tensor<T, Dim, InnerContig, IndexT, PtrTraits>(nullptr, t.sizes(), t.strides()),
state_(AllocState::Owner) {
// Only contiguous arrays handled for now
FAISS_ASSERT(t.isContiguous());
......@@ -99,10 +99,10 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::HostTensor(
}
/// Call to zero out memory
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>&
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>&
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::zero() {
// Region must be contiguous
FAISS_ASSERT(this->isContiguous());
......@@ -113,17 +113,17 @@ HostTensor<T, Dim, Contig, IndexT, PtrTraits>::zero() {
return *this;
}
template <typename T, int Dim, bool Contig,
template <typename T, int Dim, bool InnerContig,
typename IndexT, template <typename U> class PtrTraits>
__host__ T
HostTensor<T, Dim, Contig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>::maxDiff(
const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
auto size = this->numElements();
FAISS_ASSERT(size == t.numElements());
FAISS_ASSERT(size > 0);
if (Contig) {
if (InnerContig) {
auto a = this->data();
auto b = t.data();
......
......@@ -16,10 +16,10 @@ namespace faiss { namespace gpu {
template <typename T,
int Dim,
bool Contig = false,
bool InnerContig = false,
typename IndexT = int,
template <typename U> class PtrTraits = traits::DefaultPtrTraits>
class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
class HostTensor : public Tensor<T, Dim, InnerContig, IndexT, PtrTraits> {
public:
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
......@@ -51,19 +51,19 @@ class HostTensor : public Tensor<T, Dim, Contig, IndexT, PtrTraits> {
/// Copies a tensor into ourselves, allocating memory for it
/// locally. If the tensor is on the GPU, then we will copy it to
/// ourselves wrt the given stream.
__host__ HostTensor(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
__host__ HostTensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t,
cudaStream_t stream);
/// Call to zero out memory
__host__ HostTensor<T, Dim, Contig, IndexT, PtrTraits>& zero();
__host__ HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& zero();
/// Returns the maximum difference seen between two tensors
__host__ T
maxDiff(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const;
maxDiff(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const;
/// Are the two tensors exactly equal?
__host__ bool
equal(const HostTensor<T, Dim, Contig, IndexT, PtrTraits>& t) const {
equal(const HostTensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) const {
return (maxDiff(t) == (T) 0);
}
......
......@@ -24,11 +24,12 @@ struct Limits {
// constexpr constructor for half
// FIXME: faiss CPU uses +/-FLT_MAX instead of +/-infinity
constexpr float kFloatMax = std::numeric_limits<float>::max();
constexpr float kFloatMin = std::numeric_limits<float>::lowest();
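// note: lowest() is the most negative finite float; numeric_limits<float>::min()
// would be the smallest positive normalized value, which is not what we want here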
template <>
struct Limits<float> {
static __device__ __host__ inline float getMin() {
return -kFloatMax;
return kFloatMin;
}
static __device__ __host__ inline float getMax() {
return kFloatMax;
......@@ -55,8 +56,8 @@ struct Limits<half> {
#endif // FAISS_USE_FLOAT16
constexpr int kIntMin = std::numeric_limits<int>::min();
constexpr int kIntMax = std::numeric_limits<int>::max();
constexpr int kIntMin = std::numeric_limits<int>::lowest();
template <>
struct Limits<int> {
......
......@@ -112,6 +112,10 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
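// The innermost dimension must be packed for GEMM; the outer (row) stride may
// be larger than the row length, which is why the leading dimensions below are
// taken from the strides rather than the sizes.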
// Now, we have to represent the matrix multiplication in
// column-major layout
T* pA = transC ? a.data() : b.data();
......@@ -122,9 +126,9 @@ runMatrixMult(Tensor<T, 2, true>& c, bool transC,
int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getSize(1) : b.getSize(1);
int ldb = transC ? b.getSize(1) : a.getSize(1);
int ldc = c.getSize(1);
int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -238,9 +242,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
int n = c.getSize(1); // other size
int k = transA ? a.getSize(1) : a.getSize(2);
int lda = transC ? a.getSize(2) : b.getSize(2);
int ldb = transC ? b.getSize(2) : a.getSize(2);
int ldc = c.getSize(2);
int lda = transC ? a.getStride(1) : b.getStride(1);
int ldb = transC ? b.getStride(1) : a.getStride(1);
int ldc = c.getStride(1);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
......@@ -254,9 +258,9 @@ runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
HostTensor<float*, 1, true> hostB({b.getSize(0)});
HostTensor<float*, 1, true> hostC({c.getSize(0)});
size_t aOffset = a.getSize(1) * a.getSize(2);
size_t bOffset = b.getSize(1) * b.getSize(2);
size_t cOffset = c.getSize(1) * c.getSize(2);
size_t aOffset = a.getStride(0);
size_t bOffset = b.getStride(0);
size_t cOffset = c.getStride(0);
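// the distance between consecutive batch matrices is the outer stride, which
// may exceed getSize(1) * getSize(2) for padded tensors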
for (int i = 0; i < a.getSize(0); ++i) {
hostA[i] = transC ? a.data() + i * aOffset : b.data() + i * bOffset;
......
......@@ -16,7 +16,7 @@
namespace faiss { namespace gpu {
template <int Dim, bool Contig = false, typename IndexT = int>
template <int Dim, bool InnerContig = false, typename IndexT = int>
class NoTypeTensor {
public:
NoTypeTensor()
......@@ -25,7 +25,7 @@ class NoTypeTensor {
}
template <typename T>
NoTypeTensor(Tensor<T, Dim, Contig, IndexT>& t)
NoTypeTensor(Tensor<T, Dim, InnerContig, IndexT>& t)
: mem_(t.data()),
typeSize_(sizeof(T)) {
for (int i = 0; i < Dim; ++i) {
......@@ -87,13 +87,14 @@ class NoTypeTensor {
}
template <typename T>
Tensor<T, Dim, Contig, IndexT> toTensor() {
Tensor<T, Dim, InnerContig, IndexT> toTensor() {
FAISS_ASSERT(sizeof(T) == typeSize_);
return Tensor<T, Dim, Contig, IndexT>((T*) mem_, size_, stride_);
return Tensor<T, Dim, InnerContig, IndexT>((T*) mem_, size_, stride_);
}
NoTypeTensor<Dim, Contig, IndexT> narrowOutermost(IndexT start, IndexT size) {
NoTypeTensor<Dim, InnerContig, IndexT> narrowOutermost(IndexT start,
IndexT size) {
char* newPtr = (char*) mem_;
if (start > 0) {
......@@ -110,7 +111,7 @@ class NoTypeTensor {
}
}
return NoTypeTensor<Dim, Contig, IndexT>(
return NoTypeTensor<Dim, InnerContig, IndexT>(
newPtr, typeSize_, newSize, stride_);
}
......
......@@ -19,26 +19,26 @@
namespace faiss { namespace gpu {
template <typename T>
template <typename T, typename IndexT>
struct TensorInfo {
static constexpr int kMaxDims = 8;
T* data;
int sizes[kMaxDims];
int strides[kMaxDims];
IndexT sizes[kMaxDims];
IndexT strides[kMaxDims];
int dims;
};
template <typename T, int Dim>
template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset {
__device__ inline static unsigned int get(const TensorInfo<T>& info,
unsigned int linearId) {
unsigned int offset = 0;
__device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
IndexT linearId) {
IndexT offset = 0;
#pragma unroll
for (int i = Dim - 1; i >= 0; --i) {
unsigned int curDimIndex = linearId % info.sizes[i];
unsigned int curDimOffset = curDimIndex * info.strides[i];
IndexT curDimIndex = linearId % info.sizes[i];
IndexT curDimOffset = curDimIndex * info.strides[i];
offset += curDimOffset;
......@@ -51,21 +51,21 @@ struct TensorInfoOffset {
}
};
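// Worked example (assuming the elided step divides linearId by sizes[i] on
// each iteration): for sizes = {4, 5} and strides = {8, 1}, linearId 7 maps to
// coordinate (1, 2) and hence offset 1 * 8 + 2 * 1 = 10.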
template <typename T>
struct TensorInfoOffset<T, -1> {
__device__ inline static unsigned int get(const TensorInfo<T>& info,
unsigned int linearId) {
template <typename T, typename IndexT>
struct TensorInfoOffset<T, IndexT, -1> {
__device__ inline static unsigned int get(const TensorInfo<T, IndexT>& info,
IndexT linearId) {
return linearId;
}
};
template <typename T, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T> info;
template <typename T, typename IndexT, int Dim>
TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
TensorInfo<T, IndexT> info;
for (int i = 0; i < Dim; ++i) {
info.sizes[i] = t.getSize(i);
info.strides[i] = t.getStride(i);
info.sizes[i] = (IndexT) t.getSize(i);
info.strides[i] = (IndexT) t.getStride(i);
}
info.data = t.data();
......@@ -74,26 +74,22 @@ TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
return info;
}
template <typename T, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input,
TensorInfo<T> output,
unsigned int totalSize) {
auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
if (linearThreadId >= totalSize) {
return;
}
auto inputOffset =
TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
auto outputOffset =
TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
TensorInfo<T, IndexT> output,
IndexT totalSize) {
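// grid-stride loop: each thread strides over the flattened index space by
// gridDim.x * blockDim.x, so the capped 4096-block grid launched below still
// covers arbitrarily large tensors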
for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
i < totalSize;
i += gridDim.x * blockDim.x) {
auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
#if __CUDA_ARCH__ >= 350
output.data[outputOffset] = __ldg(&input.data[inputOffset]);
output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
output.data[outputOffset] = input.data[inputOffset];
output.data[outputOffset] = input.data[inputOffset];
#endif
}
}
/// Performs an out-of-place transposition between any two dimensions.
......@@ -110,7 +106,8 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
int dim1, int dim2,
Tensor<T, Dim, true>& out,
cudaStream_t stream) {
static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");
static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
"too many dimensions");
FAISS_ASSERT(dim1 != dim2);
FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
......@@ -127,20 +124,33 @@ void runTransposeAny(Tensor<T, Dim, true>& in,
FAISS_ASSERT(out.getSize(i) == outSize[i]);
}
auto inInfo = getTensorInfo<T, Dim>(in);
auto outInfo = getTensorInfo<T, Dim>(out);
size_t totalSize = in.numElements();
size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);
if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
// div/mod seems faster with unsigned types
auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
int totalSize = in.numElements();
transposeAny<T, unsigned int, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
} else {
auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize);
auto grid = dim3(utils::divUp(totalSize, numThreads));
auto block = dim3(numThreads);
std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>(
inInfo, outInfo, totalSize);
auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);
transposeAny<T, unsigned long, Dim, -1>
<<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
}
CUDA_TEST_ERROR();
}
......
......@@ -222,7 +222,6 @@ void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
}
static void write_ivf_header (const IndexIVF * ivf, FILE *f,
bool include_ids = true) {
write_index_header (ivf, f);
......@@ -445,6 +444,7 @@ static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
READVECTOR (ivsc->trained);
}
ProductQuantizer * read_ProductQuantizer (const char*fname) {
FILE *f = fopen (fname, "r");
FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading", fname);
......@@ -676,8 +676,8 @@ Index *read_index (FILE * f, bool try_mmap) {
}
idx = idxmap;
} else {
fprintf (stderr, "Index type 0x%08x not supported\n", h);
abort ();
FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
idx = nullptr;
}
return idx;
}
......
......@@ -42,6 +42,42 @@ class TestRemove(unittest.TestCase):
else:
assert False, 'should have raised an exception'
def test_remove_id_map_2(self):
# from https://github.com/facebookresearch/faiss/issues/255
rs = np.random.RandomState(1234)
X = rs.randn(10, 10).astype(np.float32)
idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
remove_set = np.array([10, 30], dtype=np.int64)
index = faiss.index_factory(10, 'IDMap,Flat')
index.add_with_ids(X[:5, :], idx[:5])
index.remove_ids(remove_set)
index.add_with_ids(X[5:, :], idx[5:])
print (index.search(X, 1))
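# the removed ids (10, 30) must no longer be returned; every other vector
# should still map back to its own id after the second add_with_ids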
for i in range(10):
_, searchres = index.search(X[i:i + 1, :], 1)
if idx[i] in remove_set:
assert searchres[0] != idx[i]
else:
assert searchres[0] == idx[i]
class TestRangeSearch(unittest.TestCase):
def test_range_search_id_map(self):
sub_index = faiss.IndexFlat(5, 1) # L2 search instead of inner product
xb = np.zeros((10, 5), dtype='float32')
xb[:, 0] = np.arange(10) + 1000
index = faiss.IndexIDMap2(sub_index)
index.add_with_ids(xb, np.arange(10) + 100)
dist = float(np.linalg.norm(xb[3] - xb[0])) * 0.99
res_subindex = sub_index.range_search(xb[[0], :], dist)
res_index = index.range_search(xb[[0], :], dist)
assert len(res_subindex[2]) == 2
np.testing.assert_array_equal(res_subindex[2] + 100, res_index[2])
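# xb[k][0] = 1000 + k, so only xb[0] and xb[1] fall within the radius; the
# IDMap2 results should be the sub-index results shifted by the +100 ids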
class TestUpdate(unittest.TestCase):
......