Commit fe446e41
authored Nov 23, 2017 by matthijs
parent 250a3d3f

bugfixes
Showing 21 changed files with 243 additions and 154 deletions (+243 −154)
AutoTune.cpp                                  +13 −16
IndexPQ.cpp                                   +15 −5
gpu/GpuAutoTune.cpp                           +5  −5
gpu/GpuIndexIVF.cu                            +3  −2
gpu/GpuIndexIVFPQ.cu                          +1  −1
gpu/impl/IVFFlat.cu                           +3  −4
gpu/impl/PQScanMultiPassNoPrecomputed.cu      +0  −1
gpu/test/test_gpu_index.py                    +8  −6
gpu/utils/DeviceDefs.cuh                      +7  −29
gpu/utils/Float16.cu                          +8  −2
gpu/utils/Float16.cuh                         +1  −1
gpu/utils/Limits.cuh                          +6  −0
gpu/utils/LoadStoreOperators.cuh              +30 −2
gpu/utils/MathOperators.cuh                   +4  −0
gpu/utils/Select.cuh                          +16 −2
gpu/utils/Tensor-inl.cuh                      +31 −0
gpu/utils/Tensor.cuh                          +3  −5
gpu/utils/WarpShuffles.cuh                    +26 −5
python/demo_ivfpq.py (deleted)                +0  −61
tests/demo_auto_tune.py (moved from python/)  +13 −7
tests/test_factory.py (new)                   +50 −0
AutoTune.cpp

@@ -431,7 +431,8 @@ void ParameterSpace::set_index_parameter (
         // and fall through to also enable it on sub-indexes
     }
     if (DC (IndexPreTransform)) {
-        index = ix->index;
+        set_index_parameter (ix->index, name, val);
+        return;
     }
     if (DC (IndexShards)) {
         // call on all sub-indexes
@@ -440,30 +441,28 @@ void ParameterSpace::set_index_parameter (
         }
         return;
     }
-    if (name == "verbose") {
-        index->verbose = int(val); // in case it was an IndexPreTransform
-    }
     if (DC (IndexRefineFlat)) {
         if (name == "k_factor_rf") {
             ix->k_factor = int(val);
             return;
         }
-        index = ix->base_index;
-    }
-    if (DC (IndexPreTransform)) {
-        index = ix->index;
+        // otherwise it is for the sub-index
+        set_index_parameter (&ix->refine_index, name, val);
+        return;
     }
     if (name == "verbose") {
         index->verbose = int(val);
         return; // last verbose that we could find
     }
     if (name == "nprobe") {
         if (DC (IndexIVF)) {
             ix->nprobe = int(val);
             return;
         }
     }
     if (name == "ht") {
         if (DC (IndexPQ)) {
             if (val >= ix->pq.code_size * 8) {
@@ -685,7 +684,6 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
          tok;
          tok = strtok_r (nullptr, " ,", &ptr)) {
         int d_out, opq_M, nbit, M, M2;
-        char option[100];
         std::string stok(tok);

         // to avoid mem leaks with exceptions:
@@ -776,10 +774,9 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
             del_coarse_quantizer.release ();
             index_ivf->own_fields = true;
             index_1 = index_ivf;
-        } else if (!index && sscanf (tok, "PQ%d%10s", &M, option) == 2) {
-            std::string soption = option;
-            FAISS_THROW_IF_NOT (soption == "" || soption == "np");
+        } else if (!index && (sscanf (tok, "PQ%d", &M) == 1 ||
+                              sscanf (tok, "PQ%dnp", &M) == 1)) {
+            // np to disable polysemous training
+            bool do_polysemous_training = stok.find("np") == std::string::npos;
             if (coarse_quantizer) {
                 IndexIVFPQ *index_ivf = new IndexIVFPQ (
                     coarse_quantizer, d, ncentroids, M, 8);
@@ -789,11 +786,11 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                 index_ivf->cp.spherical = metric == METRIC_INNER_PRODUCT;
                 del_coarse_quantizer.release ();
                 index_ivf->own_fields = true;
-                index_ivf->do_polysemous_training = soption != "np";
+                index_ivf->do_polysemous_training = do_polysemous_training;
                 index_1 = index_ivf;
             } else {
                 IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
-                index_pq->do_polysemous_training = soption != "np";
+                index_pq->do_polysemous_training = do_polysemous_training;
                 index_1 = index_pq;
             }
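For reference, a minimal host-side sketch of the user-visible effect of these AutoTune.cpp changes: the "np" suffix on a PQ factory token disables polysemous training, and set_index_parameter now recurses through an IndexPreTransform wrapper. This is not code from the commit; the include paths assume a standard faiss install where the headers live under faiss/.

// Sketch, not part of the commit: new factory syntax and parameter routing.
#include <faiss/AutoTune.h>
#include <faiss/Index.h>

int main() {
    int d = 64;  // vector dimension, divisible by the number of sub-quantizers

    // default: polysemous training enabled
    faiss::Index* pq = faiss::index_factory(d, "PQ16");
    // "np" suffix, parsed by the new sscanf branch: polysemous training off
    faiss::Index* pq_np = faiss::index_factory(d, "PQ16np");

    // after this commit, a wrapping IndexPreTransform is handled by
    // recursing into its sub-index rather than by rebinding a local pointer
    faiss::ParameterSpace ps;
    ps.set_index_parameter(pq, "verbose", 1);

    delete pq;
    delete pq_np;
    return 0;
}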
IndexPQ.cpp

@@ -260,7 +260,6 @@ static size_t polysemous_inner_loop (
 void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
                                       float *distances, idx_t *labels) const
 {
-    FAISS_THROW_IF_NOT (pq.code_size % 8 == 0);
     FAISS_THROW_IF_NOT (pq.byte_per_idx == 1);

     // PQ distance tables
@@ -319,12 +318,17 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
                     (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
                 break;
             default:
-                if (pq.code_size % 8 == 0)
+                if (pq.code_size % 8 == 0) {
                     n_pass += polysemous_inner_loop<HammingComputerM8>
                         (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
-                else
+                } else if (pq.code_size % 4 == 0) {
                     n_pass += polysemous_inner_loop<HammingComputerM4>
                         (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                } else {
+                    FAISS_THROW_FMT (
+                        "code size %zd not supported for polysemous",
+                        pq.code_size);
+                }
                 break;
             }
         } else {
@@ -342,8 +346,14 @@ void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
                     (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
                 break;
             default:
-                n_pass += polysemous_inner_loop<GenHammingComputerM8>
-                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                if (pq.code_size % 8 == 0) {
+                    n_pass += polysemous_inner_loop<GenHammingComputerM8>
+                        (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                } else {
+                    FAISS_THROW_FMT (
+                        "code size %zd not supported for polysemous",
+                        pq.code_size);
+                }
                 break;
             }
         }
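The net effect: polysemous filtering previously required a code size that is a multiple of 8 bytes (enforced up front); it now also accepts multiples of 4 and raises a clean exception for anything else at search time. A small sketch of which configurations take which path (illustrative only, the dimensions and M values are arbitrary):

// Sketch, not from the commit: code_size == M for 8-bit PQ codes.
#include <faiss/IndexPQ.h>

int main() {
    faiss::IndexPQ pq8 (64,  8, 8);  // code_size  8: HammingComputerM8 path
    faiss::IndexPQ pq12(60, 12, 8);  // code_size 12: HammingComputerM4 path (new)
    faiss::IndexPQ pq10(60, 10, 8);  // code_size 10: now throws
                                     // "code size ... not supported for polysemous"
                                     // at search time instead of failing up front

    pq8.search_type = faiss::IndexPQ::ST_polysemous;
    return 0;
}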
gpu/GpuAutoTune.cpp

@@ -381,20 +381,20 @@ void GpuParameterSpace::set_index_parameter (
             set_index_parameter (ix->at(i), name, val);
         return;
     }
-    if (DC (GpuIndexIVF)) {
-        if (name == "nprobe") {
+    if (name == "nprobe") {
+        if (DC (GpuIndexIVF)) {
             ix->setNumProbes (int (val));
             return;
         }
     }
-    if (DC (GpuIndexIVFPQ)) {
-        if (name == "use_precomputed_table") {
+    if (name == "use_precomputed_table") {
+        if (DC (GpuIndexIVFPQ)) {
             ix->setPrecomputedCodes (bool (val));
             return;
         }
     }
     // maybe normal index parameters apply?
     ParameterSpace::set_index_parameter (index, name, val);
 }
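Swapping the nesting order fixes the dispatch: the parameter name is now tested before the downcast, so a GPU index receiving a name it does not handle falls through to the CPU-side ParameterSpace logic instead of the name check being skipped. A hedged usage sketch (not from the commit; header paths assume a standard faiss GPU install):

// Sketch of the fixed routing, assuming the GPU build of this revision.
#include <faiss/AutoTune.h>
#include <faiss/gpu/GpuAutoTune.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::Index* cpu_index = faiss::index_factory(64, "IVF256,Flat");
    faiss::gpu::StandardGpuResources res;
    faiss::Index* gpu_index =
        faiss::gpu::index_cpu_to_gpu(&res, 0 /* device */, cpu_index);

    faiss::gpu::GpuParameterSpace gps;
    // handled on the GPU side via GpuIndexIVF::setNumProbes
    gps.set_index_parameter(gpu_index, "nprobe", 32);
    // not a GPU-specific name: falls through to ParameterSpace
    gps.set_index_parameter(gpu_index, "verbose", 1);

    delete gpu_index;
    delete cpu_index;
    return 0;
}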
gpu/GpuIndexIVF.cu

@@ -29,8 +29,9 @@ GpuIndexIVF::GpuIndexIVF(GpuResources* resources,
     nprobe_(1),
     quantizer_(nullptr) {
 #ifndef FAISS_USE_FLOAT16
-  FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16CoarseQuantizer,
+  FAISS_THROW_IF_NOT_MSG(!ivfConfig_.flatConfig.useFloat16 &&
+                         !ivfConfig_.flatConfig.useFloat16Accumulator,
                          "float16 unsupported; need CUDA SDK >= 7.5");
 #endif

   init_();
gpu/GpuIndexIVFPQ.cu

@@ -60,7 +60,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
     reserveMemoryVecs_(0),
     index_(nullptr) {
 #ifndef FAISS_USE_FLOAT16
-  FAISS_ASSERT(!useFloat16LookupTables_);
+  FAISS_ASSERT(!config.useFloat16LookupTables);
 #endif

   verifySettings_();
gpu/impl/IVFFlat.cu

@@ -45,10 +45,6 @@ IVFFlat::IVFFlat(GpuResources* resources,
           space),
     l2Distance_(l2Distance),
     useFloat16_(useFloat16) {
-#ifndef FAISS_USE_FLOAT16
-  FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
-  useFloat16_ = false;
-#endif
 }

 IVFFlat::~IVFFlat() {
@@ -95,6 +91,9 @@ IVFFlat::addCodeVectorsFromCpu(int listId,
                        lengthInBytes,
                        stream,
                        true /* exact reserved size */);
+#else
+    // we are not compiling with float16 support
+    FAISS_ASSERT(false);
 #endif
   } else {
     listData->append((unsigned char*) vecs,
gpu/impl/PQScanMultiPassNoPrecomputed.cu

@@ -506,7 +506,6 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
   }
 #else
   FAISS_ASSERT(!useFloat16Lookup);
-  int codeSize = sizeof(float);
 #endif

   int totalCodeDistancesSize =
gpu/test/test_gpu_index.py

@@ -7,16 +7,12 @@
 #! /usr/bin/env python2

 import time
-import libfb.py.mkl  # noqa
+import unittest
 import numpy as np
-from libfb import testutil
 import faiss

-class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase):
+class EvalIVFPQAccuracy(unittest.TestCase):

     def get_dataset(self, small_one=False):
         if not small_one:
@@ -110,3 +106,9 @@ class EvalIVFPQAccuracy(testutil.BaseFacebookTestCase):
     def test_cpu_to_gpu_IVFFlat(self):
         self.do_cpu_to_gpu('IVF128,Flat')

+    def test_set_gpu_param(self):
+        index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
+        res = faiss.StandardGpuResources()
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
+        faiss.GpuParameterSpace().set_index_parameter(index, "nprobe", 3)
gpu/utils/DeviceDefs.cuh

@@ -13,7 +13,7 @@
 namespace faiss { namespace gpu {

 #ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ <= 620
+#if __CUDA_ARCH__ <= 700
 constexpr int kWarpSize = 32;
 #else
 #error Unknown __CUDA_ARCH__; please define parameters for compute capability
@@ -25,37 +25,15 @@ constexpr int kWarpSize = 32;
 constexpr int kWarpSize = 32;
 #endif // !__CUDA_ARCH__

-// This is a memory barrier for intra-warp writes to shared memory.
 __forceinline__ __device__ void warpFence() {
-  // Technically, memory barriers are required via the CUDA
-  // programming model, since warp synchronous programming no longer
-  // is guaranteed.
-  //
-  // There are two components to it:
-  // -a barrier known to the compiler such that the compiler will not
-  // schedule loads and stores across the barrier;
-  // -a HW-level barrier that guarantees that writes are seen in the
-  // proper order
-  //
-  // However, __threadfence_block() is a stronger constraint than what
-  // we really want out of the hardware: a warp-wide barrier.
-  //
-  // In current hardware, it appears that warp synchronous programming
-  // is a reality; by all tests it appears safe and race-free.
-  //
-  // However, understandably it may not be in the future (based on
-  // what Nvidia says in the Kepler guide, it may change depending
-  // upon compiler/toolchain issues or future hardware).
-  //
-  // Removing the fence results in 10%+ faster performance.
-  // However, we are judicious as to where we insert the fence, so if
-  // this reality ever changes, uncommenting this will result in CUDA
-  // programming model-safe ordering again.
-  //
-  // FIXME: we should probably qualify as volatile as well, since the
-  // compiler could technically preserve values across loops? This
-  // seems very impractical for the compiler to do, however.
+#if __CUDA_ARCH__ >= 700
+  __syncwarp();
+#else
+  // For the time being, assume synchronicity.
   // __threadfence_block();
+#endif
 }

 } } // namespace
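The background here: with Volta (sm_70) and CUDA 9, warps no longer execute in lockstep, so warpFence() becomes a real __syncwarp() instead of an assumed no-op. A standalone sketch (not from the commit) of the intra-warp pattern the barrier protects:

// One lane publishes a value to shared memory that the other lanes read.
// Pre-Volta this was safe by lockstep execution; with independent thread
// scheduling the __syncwarp() is required.
#include <cstdio>

__global__ void warpBroadcast(int* out) {
    __shared__ int slot;
    int lane = threadIdx.x & 31;

    if (lane == 0) {
        slot = 42;  // intra-warp write to shared memory
    }

#if __CUDA_ARCH__ >= 700
    __syncwarp();   // what warpFence() now does on Volta+
#endif

    out[threadIdx.x] = slot;  // all lanes observe the write
}

int main() {
    int* d_out;
    cudaMalloc(&d_out, 32 * sizeof(int));
    warpBroadcast<<<1, 32>>>(d_out);
    int h_out[32];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    printf("lane 17 sees %d\n", h_out[17]);
    cudaFree(d_out);
    return 0;
}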
gpu/utils/Float16.cu

@@ -48,10 +48,16 @@ void runConvertToFloat32(float* out,
                    in, in + num, out, HalfToFloat());
 }

-half hostFloat2Half(float a) {
-  half h;
+__half hostFloat2Half(float a) {
+#if CUDA_VERSION >= 9000
+  __half_raw raw;
+  raw.x = cpu_float2half_rn(a).x;
+  return __half(raw);
+#else
+  __half h;
   h.x = cpu_float2half_rn(a).x;
   return h;
+#endif
 }

 } } // namespace
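The reason for the __half_raw detour: in CUDA 9 __half became an opaque class without the public .x bit field that earlier toolkits exposed. A small host-side sketch of the bit-pattern round trip (assumes CUDA 9+; not code from the commit):

// Raw 16-bit patterns must go through __half_raw, as in hostFloat2Half
// and kGetHalf in this commit.
#include <cuda_fp16.h>
#include <cstdio>

int main() {
    __half_raw raw;
    raw.x = 0x3C00;                            // IEEE binary16 pattern for 1.0
    __half one(raw);                           // bits -> __half
    unsigned short bits = __half_raw(one).x;   // __half -> bits
    printf("round trip: 0x%04x\n", bits);
    return 0;
}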
gpu/utils/Float16.cuh

@@ -149,7 +149,7 @@ DeviceTensor<float, Dim, true> fromHalf(GpuResources* resources,
   return out;
 }

-half hostFloat2Half(float v);
+__half hostFloat2Half(float v);

 #endif // FAISS_USE_FLOAT16
gpu/utils/Limits.cuh

@@ -39,9 +39,15 @@ struct Limits<float> {
 #ifdef FAISS_USE_FLOAT16

 inline __device__ __host__ half kGetHalf(unsigned short v) {
+#if CUDA_VERSION >= 9000
+  __half_raw h;
+  h.x = v;
+  return __half(h);
+#else
   half h;
   h.x = v;
   return h;
+#endif
 }

 template <>
gpu/utils/LoadStoreOperators.cuh

@@ -12,6 +12,12 @@
 #include "Float16.cuh"

+#ifndef __HALF2_TO_UI
+// cuda_fp16.hpp doesn't export this
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#endif
+
 //
 // Templated wrappers to express load/store for different scalar and vector
 // types, so kernels can have the same written form but can operate
@@ -37,13 +43,23 @@ template <>
 struct LoadStore<Half4> {
   static inline __device__ Half4 load(void* p) {
     Half4 out;
+#if CUDA_VERSION >= 9000
+    asm("ld.global.v2.u32 {%0, %1}, [%2];" :
+        "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p));
+#else
     asm("ld.global.v2.u32 {%0, %1}, [%2];" :
         "=r"(out.a.x), "=r"(out.b.x) : "l"(p));
+#endif
     return out;
   }

-  static inline __device__ void store(void* p, const Half4& v) {
+  static inline __device__ void store(void* p, Half4& v) {
+#if CUDA_VERSION >= 9000
+    asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p),
+        "r"(__HALF2_TO_UI(v.a)), "r"(__HALF2_TO_UI(v.b)));
+#else
     asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(v.a.x), "r"(v.b.x));
+#endif
   }
 };
@@ -51,15 +67,27 @@ template <>
 struct LoadStore<Half8> {
   static inline __device__ Half8 load(void* p) {
     Half8 out;
+#if CUDA_VERSION >= 9000
+    asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
+        "=r"(__HALF2_TO_UI(out.a.a)), "=r"(__HALF2_TO_UI(out.a.b)),
+        "=r"(__HALF2_TO_UI(out.b.a)), "=r"(__HALF2_TO_UI(out.b.b)) : "l"(p));
+#else
     asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];" :
         "=r"(out.a.a.x), "=r"(out.a.b.x),
         "=r"(out.b.a.x), "=r"(out.b.b.x) : "l"(p));
+#endif
     return out;
   }

-  static inline __device__ void store(void* p, const Half8& v) {
+  static inline __device__ void store(void* p, Half8& v) {
+#if CUDA_VERSION >= 9000
+    asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
+        : : "l"(p), "r"(__HALF2_TO_UI(v.a.a)), "r"(__HALF2_TO_UI(v.a.b)),
+            "r"(__HALF2_TO_UI(v.b.a)), "r"(__HALF2_TO_UI(v.b.b)));
+#else
     asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
         : : "l"(p), "r"(v.a.a.x), "r"(v.a.b.x), "r"(v.b.a.x), "r"(v.b.b.x));
+#endif
   }
 };
gpu/utils/MathOperators.cuh

@@ -285,9 +285,13 @@ struct Math<half> {
   }

   static inline __device__ half zero() {
+#if CUDA_VERSION >= 9000
+    return 0;
+#else
     half h;
     h.x = 0;
     return h;
+#endif
   }
 };
gpu/utils/Select.cuh

@@ -142,7 +142,14 @@ struct BlockSelect {
   __device__ inline void checkThreadQ() {
     bool needSort = (numVals == NumThreadQ);
-    if (!__any(needSort)) {
+#if CUDA_VERSION >= 9000
+    needSort = __any_sync(0xffffffff, needSort);
+#else
+    needSort = __any(needSort);
+#endif
+
+    if (!needSort) {
       // no lanes have triggered a sort
       return;
     }
@@ -408,7 +415,14 @@ struct WarpSelect {
   __device__ inline void checkThreadQ() {
     bool needSort = (numVals == NumThreadQ);
-    if (!__any(needSort)) {
+#if CUDA_VERSION >= 9000
+    needSort = __any_sync(0xffffffff, needSort);
+#else
+    needSort = __any(needSort);
+#endif
+
+    if (!needSort) {
       // no lanes have triggered a sort
       return;
     }
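CUDA 9 deprecates the implicit-mask warp vote __any(pred) in favor of __any_sync(mask, pred), where the mask names the participating lanes. A standalone sketch of the semantics (not from the commit; 0xffffffff means the full warp, matching the 32-thread launch below):

#include <cstdio>

__global__ void anyDemo(int* out) {
    bool pred = (threadIdx.x == 5);  // true in exactly one lane
#if CUDA_VERSION >= 9000
    bool any = __any_sync(0xffffffff, pred);
#else
    bool any = __any(pred);
#endif
    out[threadIdx.x] = any;  // every lane of the warp sees 1
}

int main() {
    int* d;
    cudaMalloc(&d, 32 * sizeof(int));
    anyDemo<<<1, 32>>>(d);
    int h[32];
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    printf("lane 0: %d\n", h[0]);
    cudaFree(d);
    return 0;
}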
gpu/utils/Tensor-inl.cuh

@@ -27,6 +27,37 @@ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor()
   }
 }

+template <typename T, int Dim, bool InnerContig,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
+  Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
+  this->operator=(t);
+}
+
+template <typename T, int Dim, bool InnerContig,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::Tensor(
+  Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t) {
+  this->operator=(std::move(t));
+}
+
+template <typename T, int Dim, bool InnerContig,
+          typename IndexT, template <typename U> class PtrTraits>
+__host__ __device__
+Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
+Tensor<T, Dim, InnerContig, IndexT, PtrTraits>::operator=(
+  Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t) {
+  data_ = t.data_;
+  for (int i = 0; i < Dim; ++i) {
+    size_[i] = t.size_[i];
+    stride_[i] = t.stride_[i];
+  }
+
+  return *this;
+}
+
 template <typename T, int Dim, bool InnerContig,
           typename IndexT, template <typename U> class PtrTraits>
 __host__ __device__
gpu/utils/Tensor.cuh

@@ -87,16 +87,14 @@ class Tensor {
   __host__ __device__ Tensor();

   /// Copy constructor
-  __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t)
-    = default;
+  __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);

   /// Move constructor
-  __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t)
-    = default;
+  __host__ __device__ Tensor(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&& t);

   /// Assignment
   __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
-  operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t)
-    = default;
+  operator=(Tensor<T, Dim, InnerContig, IndexT, PtrTraits>& t);

   /// Move assignment
   __host__ __device__ Tensor<T, Dim, InnerContig, IndexT, PtrTraits>&
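Tensor.cuh drops the `= default` definitions in favor of the explicit out-of-line ones added in Tensor-inl.cuh above. A plausible motivation, stated here as an assumption since the commit message does not say, is that `__host__ __device__`-annotated defaulted members are fragile under the CUDA 9 toolchain. A generic sketch of the resulting pattern:

// Generic sketch (not code from the commit): explicit __host__ __device__
// copy members instead of "= default", mirroring Tensor-inl.cuh.
#include <cstdio>

template <typename T, int Dim>
struct SmallTensor {
    __host__ __device__ SmallTensor() : data_(nullptr) {
        for (int i = 0; i < Dim; ++i) { size_[i] = 0; }
    }
    __host__ __device__ SmallTensor(SmallTensor& t) { this->operator=(t); }
    __host__ __device__ SmallTensor& operator=(SmallTensor& t) {
        data_ = t.data_;                              // member-wise copy
        for (int i = 0; i < Dim; ++i) { size_[i] = t.size_[i]; }
        return *this;
    }
    T* data_;
    int size_[Dim];
};

int main() {
    SmallTensor<float, 2> a;
    a.size_[0] = 3; a.size_[1] = 4;
    SmallTensor<float, 2> b(a);   // explicit copy constructor, host side
    printf("%d x %d\n", b.size_[0], b.size_[1]);
    return 0;
}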
gpu/utils/WarpShuffles.cuh

@@ -19,7 +19,11 @@ namespace faiss { namespace gpu {
 template <typename T>
 inline __device__ T shfl(const T val,
                          int srcLane, int width = kWarpSize) {
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(0xffffffff, val, srcLane, width);
+#else
   return __shfl(val, srcLane, width);
+#endif
 }

 // CUDA SDK does not provide specializations for T*
@@ -28,13 +32,18 @@ inline __device__ T* shfl(T* const val,
                           int srcLane, int width = kWarpSize) {
   static_assert(sizeof(T*) == sizeof(long long), "pointer size");
   long long v = (long long) val;
-  return (T*) __shfl(v, srcLane, width);
+  return (T*) shfl(v, srcLane, width);
 }

 template <typename T>
 inline __device__ T shfl_up(const T val,
                             unsigned int delta, int width = kWarpSize) {
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(0xffffffff, val, delta, width);
+#else
   return __shfl_up(val, delta, width);
+#endif
 }

 // CUDA SDK does not provide specializations for T*
@@ -43,13 +52,18 @@ inline __device__ T* shfl_up(T* const val,
                              unsigned int delta, int width = kWarpSize) {
   static_assert(sizeof(T*) == sizeof(long long), "pointer size");
   long long v = (long long) val;
-  return (T*) __shfl_up(v, delta, width);
+  return (T*) shfl_up(v, delta, width);
 }

 template <typename T>
 inline __device__ T shfl_down(const T val,
                               unsigned int delta, int width = kWarpSize) {
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(0xffffffff, val, delta, width);
+#else
   return __shfl_down(val, delta, width);
+#endif
 }

 // CUDA SDK does not provide specializations for T*
@@ -58,13 +72,17 @@ inline __device__ T* shfl_down(T* const val,
                                unsigned int delta, int width = kWarpSize) {
   static_assert(sizeof(T*) == sizeof(long long), "pointer size");
   long long v = (long long) val;
-  return (T*) __shfl_down(v, delta, width);
+  return (T*) shfl_down(v, delta, width);
 }

 template <typename T>
 inline __device__ T shfl_xor(const T val,
                              int laneMask, int width = kWarpSize) {
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(0xffffffff, val, laneMask, width);
+#else
   return __shfl_xor(val, laneMask, width);
+#endif
 }

 // CUDA SDK does not provide specializations for T*
@@ -73,10 +91,12 @@ inline __device__ T* shfl_xor(T* const val,
                               int laneMask, int width = kWarpSize) {
   static_assert(sizeof(T*) == sizeof(long long), "pointer size");
   long long v = (long long) val;
-  return (T*) __shfl_xor(v, laneMask, width);
+  return (T*) shfl_xor(v, laneMask, width);
 }

 #ifdef FAISS_USE_FLOAT16
+// CUDA 9.0 has half shuffle
+#if CUDA_VERSION < 9000
 inline __device__ half shfl(half v,
                             int srcLane, int width = kWarpSize) {
   unsigned int vu = v.x;
@@ -96,6 +116,7 @@ inline __device__ half shfl_xor(half v,
   h.x = (unsigned short) vu;
   return h;
 }
+#endif // CUDA_VERSION
 #endif // FAISS_USE_FLOAT16

 } } // namespace
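These wrappers let kernels keep calling shfl()/shfl_up()/shfl_down()/shfl_xor() unchanged on both toolkits; the pointer overloads now route through the typed wrapper instead of the legacy intrinsic, so they pick up the _sync variant automatically. A standalone sketch of a warp sum written against the same CUDA_VERSION guard (not code from the commit):

#include <cstdio>

__device__ int warpSum(int v) {
    // butterfly-style reduction over the 32 lanes of a warp
    for (int offset = 16; offset > 0; offset /= 2) {
#if CUDA_VERSION >= 9000
        v += __shfl_down_sync(0xffffffff, v, offset);
#else
        v += __shfl_down(v, offset);
#endif
    }
    return v;  // lane 0 holds the sum of all 32 lanes
}

__global__ void sumKernel(int* out) {
    int total = warpSum(threadIdx.x);  // 0 + 1 + ... + 31 = 496
    if (threadIdx.x == 0) {
        *out = total;
    }
}

int main() {
    int* d;
    cudaMalloc(&d, sizeof(int));
    sumKernel<<<1, 32>>>(d);
    int h;
    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
    printf("warp sum = %d (expect 496)\n", h);
    cudaFree(d);
    return 0;
}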
python/demo_ivfpq.py (deleted, mode 100644 → 0; all 61 lines removed)

# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.

#! /usr/bin/env python2

import numpy as np
import faiss


def fvecs_read(filename):
    fv = np.fromfile(filename, dtype='float32')
    if fv.size == 0:
        return np.zeros((0, 0), dtype='float32')
    dim = fv.view('int32')[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    if not all(fv.view('int32')[:, 0] == dim):
        raise IOError("non-uniform vector sizes in " + filename)
    fv = fv[:, 1:]
    return fv.copy()  # to make contiguous

rootdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/sift1M'

print "loading database"

xb = fvecs_read(rootdir + '/sift_base.fvecs')
xt = fvecs_read(rootdir + '/sift_learn.fvecs')
xq = fvecs_read(rootdir + '/sift_query.fvecs')

d = xt.shape[1]

gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)
D, gt_nns = gt_index.search(xq, 1)

coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)

print "train"
index.train(xt)

print "add"
index.add(xb)

print "search"
index.nprobe = 5
D, nns = index.search(xq, 10)

n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]
print "n_ok=%d/%d" % (n_ok, nq)
python/demo_auto_tune.py → tests/demo_auto_tune.py

@@ -11,9 +11,13 @@ import time
 import numpy as np
 import pdb

-import matplotlib
-matplotlib.use('Agg')
-from matplotlib import pyplot
+try:
+    import matplotlib
+    matplotlib.use('Agg')
+    from matplotlib import pyplot
+    graphical_output = True
+except ImportError:
+    graphical_output = False

 import faiss
@@ -93,15 +97,15 @@ keys_mem_32 = [
 # indexes that can run on the GPU
 keys_gpu = [
-    "PCA64,IVF4096,Flat",
     "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
+    "PCA64,IVF4096,Flat",
     "IVF4096,PQ32"]

 keys_to_test = unlimited_mem_keys
 use_gpu = False

 if use_gpu:
     # if this fails, it means that the GPU version was not compiled in
     assert faiss.StandardGpuResources, \
@@ -129,6 +133,8 @@ for index_key in keys_to_test:
         # transfer to GPU (may be partial)
         index = faiss.index_cpu_to_gpu(res, dev_no, index)
         params = faiss.GpuParameterSpace()
+        print "GGGG"
+        raw_input()
     else:
         params = faiss.ParameterSpace()
@@ -152,7 +158,7 @@ for index_key in keys_to_test:
     op_per_key.append((index_key, opi))

-    if True:
+    if graphical_output:
         # graphical output (to tmp/ subdirectory)
         fig = pyplot.figure(figsize=(12, 9))
tests/test_factory.py (new file, mode 100644; +50 lines)

# Copyright (c) 2015-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD+Patents license found in the
# LICENSE file in the root directory of this source tree.

#! /usr/bin/env python2

import numpy as np
import unittest
import faiss


class TestFactory(unittest.TestCase):

    def test_factory_1(self):
        index = faiss.index_factory(12, "IVF10,PQ4")
        assert index.do_polysemous_training

        index = faiss.index_factory(12, "IVF10,PQ4np")
        assert not index.do_polysemous_training

        index = faiss.index_factory(12, "PQ4")
        assert index.do_polysemous_training

        index = faiss.index_factory(12, "PQ4np")
        assert not index.do_polysemous_training

        try:
            index = faiss.index_factory(10, "PQ4")
        except RuntimeError:
            pass
        else:
            assert False, "should do a runtime error"

    def test_factory_2(self):
        index = faiss.index_factory(12, "SQ8")
        assert index.code_size == 12

    def test_factory_3(self):
        index = faiss.index_factory(12, "IVF10,PQ4")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
        assert index.nprobe == 3

        index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
        assert faiss.downcast_index(index.index).nprobe == 3