/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "../../FaissAssert.h"
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <initializer_list>
#include <vector>
namespace faiss { namespace gpu {
/// Returns the current thread-local GPU device
int getCurrentDevice();
/// Sets the current thread-local GPU device
void setCurrentDevice(int device);
/// Returns the number of available GPU devices
int getNumDevices();
/// Synchronizes the CPU against all devices (equivalent to
/// cudaDeviceSynchronize for each device)
void synchronizeAllDevices();
/// Returns a cached cudaDeviceProp for the given device
cudaDeviceProp& getDeviceProperties(int device);
/// Returns the maximum number of threads available for the given GPU
/// device
int getMaxThreads(int device);
/// Equivalent to getMaxThreads(getCurrentDevice())
int getMaxThreadsCurrentDevice();
/// Returns the maximum amount of shared memory per block available for the
/// given GPU device
size_t getMaxSharedMemPerBlock(int device);
/// Equivalent to getMaxSharedMemPerBlock(getCurrentDevice())
size_t getMaxSharedMemPerBlockCurrentDevice();
/// For a given pointer, returns the device on which it resides
/// (deviceId >= 0), or -1 if the pointer is located on the host.
int getDeviceForAddress(const void* p);
/// Does the given device support full unified memory (memory shared
/// with the host)?
bool getFullUnifiedMemSupport(int device);
/// Equivalent to getFullUnifiedMemSupport(getCurrentDevice())
bool getFullUnifiedMemSupportCurrentDevice();
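// Illustrative usage (not part of the original header): querying device
// attributes and locating a pointer. The device index 0 and pointer `ptr`
// are hypothetical.
//
//   int numDevices = getNumDevices();
//   cudaDeviceProp& prop = getDeviceProperties(0);
//   size_t smemPerBlock = getMaxSharedMemPerBlock(0);
//   int dev = getDeviceForAddress(ptr); // -1 if `ptr` is a host pointer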
/// RAII object to set the current device, and restore the previous
/// device upon destruction
class DeviceScope {
public:
explicit DeviceScope(int device);
~DeviceScope();
private:
int prevDevice_;
};
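// Illustrative usage (not part of the original header): scope-limited device
// selection. The device index 1 is hypothetical.
//
//   {
//     DeviceScope scope(1); // device 1 becomes this thread's current device
//     // ... allocate memory or launch kernels on device 1 ...
//   } // the previously current device is restored here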
/// RAII object to manage a cublasHandle_t
class CublasHandleScope {
public:
CublasHandleScope();
~CublasHandleScope();
cublasHandle_t get() { return blasHandle_; }
private:
cublasHandle_t blasHandle_;
};
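// Illustrative usage (not part of the original header): a scope-limited
// cuBLAS handle. `stream` is a hypothetical cudaStream_t.
//
//   {
//     CublasHandleScope blas;
//     cublasSetStream(blas.get(), stream); // issue cuBLAS work on `stream`
//     // ... cuBLAS calls using blas.get() ...
//   } // handle destroyed here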
/// RAII object to manage a cudaEvent_t
class CudaEvent {
public:
/// Creates an event and records it in the given stream
explicit CudaEvent(cudaStream_t stream);
CudaEvent(const CudaEvent& event) = delete;
CudaEvent(CudaEvent&& event) noexcept;
~CudaEvent();
inline cudaEvent_t get() { return event_; }
/// Have the given stream wait on this event
void streamWaitOnEvent(cudaStream_t stream);
/// Have the CPU wait for the completion of this event
void cpuWaitOnEvent();
CudaEvent& operator=(CudaEvent&& event) noexcept;
CudaEvent& operator=(CudaEvent& event) = delete;
private:
cudaEvent_t event_;
};
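// Illustrative usage (not part of the original header): ordering work between
// two streams and the host. `streamA` and `streamB` are hypothetical.
//
//   CudaEvent done(streamA);         // records an event in streamA
//   done.streamWaitOnEvent(streamB); // streamB waits for streamA's prior work
//   done.cpuWaitOnEvent();           // host blocks until the event completes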
/// Wrapper to test return status of CUDA functions
#define CUDA_VERIFY(X) \
do { \
auto err__ = (X); \
FAISS_ASSERT_FMT(err__ == cudaSuccess, "CUDA error %d", (int) err__); \
} while (0)
/// Wrapper to synchronously probe for CUDA errors
// #define FAISS_GPU_SYNC_ERROR 1
#ifdef FAISS_GPU_SYNC_ERROR
#define CUDA_TEST_ERROR() \
do { \
CUDA_VERIFY(cudaDeviceSynchronize()); \
} while (0)
#else
#define CUDA_TEST_ERROR() \
do { \
CUDA_VERIFY(cudaGetLastError()); \
} while (0)
#endif
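// Illustrative usage (not part of the original header): wrap CUDA runtime
// calls in CUDA_VERIFY, and probe for asynchronous kernel errors with
// CUDA_TEST_ERROR after a launch. `ptr`, `bytes`, `grid`, `block`, `stream`
// and `myKernel` are hypothetical.
//
//   CUDA_VERIFY(cudaMemsetAsync(ptr, 0, bytes, stream));
//   myKernel<<<grid, block, 0, stream>>>(ptr);
//   CUDA_TEST_ERROR();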
/// Have each stream in listWaiting wait on the completion of all work
/// currently enqueued in the streams in listWaitOn
template <typename L1, typename L2>
void streamWaitBase(const L1& listWaiting, const L2& listWaitOn) {
// For all the streams we are waiting on, create an event
std::vector<cudaEvent_t> events;
for (auto& stream : listWaitOn) {
cudaEvent_t event;
CUDA_VERIFY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
CUDA_VERIFY(cudaEventRecord(event, stream));
events.push_back(event);
}
// For all the streams that are waiting, issue a wait
for (auto& stream : listWaiting) {
for (auto& event : events) {
CUDA_VERIFY(cudaStreamWaitEvent(stream, event, 0));
}
}
for (auto& event : events) {
CUDA_VERIFY(cudaEventDestroy(event));
}
}
/// These overloads allow initializer lists to be passed as arguments, since
/// a braced list {...} by itself has no type deducible by the template above
template <typename L1>
void streamWait(const L1& a,
const std::initializer_list<cudaStream_t>& b) {
streamWaitBase(a, b);
}
template <typename L2>
void streamWait(const std::initializer_list<cudaStream_t>& a,
const L2& b) {
streamWaitBase(a, b);
}
inline void streamWait(const std::initializer_list<cudaStream_t>& a,
const std::initializer_list<cudaStream_t>& b) {
streamWaitBase(a, b);
}
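// Illustrative usage (not part of the original header): have `streamA` wait
// for all work previously enqueued on `streamB` and `streamC` (all
// hypothetical streams).
//
//   streamWait({streamA}, {streamB, streamC});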
} } // namespace