11 #include "StandardGpuResources.h" 
   12 #include "../FaissAssert.h" 
   14 namespace faiss { 
namespace gpu {
 
   // Loop bound used when creating the per-device streams that are stored in
   // alternateStreams_ (one vector of this many streams per device).
   18 constexpr 
int kNumStreams = 2;
 
   // Initial value of tempMemFraction_. That member is later multiplied by the
   // device's total memory to size the temporary allocation, so 0.18f
   // corresponds to 18% of total device memory.
   21 constexpr 
float kDefaultTempMemFraction = 0.18f;
 
   // Initial value of pinnedMemSize_, in bytes (256 * 1024 * 1024 = 256 MiB).
   // NOTE(review): presumably this is the size passed to cudaHostAlloc; the
   // size argument of that call is not visible in this listing -- confirm.
   24 constexpr 
size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
 
   // Default constructor: records the default configuration only. No pinned
   // buffer exists yet (null pointer, size 0); the temporary-memory fraction
   // and the requested pinned-memory size start at the file-level defaults.
   // NOTE(review): the listing jumps from line 31 to line 34, so additional
   // member initializers are probably not shown here -- check the full file.
   28 StandardGpuResources::StandardGpuResources() :
 
   29     pinnedMemAlloc_(nullptr),
 
   30     pinnedMemAllocSize_(0),
 
   31     tempMemFraction_(kDefaultTempMemFraction),
 
   34     pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
 
   // Destructor: releases every stream and cuBLAS handle recorded per device,
   // then the pinned host buffer. Each loop constructs a DeviceScope from the
   // entry's device id before touching that device's resources
   // (presumably this makes the device current for the scope -- confirm in
   // DeviceScope's definition).
   37 StandardGpuResources::~StandardGpuResources() {
 
   38   for (
auto& entry : defaultStreams_) {
 
   39     DeviceScope scope(entry.first);
 
   // A default stream is destroyed only when the device has no entry in
   // userDefaultStreams_, i.e. the stream was not supplied by the user.
   41     auto it = userDefaultStreams_.find(entry.first);
 
   42     if (it == userDefaultStreams_.end()) {
 
   45       CUDA_VERIFY(cudaStreamDestroy(entry.second));
 
   // Alternate streams: every stream in each device's vector is destroyed.
   49   for (
auto& entry : alternateStreams_) {
 
   50     DeviceScope scope(entry.first);
 
   52     for (
auto stream : entry.second) {
 
   53       CUDA_VERIFY(cudaStreamDestroy(stream));
 
   // Async copy streams: one per device, destroyed unconditionally.
   57   for (
auto& entry : asyncCopyStreams_) {
 
   58     DeviceScope scope(entry.first);
 
   60     CUDA_VERIFY(cudaStreamDestroy(entry.second));
 
   // cuBLAS handles: destruction must report CUBLAS_STATUS_SUCCESS.
   63   for (
auto& entry : blasHandles_) {
 
   64     DeviceScope scope(entry.first);
 
   66     auto blasStatus = cublasDestroy(entry.second);
 
   67     FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
 
   // The pinned host buffer is freed only if one was actually allocated.
   70   if (pinnedMemAlloc_) {
 
   71     CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
 
   // NOTE(review): the enclosing signature is not visible in this listing;
   // presumably this is the body of setTempMemoryFraction(float fraction).
   // Accepts only fractions in the closed range [0.0, 0.5] and stores the
   // value in tempMemFraction_.
   88   FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
 
   90   tempMemFraction_ = fraction;
 
   // NOTE(review): the enclosing signature is not visible in this listing;
   // presumably this is the body of setPinnedMemory(size_t size).
   // The requested size may only be changed while no default stream has been
   // recorded for any device and no pinned buffer has been allocated; the
   // pinned allocation elsewhere in this file is made when defaultStreams_ is
   // still empty, so a later change could not take effect.
   96   FAISS_ASSERT(defaultStreams_.size() == 0);
 
   97   FAISS_ASSERT(!pinnedMemAlloc_);
 
   99   pinnedMemSize_ = size;
 
  // NOTE(review): the enclosing signature is not visible in this listing;
  // presumably this is the body of setDefaultStream(int device,
  // cudaStream_t stream).
  // If a default stream is already recorded for the device, that stream is
  // destroyed; the user's stream is then recorded in userDefaultStreams_.
  // NOTE(review): lines between the destroy and the closing of the `if` are
  // missing here. If the recorded stream was itself user-supplied by an
  // earlier call, this would destroy a stream the caller owns -- confirm
  // against the full source.
  104   auto it = defaultStreams_.find(device);
 
  105   if (it != defaultStreams_.end()) {
 
  107     CUDA_VERIFY(cudaStreamDestroy(it->second));
 
  111   userDefaultStreams_[device] = stream;
 
  // Iterates over every device index in [0, getNumDevices()).
  // NOTE(review): neither the enclosing signature nor the loop body is visible
  // in this listing; presumably this belongs to
  // setDefaultNullStreamAllDevices() -- confirm what is done per device.
  116   for (
int dev = 0; dev < getNumDevices(); ++dev) {
 
  // NOTE(review): the enclosing signature is not visible in this listing;
  // presumably this is the body of initializeForDevice(int device). It sets up
  // the per-device resources: default stream, async copy stream, alternate
  // streams, cuBLAS handle and the temporary memory manager.
  //
  // Guard on the device already having a default stream recorded.
  // NOTE(review): the guarded statement is missing here; presumably an early
  // return for an already-initialized device -- confirm.
  125   if (defaultStreams_.count(device) != 0) {
 
  // While no device has a default stream recorded yet, and a non-zero pinned
  // size is configured, allocate the pinned host buffer and remember its size.
  // NOTE(review): the size argument of cudaHostAlloc is on a line not shown.
  131   if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
 
  132     CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
 
  134                               cudaHostAllocDefault));
 
  135     pinnedMemAllocSize_ = pinnedMemSize_;
 
  // Upper-bound check on the device id (no lower-bound check is visible).
  138   FAISS_ASSERT(device < getNumDevices());
 
  142   auto& prop = getDeviceProperties(device);
 
  // Devices whose compute capability major version is below 3 are rejected.
  145   FAISS_ASSERT_FMT(prop.major >= 3,
 
   146                    "Device id %d with CC %d.%d not supported, " 
  147                    "need 3.0+ compute capability",
 
  148                    device, prop.major, prop.minor);
 
  // Default stream: taken from userDefaultStreams_ when the user registered
  // one for this device; a non-blocking stream is created as well.
  // NOTE(review): the line separating the two branches is missing here;
  // presumably the creation is the `else` case -- confirm.
  151   cudaStream_t defaultStream = 0;
 
  152   auto it = userDefaultStreams_.find(device);
 
  153   if (it != userDefaultStreams_.end()) {
 
  155     defaultStream = it->second;
 
  157     CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
 
  158                                           cudaStreamNonBlocking));
 
  161   defaultStreams_[device] = defaultStream;
 
  // A non-blocking stream is created and recorded as this device's async
  // copy stream.
  163   cudaStream_t asyncCopyStream = 0;
 
  164   CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
 
  165                                         cudaStreamNonBlocking));
 
  167   asyncCopyStreams_[device] = asyncCopyStream;
 
  // kNumStreams non-blocking streams are created and moved into
  // alternateStreams_ for this device.
  169   std::vector<cudaStream_t> deviceStreams;
 
  170   for (
int j = 0; j < kNumStreams; ++j) {
 
  171     cudaStream_t stream = 0;
 
  172     CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
 
  173                                           cudaStreamNonBlocking));
 
  175     deviceStreams.push_back(stream);
 
  178   alternateStreams_[device] = std::move(deviceStreams);
 
  // A cuBLAS handle is created, checked for success and recorded.
  181   cublasHandle_t blasHandle = 0;
 
  182   auto blasStatus = cublasCreate(&blasHandle);
 
  183   FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
 
  184   blasHandles_[device] = blasHandle;
 
  // Size of the temporary allocation: either tempMemFraction_ of the device's
  // total memory as reported by cudaMemGetInfo, or the fixed tempMemSize_.
  // NOTE(review): the condition choosing between the two is not visible, and
  // neither are the declarations of devFree, devTotal and toAlloc.
  191     CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
 
  193     toAlloc = (size_t) (tempMemFraction_ * devTotal);
 
  195     toAlloc = tempMemSize_;
 
  // The device must not already have an entry in memory_; a
  // StackDeviceMemory held by unique_ptr is then inserted for it.
  // NOTE(review): the constructor arguments are on lines not shown.
  198   FAISS_ASSERT(memory_.count(device) == 0);
 
  199   memory_.emplace(device,
 
  200                   std::unique_ptr<StackDeviceMemory>(
 
  // NOTE(review): the signatures of the accessors below are not visible in
  // this listing; the names given are presumed from the return statements.
  // Any statements preceding each return are also not shown.
  //
  // Presumably getBlasHandle(int device): the handle recorded for the device.
  207   return blasHandles_[device];
 
  // Presumably getDefaultStream(int device): the stream recorded for the device.
  213   return defaultStreams_[device];
 
  // Presumably getAlternateStreams(int device): the device's stream vector,
  // returned by value.
  216 std::vector<cudaStream_t>
 
  219   return alternateStreams_[device];
 
  // Presumably getMemoryManager(int device): dereferences the memory_ entry for
  // the device.
  224   return *memory_[device];
 
  // Presumably getPinnedMemory(): the pinned buffer pointer paired with its
  // recorded size.
  227 std::pair<void*, size_t>
 
  229   return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
 
  // Presumably getAsyncCopyStream(int device): the copy stream recorded for the
  // device.
  235   return asyncCopyStreams_[device];
 
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering. 
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device. 
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls. 
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies. 
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device. 
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)
cudaStream_t getDefaultStream(int device) override
void setDefaultNullStreamAllDevices()
Manages temporary memory allocations on a GPU device. 
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer. 
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.