Commit e8d24076 authored by Davis King's avatar Davis King

Simplified the device_global_buffer() code and API.

parent d7fb51e6
......@@ -13,6 +13,36 @@ namespace dlib
namespace cuda
{
// ----------------------------------------------------------------------------------------
// Construct a non-owning reference to the memory block held by ptr.  We copy the
// byte count and take a weak reference to ptr's underlying shared_ptr, so this
// object never extends the lifetime of the device memory.
weak_cuda_data_void_ptr::
weak_cuda_data_void_ptr(
const cuda_data_void_ptr& ptr
) : num(ptr.num), pdata(ptr.pdata)
{
}
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr weak_cuda_data_void_ptr::
lock() const
{
    // Attempt to promote our weak reference to an owning one.  If the referenced
    // device memory block is still alive we return an owning cuda_data_void_ptr
    // to it; otherwise we return a default constructed (i.e. empty) pointer.
    cuda_data_void_ptr result;
    if (auto strong = pdata.lock())
    {
        result.pdata = std::move(strong);
        result.num = num;
    }
    return result;
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
cuda_data_void_ptr::
......@@ -100,7 +130,8 @@ namespace dlib
{
}
std::shared_ptr<resizable_cuda_buffer> get_buffer (
cuda_data_void_ptr get (
size_t size
)
{
int new_device_id;
......@@ -109,11 +140,12 @@ namespace dlib
if (new_device_id >= (long)buffers.size())
buffers.resize(new_device_id+16);
// If we don't have a buffer already for this device then make one
std::shared_ptr<resizable_cuda_buffer> buff = buffers[new_device_id].lock();
if (!buff)
// If we don't have a buffer already for this device then make one, or if it's too
// small, make a bigger one.
cuda_data_void_ptr buff = buffers[new_device_id].lock();
if (!buff || buff.size() < size)
{
buff = std::make_shared<resizable_cuda_buffer>();
buff = cuda_data_void_ptr(size);
buffers[new_device_id] = buff;
}
......@@ -123,13 +155,15 @@ namespace dlib
private:
std::vector<std::weak_ptr<resizable_cuda_buffer>> buffers;
std::vector<weak_cuda_data_void_ptr> buffers;
};
std::shared_ptr<resizable_cuda_buffer> device_global_buffer()
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr device_global_buffer(size_t size)
{
thread_local cudnn_device_buffer buffer;
return buffer.get_buffer();
return buffer.get(size);
}
// ------------------------------------------------------------------------------------
......
......@@ -17,6 +17,37 @@ namespace dlib
// ------------------------------------------------------------------------------------
class cuda_data_void_ptr;
class weak_cuda_data_void_ptr
{
/*!
WHAT THIS OBJECT REPRESENTS
This is just like a std::weak_ptr version of cuda_data_void_ptr. It allows you
to hold a non-owning reference to a cuda_data_void_ptr.  It is implemented as a
std::weak_ptr to the referenced block along with the block's size in bytes.
!*/
public:
// An empty reference: lock() on a default constructed object returns an
// empty cuda_data_void_ptr.
weak_cuda_data_void_ptr() = default;
weak_cuda_data_void_ptr(const cuda_data_void_ptr& ptr);
/*!
ensures
- this object holds a non-owning reference to the memory block referenced
by ptr.  Holding this reference does not keep that memory alive.
!*/
// Drop the reference, returning this object to its default constructed state.
void reset() { pdata.reset(); num = 0; }
cuda_data_void_ptr lock() const;
/*!
ensures
- if (the memory block referenced by this object hasn't been deleted) then
- returns a cuda_data_void_ptr referencing that memory block
- else
- returns a default initialized cuda_data_void_ptr (i.e. an empty one).
!*/
private:
size_t num = 0;                // size in bytes of the referenced block
std::weak_ptr<void> pdata;     // weak reference to the block's device memory
};
// ----------------------------------------------------------------------------------------
class cuda_data_void_ptr
{
/*!
......@@ -64,6 +95,7 @@ namespace dlib
private:
friend class weak_cuda_data_void_ptr;
size_t num = 0;
std::shared_ptr<void> pdata;
};
......@@ -214,40 +246,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
class resizable_cuda_buffer
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            A block of memory on a CUDA device that grows on demand: whenever a
            caller requests more bytes than are currently allocated the buffer
            is reallocated to the requested size.
    !*/
public:
    cuda_data_void_ptr get(size_t size)
    /*!
        ensures
            - returns a buffer of the requested size or larger.
            - buffer.size() >= size
            - Client code should not hold the returned cuda_data_void_ptr for
              long durations, but instead should call get() whenever the buffer
              is needed.  Doing so ensures that multiple buffers are not kept
              around in the event of a resize.
    !*/
    {
        const bool too_small = buffer.size() < size;
        if (too_small)
        {
            // Release the old allocation before making the new one so we never
            // hold two device buffers at the same time.
            buffer.reset();
            buffer = cuda_data_void_ptr(size);
        }
        return buffer;
    }
private:
    cuda_data_void_ptr buffer;  // current allocation, empty until first get()
};
// ----------------------------------------------------------------------------------------
std::shared_ptr<resizable_cuda_buffer> device_global_buffer(
);
cuda_data_void_ptr device_global_buffer(size_t size);
/*!
ensures
- Returns a pointer to a globally shared CUDA memory buffer on the
......@@ -256,10 +255,28 @@ namespace dlib
as scratch space for CUDA computations that all take place on the default
stream. Using it in this way ensures that there aren't any race conditions
involving the use of the buffer.
- The global buffer is deallocated once all references to it are
destructed. It will be reallocated as required. So if you want to avoid
these reallocations then hold a copy of the shared_ptr returned by this
function.
- The returned pointer will point to at least size bytes. It may point to more.
- The global buffer is deallocated once all references to it are destructed.
However, if device_global_buffer() is called before then with a size <= the last
size requested, then the previously returned global buffer pointer is returned.
This avoids triggering expensive CUDA reallocations. So if you want to avoid
these reallocations then hold a copy of the pointer returned by this function.
However, as a general rule, client code should not hold the returned
cuda_data_void_ptr for long durations, but instead should call
device_global_buffer() whenever the buffer is needed, and overwrite the previously
returned pointer with the new pointer. Doing so ensures multiple buffers are not
kept around in the event that multiple sized buffers are requested. To explain
this, consider this code, assumed to execute at program startup:
auto ptr1 = device_global_buffer(1);
auto ptr2 = device_global_buffer(2);
auto ptr3 = device_global_buffer(3);
since the sizes increased at each call 3 separate buffers were allocated. First
one of size 1, then of size 2, then of size 3. If we then executed:
ptr1 = device_global_buffer(1);
ptr2 = device_global_buffer(2);
ptr3 = device_global_buffer(3);
all three of these pointers would now point to the same buffer, since the smaller
requests can be satisfied by returning the size 3 buffer in each case.
!*/
// ----------------------------------------------------------------------------------------
......
......@@ -438,8 +438,7 @@ namespace dlib
const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
// Allocate a cuda buffer to store all the truth images and also one float
// for the scalar loss output.
work = device_global_buffer();
buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
cuda_data_void_ptr loss_buf = buf;
buf = buf+sizeof(float);
......@@ -466,7 +465,6 @@ namespace dlib
double& loss
);
mutable std::shared_ptr<resizable_cuda_buffer> work;
mutable cuda_data_void_ptr buf;
};
......
......@@ -749,7 +749,6 @@ namespace dlib
forward_workspace.reset();
backward_data_workspace.reset();
backward_filters_workspace.reset();
workspace.reset();
}
void tensor_conv::
......@@ -919,8 +918,6 @@ namespace dlib
(const cudnnFilterDescriptor_t)filter_handle,
backward_filters_best_algo,
&backward_filters_workspace_size_in_bytes));
workspace = device_global_buffer();
}
catch(...)
{
......@@ -989,7 +986,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
forward_workspace = workspace->get(forward_workspace_size_in_bytes);
forward_workspace = device_global_buffer(forward_workspace_size_in_bytes);
CHECK_CUDNN(cudnnConvolutionForward(
context(),
......@@ -1022,7 +1019,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
backward_data_workspace = workspace->get(backward_data_workspace_size_in_bytes);
backward_data_workspace = device_global_buffer(backward_data_workspace_size_in_bytes);
CHECK_CUDNN(cudnnConvolutionBackwardData(context(),
......@@ -1056,7 +1053,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
backward_filters_workspace = workspace->get(backward_filters_workspace_size_in_bytes);
backward_filters_workspace = device_global_buffer(backward_filters_workspace_size_in_bytes);
CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(),
&alpha,
......
......@@ -268,7 +268,6 @@ namespace dlib
size_t forward_workspace_size_in_bytes;
size_t backward_data_workspace_size_in_bytes;
size_t backward_filters_workspace_size_in_bytes;
std::shared_ptr<resizable_cuda_buffer> workspace;
cuda_data_void_ptr forward_workspace;
cuda_data_void_ptr backward_data_workspace;
cuda_data_void_ptr backward_filters_workspace;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment