Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
D
dlib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
钟尚武
dlib
Commits
e8d24076
Commit
e8d24076
authored
Aug 17, 2019
by
Davis King
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Simplified the device_global_buffer() code and API.
parent
d7fb51e6
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
101 additions
and
56 deletions
+101
-56
cuda_data_ptr.cpp
dlib/cuda/cuda_data_ptr.cpp
+42
-8
cuda_data_ptr.h
dlib/cuda/cuda_data_ptr.h
+55
-38
cuda_dlib.h
dlib/cuda/cuda_dlib.h
+1
-3
cudnn_dlibapi.cpp
dlib/cuda/cudnn_dlibapi.cpp
+3
-6
cudnn_dlibapi.h
dlib/cuda/cudnn_dlibapi.h
+0
-1
No files found.
dlib/cuda/cuda_data_ptr.cpp
View file @
e8d24076
...
...
@@ -13,6 +13,36 @@ namespace dlib
namespace
cuda
{
// ----------------------------------------------------------------------------------------
weak_cuda_data_void_ptr
::
weak_cuda_data_void_ptr
(
const
cuda_data_void_ptr
&
ptr
)
:
num
(
ptr
.
num
),
pdata
(
ptr
.
pdata
)
{
}
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr
weak_cuda_data_void_ptr
::
lock
()
const
{
auto
ptr
=
pdata
.
lock
();
if
(
ptr
)
{
cuda_data_void_ptr
temp
;
temp
.
pdata
=
ptr
;
temp
.
num
=
num
;
return
temp
;
}
else
{
return
cuda_data_void_ptr
();
}
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
cuda_data_void_ptr
::
...
...
@@ -100,7 +130,8 @@ namespace dlib
{
}
std
::
shared_ptr
<
resizable_cuda_buffer
>
get_buffer
(
cuda_data_void_ptr
get
(
size_t
size
)
{
int
new_device_id
;
...
...
@@ -109,11 +140,12 @@ namespace dlib
if
(
new_device_id
>=
(
long
)
buffers
.
size
())
buffers
.
resize
(
new_device_id
+
16
);
// If we don't have a buffer already for this device then make one
std
::
shared_ptr
<
resizable_cuda_buffer
>
buff
=
buffers
[
new_device_id
].
lock
();
if
(
!
buff
)
// If we don't have a buffer already for this device then make one, or if it's too
// small, make a bigger one.
cuda_data_void_ptr
buff
=
buffers
[
new_device_id
].
lock
();
if
(
!
buff
||
buff
.
size
()
<
size
)
{
buff
=
std
::
make_shared
<
resizable_cuda_buffer
>
(
);
buff
=
cuda_data_void_ptr
(
size
);
buffers
[
new_device_id
]
=
buff
;
}
...
...
@@ -123,13 +155,15 @@ namespace dlib
private
:
std
::
vector
<
std
::
weak_ptr
<
resizable_cuda_buffer
>
>
buffers
;
std
::
vector
<
weak_cuda_data_void_ptr
>
buffers
;
};
std
::
shared_ptr
<
resizable_cuda_buffer
>
device_global_buffer
()
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr
device_global_buffer
(
size_t
size
)
{
thread_local
cudnn_device_buffer
buffer
;
return
buffer
.
get
_buffer
(
);
return
buffer
.
get
(
size
);
}
// ------------------------------------------------------------------------------------
...
...
dlib/cuda/cuda_data_ptr.h
View file @
e8d24076
...
...
@@ -17,6 +17,37 @@ namespace dlib
// ------------------------------------------------------------------------------------
class
cuda_data_void_ptr
;
class weak_cuda_data_void_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is the std::weak_ptr analog of cuda_data_void_ptr.  It lets
            you observe the memory referenced by a cuda_data_void_ptr without
            keeping that memory alive.
    !*/
public:

    weak_cuda_data_void_ptr() = default;

    weak_cuda_data_void_ptr(const cuda_data_void_ptr& ptr);

    void reset()
    {
        // Drop our observation entirely, returning to the default state.
        num = 0;
        pdata.reset();
    }

    cuda_data_void_ptr lock() const;
    /*!
        ensures
            - if (the memory block referenced by this object hasn't been deleted) then
                - returns a cuda_data_void_ptr referencing that memory block
            - else
                - returns a default initialized cuda_data_void_ptr (i.e. an empty one).
    !*/

private:
    size_t num = 0;
    std::weak_ptr<void> pdata;
};
// ----------------------------------------------------------------------------------------
class
cuda_data_void_ptr
{
/*!
...
...
@@ -64,6 +95,7 @@ namespace dlib
private
:
friend
class
weak_cuda_data_void_ptr
;
size_t
num
=
0
;
std
::
shared_ptr
<
void
>
pdata
;
};
...
...
@@ -214,40 +246,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
class resizable_cuda_buffer
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object manages a block of memory on a CUDA device and grows
            it automatically whenever a caller requests more space than is
            currently allocated.
    !*/
public:

    cuda_data_void_ptr get(size_t size)
    /*!
        ensures
            - This object will return the buffer of requested size or larger.
            - buffer.size() >= size
            - Client code should not hold the returned cuda_data_void_ptr for
              long durations, but instead should call get() whenever the
              buffer is needed.  Doing so ensures that multiple buffers are
              not kept around in the event of a resize.
    !*/
    {
        // Reallocate only when the current block is too small.  Resetting
        // first releases the old allocation before the new one is made.
        if (buffer.size() < size)
        {
            buffer.reset();
            buffer = cuda_data_void_ptr(size);
        }
        return buffer;
    }

private:
    cuda_data_void_ptr buffer;
};
// ----------------------------------------------------------------------------------------
std
::
shared_ptr
<
resizable_cuda_buffer
>
device_global_buffer
(
);
cuda_data_void_ptr
device_global_buffer
(
size_t
size
);
/*!
ensures
- Returns a pointer to a globally shared CUDA memory buffer on the
...
...
@@ -256,10 +255,28 @@ namespace dlib
as scratch space for CUDA computations that all take place on the default
stream. Using it in this way ensures that there aren't any race conditions
involving the use of the buffer.
- The global buffer is deallocated once all references to it are
destructed. It will be reallocated as required. So if you want to avoid
these reallocations then hold a copy of the shared_ptr returned by this
function.
- The returned pointer will point to at least size bytes. It may point to more.
- The global buffer is deallocated once all references to it are destructed.
However, if device_global_buffer() is called before then with a size <= the last
size requested, then the previously returned global buffer pointer is returned.
This avoids triggering expensive CUDA reallocations. So if you want to avoid
these reallocations then hold a copy of the pointer returned by this function.
However, as a general rule, client code should not hold the returned
cuda_data_void_ptr for long durations, but instead should call
device_global_buffer() whenever the buffer is needed, and overwrite the previously
returned pointer with the new pointer. Doing so ensures multiple buffers are not
kept around in the event that buffers of several different sizes are requested. To explain
this, consider this code, assumed to execute at program startup:
auto ptr1 = device_global_buffer(1);
auto ptr2 = device_global_buffer(2);
auto ptr3 = device_global_buffer(3);
since the sizes increased at each call, 3 separate buffers were allocated. First
one of size 1, then of size 2, then of size 3. If we then executed:
ptr1 = device_global_buffer(1);
ptr2 = device_global_buffer(2);
ptr3 = device_global_buffer(3);
all three of these pointers would now point to the same buffer, since the smaller
requests can be satisfied by returning the size 3 buffer in each case.
!*/
// ----------------------------------------------------------------------------------------
...
...
dlib/cuda/cuda_dlib.h
View file @
e8d24076
...
...
@@ -438,8 +438,7 @@ namespace dlib
const
size_t
bytes_per_plane
=
subnetwork_output
.
nr
()
*
subnetwork_output
.
nc
()
*
sizeof
(
uint16_t
);
// Allocate a cuda buffer to store all the truth images and also one float
// for the scalar loss output.
work
=
device_global_buffer
();
buf
=
work
->
get
(
subnetwork_output
.
num_samples
()
*
bytes_per_plane
+
sizeof
(
float
));
buf
=
device_global_buffer
(
subnetwork_output
.
num_samples
()
*
bytes_per_plane
+
sizeof
(
float
));
cuda_data_void_ptr
loss_buf
=
buf
;
buf
=
buf
+
sizeof
(
float
);
...
...
@@ -466,7 +465,6 @@ namespace dlib
double
&
loss
);
mutable
std
::
shared_ptr
<
resizable_cuda_buffer
>
work
;
mutable
cuda_data_void_ptr
buf
;
};
...
...
dlib/cuda/cudnn_dlibapi.cpp
View file @
e8d24076
...
...
@@ -749,7 +749,6 @@ namespace dlib
forward_workspace
.
reset
();
backward_data_workspace
.
reset
();
backward_filters_workspace
.
reset
();
workspace
.
reset
();
}
void
tensor_conv
::
...
...
@@ -919,8 +918,6 @@ namespace dlib
(
const
cudnnFilterDescriptor_t
)
filter_handle
,
backward_filters_best_algo
,
&
backward_filters_workspace_size_in_bytes
));
workspace
=
device_global_buffer
();
}
catch
(...)
{
...
...
@@ -989,7 +986,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
forward_workspace
=
workspace
->
get
(
forward_workspace_size_in_bytes
);
forward_workspace
=
device_global_buffer
(
forward_workspace_size_in_bytes
);
CHECK_CUDNN
(
cudnnConvolutionForward
(
context
(),
...
...
@@ -1022,7 +1019,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
backward_data_workspace
=
workspace
->
get
(
backward_data_workspace_size_in_bytes
);
backward_data_workspace
=
device_global_buffer
(
backward_data_workspace_size_in_bytes
);
CHECK_CUDNN
(
cudnnConvolutionBackwardData
(
context
(),
...
...
@@ -1056,7 +1053,7 @@ namespace dlib
// while the function is still executing on the device. But each time we come
// here, we make sure to grab the latest workspace buffer so that, globally, we
// minimize the number of such buffers.
backward_filters_workspace
=
workspace
->
get
(
backward_filters_workspace_size_in_bytes
);
backward_filters_workspace
=
device_global_buffer
(
backward_filters_workspace_size_in_bytes
);
CHECK_CUDNN
(
cudnnConvolutionBackwardFilter
(
context
(),
&
alpha
,
...
...
dlib/cuda/cudnn_dlibapi.h
View file @
e8d24076
...
...
@@ -268,7 +268,6 @@ namespace dlib
size_t
forward_workspace_size_in_bytes
;
size_t
backward_data_workspace_size_in_bytes
;
size_t
backward_filters_workspace_size_in_bytes
;
std
::
shared_ptr
<
resizable_cuda_buffer
>
workspace
;
cuda_data_void_ptr
forward_workspace
;
cuda_data_void_ptr
backward_data_workspace
;
cuda_data_void_ptr
backward_filters_workspace
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment