Commit 8bb4a421 authored by Davis King

Made the host to device copying code wait for any outstanding kernel executions to finish before overwriting the device memory with updated values from the host.
parent ccb148b4
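
For orientation before the diff: the synchronization order this commit establishes can be sketched with plain CUDA runtime calls. The sketch below is only an illustration, not dlib code; the function and parameter names are hypothetical, and it assumes (as the diff implies) that compute kernels run on the default stream while host-to-device transfers run on a separate transfer stream.

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical helper mirroring the order of operations the commit adds:
// refresh a device buffer from the host, but only after any kernels that
// might still be using that buffer have finished.
cudaError_t refresh_device_buffer(
    float* device_buf, const float* host_buf, std::size_t n,
    cudaStream_t transfer_stream, bool device_in_use)
{
    if (device_in_use)
    {
        // Kernels were launched on the default stream with device_buf and we
        // haven't synchronized since, so wait for the default stream to drain.
        const cudaError_t err = cudaStreamSynchronize(0);
        if (err != cudaSuccess)
            return err;
    }
    // No kernel can still be touching device_buf, so it is safe to overwrite
    // it with the host's values.  The copy itself stays asynchronous on the
    // dedicated transfer stream.
    return cudaMemcpyAsync(device_buf, host_buf, n*sizeof(float),
                           cudaMemcpyHostToDevice, transfer_stream);
}

Synchronizing only the default stream, rather than the whole device with cudaDeviceSynchronize(), is enough under the assumption implied by the diff's use of cudaStreamSynchronize(0): the kernels of interest are issued on the default stream.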
@@ -47,6 +47,9 @@ namespace dlib
            wait_for_transfer_to_finish();
            CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
            host_current = true;
+            // At this point we know our RAM block isn't in use because cudaMemcpy()
+            // implicitly syncs with the device.
+            device_in_use = false;
            // Check for errors. These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
@@ -58,6 +61,13 @@ namespace dlib
    {
        if (!device_current)
        {
+            if (device_in_use)
+            {
+                // Wait for any possible CUDA kernels that might be using our memory block to
+                // complete before we overwrite the memory.
+                CHECK_CUDA(cudaStreamSynchronize(0));
+                device_in_use = false;
+            }
            CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
            have_active_transfer = true;
            device_current = true;
@@ -75,6 +85,7 @@ namespace dlib
        data_size = 0;
        host_current = true;
        device_current = true;
+        device_in_use = false;
        data_host.reset();
        data_device.reset();
    }
@@ -84,6 +95,7 @@ namespace dlib
        data_size = new_size;
        host_current = true;
        device_current = true;
+        device_in_use = false;
        try
        {
...
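
A side note on the comment added in the first hunk above ("cudaMemcpy() implicitly syncs with the device"): a blocking cudaMemcpy() issued on the default stream does not return until the default-stream work launched before it has completed, which is why copy_to_host() can safely clear device_in_use afterwards. A minimal stand-alone illustration of that ordering follows; the kernel is hypothetical and exists only for this sketch, which assumes n > 0 and that dev/host each point at n floats.

#include <cuda_runtime.h>
#include <cstddef>

// Toy kernel used only for this illustration.
__global__ void fill_kernel(float* out, std::size_t n, float value)
{
    for (std::size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n;
         i += std::size_t(blockDim.x)*gridDim.x)
        out[i] = value;
}

// The kernel launch returns immediately, but the blocking cudaMemcpy() is
// ordered after it on the default stream and synchronizes with the device,
// so by the time the copy returns the kernel has finished and the host
// buffer holds its output.
bool ordering_demo(float* dev, float* host, std::size_t n)
{
    fill_kernel<<<4, 256>>>(dev, n, 1.0f);                            // asynchronous launch
    cudaMemcpy(host, dev, n*sizeof(float), cudaMemcpyDeviceToHost);   // blocks + implicit sync
    return host[0] == 1.0f;
}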
@@ -30,11 +30,16 @@ namespace dlib
                modified the data and it hasn't been copied to the device yet then
                host_current==true and device_current==false.
+
+               Similarly, we use device_in_use==true to indicate that device() has been
+               called and no operation to wait for all CUDA kernel completion has been
+               executed.  So if device_in_use==true then there might be a CUDA kernel
+               executing that is using the device memory block contained in this object.
        !*/
    public:
        gpu_data(
-        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
+        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false)
        {
        }
@@ -61,6 +66,7 @@ namespace dlib
            data_size = 0;
            host_current = true;
            device_current = true;
+            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
@@ -69,6 +75,7 @@ namespace dlib
            data_size = new_size;
            host_current = true;
            device_current = true;
+            device_in_use = false;
            data_host.reset(new float[new_size], std::default_delete<float[]>());
            data_device.reset();
        }
@@ -94,6 +101,7 @@ namespace dlib
            DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
            copy_to_device();
+            device_in_use = true;
            return data_device.get();
        }
@@ -104,6 +112,7 @@ namespace dlib
#endif
            copy_to_device();
            host_current = false;
+            device_in_use = true;
            return data_device.get();
        }
@@ -143,6 +152,7 @@ namespace dlib
        mutable bool host_current;
        mutable bool device_current;
        mutable bool have_active_transfer;
+        mutable bool device_in_use;
        std::shared_ptr<float> data_host;
        std::shared_ptr<float> data_device;
...
@@ -58,9 +58,11 @@ namespace dlib
        );
        /*!
            ensures
+                - This function does not block.
                - if (!device_ready()) then
-                    - Begins asynchronously copying host data to the device.
+                    - Begins asynchronously copying host data to the device once it is safe
+                      to do so.  I.e. This function will wait until any previously
+                      scheduled CUDA kernels, which are using the device() memory block,
+                      have completed before transferring the new data to the device.
                    - A call to device() that happens before the transfer completes will
                      block until the transfer is complete.  That is, it is safe to call
                      async_copy_to_device() and then immediately call device().
...
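
To make the documented guarantee concrete, here is a hedged usage sketch. It assumes dlib was built with CUDA support and that gpu_data is available through dlib/dnn/gpu_data.h (the include path is an assumption); the surrounding function and buffer size are hypothetical.

#include <dlib/dnn/gpu_data.h>

void prefetch_example()
{
    dlib::gpu_data data;
    data.set_size(16);                 // allocate space for 16 floats

    // Modify the host copy; this marks the device copy as out of date.
    float* h = data.host();
    h[0] = 3.0f;

    // Kick off the host-to-device transfer.  Per the spec above, the copy is
    // only begun once any previously scheduled kernels that are still using
    // the device memory block have completed.
    data.async_copy_to_device();

    // ... unrelated CPU work can overlap with the transfer here ...

    // Safe even if the transfer has not finished yet: device() blocks until
    // the transfer completes before handing back the pointer.
    const float* d = data.device();
    (void)d;
}

The middle step is where the new wording matters: before this commit, the transfer could begin while an earlier kernel was still using the device block.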