Commit 8bb4a421 authored by Davis King

Made the host to device copying code wait for any outstanding kernel executions to finish before overwriting the device memory with updated values from the host.
parent ccb148b4
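
For orientation before the diff: the synchronization order this commit establishes can be sketched with plain CUDA runtime calls. The sketch below is only an illustration, not dlib code; the function and parameter names are hypothetical, and it assumes (as the diff implies) that compute kernels run on the default stream while host-to-device transfers run on a separate transfer stream.

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical helper mirroring the order of operations the commit adds:
// refresh a device buffer from the host, but only after any kernels that
// might still be using that buffer have finished.
cudaError_t refresh_device_buffer(
    float* device_buf, const float* host_buf, std::size_t n,
    cudaStream_t transfer_stream, bool device_in_use)
{
    if (device_in_use)
    {
        // Kernels were launched on the default stream with device_buf and we
        // haven't synchronized since, so wait for the default stream to drain.
        const cudaError_t err = cudaStreamSynchronize(0);
        if (err != cudaSuccess)
            return err;
    }
    // No kernel can still be touching device_buf, so it is safe to overwrite
    // it with the host's values.  The copy itself stays asynchronous on the
    // dedicated transfer stream.
    return cudaMemcpyAsync(device_buf, host_buf, n*sizeof(float),
                           cudaMemcpyHostToDevice, transfer_stream);
}

Synchronizing only the default stream, rather than the whole device with cudaDeviceSynchronize(), is enough under the assumption implied by the diff's use of cudaStreamSynchronize(0): the kernels of interest are issued on the default stream.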
@@ -47,6 +47,9 @@ namespace dlib
            wait_for_transfer_to_finish();
            CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
            host_current = true;
+            // At this point we know our RAM block isn't in use because cudaMemcpy()
+            // implicitly syncs with the device.
+            device_in_use = false;
            // Check for errors. These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
@@ -58,6 +61,13 @@ namespace dlib
    {
        if (!device_current)
        {
+            if (device_in_use)
+            {
+                // Wait for any possible CUDA kernels that might be using our memory block to
+                // complete before we overwrite the memory.
+                CHECK_CUDA(cudaStreamSynchronize(0));
+                device_in_use = false;
+            }
            CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
            have_active_transfer = true;
            device_current = true;
@@ -75,6 +85,7 @@ namespace dlib
        data_size = 0;
        host_current = true;
        device_current = true;
+        device_in_use = false;
        data_host.reset();
        data_device.reset();
    }
@@ -84,6 +95,7 @@ namespace dlib
        data_size = new_size;
        host_current = true;
        device_current = true;
+        device_in_use = false;
        try
        {
...
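
A side note on the comment added in the first hunk above ("cudaMemcpy() implicitly syncs with the device"): a blocking cudaMemcpy() issued on the default stream does not return until the default-stream work launched before it has completed, which is why copy_to_host() can safely clear device_in_use afterwards. A minimal stand-alone illustration of that ordering follows; the kernel is hypothetical and exists only for this sketch, which assumes n > 0 and that dev/host each point at n floats.

#include <cuda_runtime.h>
#include <cstddef>

// Toy kernel used only for this illustration.
__global__ void fill_kernel(float* out, std::size_t n, float value)
{
    for (std::size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n;
         i += std::size_t(blockDim.x)*gridDim.x)
        out[i] = value;
}

// The kernel launch returns immediately, but the blocking cudaMemcpy() is
// ordered after it on the default stream and synchronizes with the device,
// so by the time the copy returns the kernel has finished and the host
// buffer holds its output.
bool ordering_demo(float* dev, float* host, std::size_t n)
{
    fill_kernel<<<4, 256>>>(dev, n, 1.0f);                            // asynchronous launch
    cudaMemcpy(host, dev, n*sizeof(float), cudaMemcpyDeviceToHost);   // blocks + implicit sync
    return host[0] == 1.0f;
}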
@@ -30,11 +30,16 @@ namespace dlib
                modified the data and it hasn't been copied to the device yet then
                host_current==true and device_current==false.
+
+               Similarly, we use device_in_use==true to indicate that device() has been
+               called and no operation to wait for all CUDA kernel completion has been
+               executed.  So if device_in_use==true then there might be a CUDA kernel
+               executing that is using the device memory block contained in this object.
        !*/
    public:
        gpu_data(
-        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
+        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false)
        {
        }
@@ -61,6 +66,7 @@ namespace dlib
            data_size = 0;
            host_current = true;
            device_current = true;
+            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
@@ -69,6 +75,7 @@ namespace dlib
            data_size = new_size;
            host_current = true;
            device_current = true;
+            device_in_use = false;
            data_host.reset(new float[new_size], std::default_delete<float[]>());
            data_device.reset();
        }
@@ -94,6 +101,7 @@ namespace dlib
            DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
            copy_to_device();
+            device_in_use = true;
            return data_device.get();
        }
@@ -104,6 +112,7 @@ namespace dlib
#endif
            copy_to_device();
            host_current = false;
+            device_in_use = true;
            return data_device.get();
        }
@@ -143,6 +152,7 @@ namespace dlib
        mutable bool host_current;
        mutable bool device_current;
        mutable bool have_active_transfer;
+        mutable bool device_in_use;
        std::shared_ptr<float> data_host;
        std::shared_ptr<float> data_device;
...
@@ -58,9 +58,11 @@ namespace dlib
        );
        /*!
            ensures
+                - This function does not block.
                - if (!device_ready()) then
-                    - Begins asynchronously copying host data to the device.
+                    - Begins asynchronously copying host data to the device once it is safe
+                      to do so.  I.e. This function will wait until any previously
+                      scheduled CUDA kernels, which are using the device() memory block,
+                      have completed before transferring the new data to the device.
                    - A call to device() that happens before the transfer completes will
                      block until the transfer is complete.  That is, it is safe to call
                      async_copy_to_device() and then immediately call device().
...
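
To make the documented guarantee concrete, here is a hedged usage sketch. It assumes dlib was built with CUDA support and that gpu_data is available through dlib/dnn/gpu_data.h (the include path is an assumption); the surrounding function and buffer size are hypothetical.

#include <dlib/dnn/gpu_data.h>

void prefetch_example()
{
    dlib::gpu_data data;
    data.set_size(16);                 // allocate space for 16 floats

    // Modify the host copy; this marks the device copy as out of date.
    float* h = data.host();
    h[0] = 3.0f;

    // Kick off the host-to-device transfer.  Per the spec above, the copy is
    // only begun once any previously scheduled kernels that are still using
    // the device memory block have completed.
    data.async_copy_to_device();

    // ... unrelated CPU work can overlap with the transfer here ...

    // Safe even if the transfer has not finished yet: device() blocks until
    // the transfer completes before handing back the pointer.
    const float* d = data.device();
    (void)d;
}

The middle step is where the new wording matters: before this commit, the transfer could begin while an earlier kernel was still using the device block.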