Commit 5c623e3c authored by Juha Reunanen, committed by Davis E. King

Apply the #1514 fix even on non-Windows platforms, and change all remaining cudaStreamSynchronize calls (#1596)

* Apply the #1514 fix even on non-Windows platforms

* Try to fix #1513 even more by circumventing the remaining cudaStreamSynchronize calls

* Make the fix apply only if CUDA_VERSION == 10000

* Make the fix apply on CUDA 9.2 also

* CHECK_CUDA(cudaStreamSynchronize(stream));
parent 9ca72b3e
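
The workaround described in the commit message replaces a blocking cudaStreamSynchronize call with a polling loop over cudaStreamQuery, so the host thread never enters the blocking wait that issue #1513 reports as freezing on some CUDA 9.2/10.0 setups. The following is a minimal self-contained sketch of the same idea, not the dlib code itself: the function name is illustrative and a plain exception stands in for dlib's CHECK_CUDA macro.

    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <string>

    // Wait for all work queued on `stream` to finish by polling cudaStreamQuery,
    // instead of calling the blocking cudaStreamSynchronize.
    void synchronize_stream_by_polling(cudaStream_t stream)  // hypothetical name for illustration
    {
        while (true)
        {
            const cudaError_t err = cudaStreamQuery(stream);
            if (err == cudaSuccess)
                return;                    // all queued work has completed
            if (err != cudaErrorNotReady)  // anything other than "not ready" is a real CUDA error
                throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err));
            // cudaErrorNotReady: work is still in flight, keep polling.
        }
    }

Note that this trades the blocking wait for a busy-wait that keeps one CPU core spinning while the stream drains, which is why the diff below guards it with CUDA_VERSION >= 9020 && CUDA_VERSION <= 10000, the range where the freeze was observed.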
@@ -79,12 +79,33 @@ namespace dlib
         }
 
 // ----------------------------------------------------------------------------------------
 
+    void synchronize_stream(cudaStream_t stream)
+    {
+#if CUDA_VERSION >= 9020 && CUDA_VERSION <= 10000
+        // This should be pretty much the same as cudaStreamSynchronize, which for some
+        // reason makes training freeze in some cases.
+        // (see https://github.com/davisking/dlib/issues/1513)
+        while (true)
+        {
+            cudaError_t err = cudaStreamQuery(stream);
+            switch (err)
+            {
+            case cudaSuccess: return;      // now we are synchronized
+            case cudaErrorNotReady: break; // continue waiting
+            default: CHECK_CUDA(err);      // unexpected error: throw
+            }
+        }
+#else // CUDA_VERSION
+        CHECK_CUDA(cudaStreamSynchronize(stream));
+#endif // CUDA_VERSION
+    }
+
     void gpu_data::
     wait_for_transfer_to_finish() const
     {
         if (have_active_transfer)
         {
-            CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
+            synchronize_stream((cudaStream_t)cuda_stream.get());
             have_active_transfer = false;
             // Check for errors.  These calls to cudaGetLastError() are what help us find
             // out if our kernel launches have been failing.
@@ -118,25 +139,6 @@ namespace dlib
         }
     }
 
-#ifdef WIN32
-    // This should be pretty much the same as cudaStreamSynchronize, which for some
-    // reason makes training freeze on some Windows machines.
-    // (see https://github.com/davisking/dlib/issues/1513)
-    void synchronize_stream(cudaStream_t stream)
-    {
-        while (true)
-        {
-            cudaError_t err = cudaStreamQuery(stream);
-            switch (err)
-            {
-            case cudaSuccess: return;      // now we are synchronized
-            case cudaErrorNotReady: break; // continue waiting
-            default: CHECK_CUDA(err);      // unexpected error: throw
-            }
-        }
-    }
-#endif // WIN32
-
     void gpu_data::
     async_copy_to_device() const
     {
@@ -146,12 +148,7 @@ namespace dlib
         {
             // Wait for any possible CUDA kernels that might be using our memory block to
             // complete before we overwrite the memory.
-#ifdef WIN32
             synchronize_stream(0);
-#else
-            CHECK_CUDA(cudaStreamSynchronize(0));
-#endif
             device_in_use = false;
         }
         CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
@@ -171,7 +168,7 @@ namespace dlib
         {
             // Wait for any possible CUDA kernels that might be using our memory block to
             // complete before we free the memory.
-            CHECK_CUDA(cudaStreamSynchronize(0));
+            synchronize_stream(0);
             device_in_use = false;
         }
         wait_for_transfer_to_finish();
@@ -188,7 +185,7 @@ namespace dlib
         {
             // Wait for any possible CUDA kernels that might be using our memory block to
             // complete before we free the memory.
-            CHECK_CUDA(cudaStreamSynchronize(0));
+            synchronize_stream(0);
             device_in_use = false;
         }
         wait_for_transfer_to_finish();