Commit 452b188d authored by Davis King's avatar Davis King

Fixed some race conditions in cuda code.

parent 76d5bba6
...@@ -113,8 +113,9 @@ namespace dlib ...@@ -113,8 +113,9 @@ namespace dlib
__global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps) __global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps)
{ {
// initialize invnorms before we begin. // initialize invnorms before we begin.
for (auto i : grid_stride_range(0, nr)) for (auto i : grid_stride_range_y(0, nr))
invnorms[i] = eps; for (auto j : grid_stride_range(0, 1))
invnorms[i] = eps;
__syncthreads(); __syncthreads();
for (auto i : grid_stride_range_y(0, nr)) for (auto i : grid_stride_range_y(0, nr))
...@@ -129,10 +130,9 @@ namespace dlib ...@@ -129,10 +130,9 @@ namespace dlib
} }
__syncthreads(); __syncthreads();
for (auto j : grid_stride_range(0, nr)) for (auto i : grid_stride_range_y(0, nr))
{ for (auto j : grid_stride_range(0, 1))
invnorms[j] = 1.0/std::sqrt(invnorms[j]); invnorms[i] = 1.0/std::sqrt(invnorms[i]);
}
} }
void inverse_norms ( void inverse_norms (
...@@ -142,7 +142,7 @@ namespace dlib ...@@ -142,7 +142,7 @@ namespace dlib
) )
{ {
invnorms.set_size(data.num_samples()); invnorms.set_size(data.num_samples());
dim3 blocks(10,1); dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel. dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
_cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps); _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
} }
...@@ -152,8 +152,9 @@ namespace dlib ...@@ -152,8 +152,9 @@ namespace dlib
__global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc) __global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
{ {
// initialize out before we begin. // initialize out before we begin.
for (auto i : grid_stride_range(0, nr)) for (auto i : grid_stride_range_y(0, nr))
out[i] = 0; for (auto j : grid_stride_range(0, 1))
out[i] = 0;
__syncthreads(); __syncthreads();
for (auto i : grid_stride_range_y(0, nr)) for (auto i : grid_stride_range_y(0, nr))
...@@ -176,7 +177,7 @@ namespace dlib ...@@ -176,7 +177,7 @@ namespace dlib
) )
{ {
out.set_size(lhs.num_samples()); out.set_size(lhs.num_samples());
dim3 blocks(10,1); dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel. dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
_cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples()); _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
} }
...@@ -379,8 +380,9 @@ namespace dlib ...@@ -379,8 +380,9 @@ namespace dlib
__global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks) __global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
{ {
// zero initialize d before we begin. // zero initialize d before we begin.
for (auto i : grid_stride_range(0, ks)) for (auto i : grid_stride_range_y(0, ks))
d[i] = 0; for (auto j : grid_stride_range(0, 1))
d[i] = 0;
__syncthreads(); __syncthreads();
// loop over all the image planes // loop over all the image planes
...@@ -448,7 +450,7 @@ namespace dlib ...@@ -448,7 +450,7 @@ namespace dlib
if (dest.size() == 0) if (dest.size() == 0)
return; return;
dim3 blocks(10,1); dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel. dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
if (add_to) if (add_to)
_cuda_multiply_conv2_add_to<<<blocks,threads>>>( _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment