Commit 546bdf51 authored by Davis King

Switched 2D kernels to use the new 2D launch_kernel(). Also added overload of dot_prods() that can accumulate in addition to assign.
parent 19b16e1a
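
For context, here is a minimal usage sketch of the new accumulating overload, written against the spec added to tensor_tools.h further down in this diff. The tensor shapes and the tt::tensor_rand generator are illustrative assumptions, not part of the commit:

    // Sketch only: shows how the new dot_prods(add_to, ...) overload is meant
    // to be called, per the requires/ensures spec added in this commit.
    #include <dlib/dnn.h>

    int main()
    {
        using namespace dlib;
        using namespace dlib::tt;

        tt::tensor_rand rnd;                       // illustrative random source
        resizable_tensor a(128, 32), b(128, 32);   // lhs and rhs must have the same dimensions
        rnd.fill_gaussian(a);
        rnd.fill_gaussian(b);

        // out ends up with one element per sample: out.size() == a.num_samples()
        resizable_tensor out;
        dot_prods(out, a, b);          // assign:     out[i]  = dot(row i of a, row i of b)
        dot_prods(true, out, a, b);    // accumulate: out[i] += dot(row i of a, row i of b)
        dot_prods(false, out, a, b);   // assign again, overwriting the accumulated values
    }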
@@ -142,9 +142,8 @@ namespace dlib
         )
         {
             invnorms.set_size(data.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+            launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+                invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
         }

     // ----------------------------------------------------------------------------------------
@@ -170,16 +169,57 @@ namespace dlib
             }
         }

+        __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+        {
+            for (auto i : grid_stride_range_y(0, nr))
+            {
+                auto l = lhs + i*nc;
+                auto r = rhs + i*nc;
+                float temp = 0;
+                for (auto j : grid_stride_range(0, nc))
+                    temp += l[j]*r[j];
+
+                // and store the sum into out[i]
+                warp_reduce_atomic_add(out[i], temp);
+            }
+        }
+
         void dot_prods (
             resizable_tensor& out,
             const tensor& lhs,
             const tensor& rhs
         )
         {
+            DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+
             out.set_size(lhs.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+            if (out.size() == 0)
+                return;
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+
+            launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+        }
+
+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        )
+        {
+            DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+            DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+            DLIB_CASSERT(out.size() == lhs.num_samples());
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+            if (add_to)
+                launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+            else
+                launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
         }

     // ----------------------------------------------------------------------------------------
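
The new _cuda_dot_prods_add_to kernel above leans on dlib's grid_stride_range, grid_stride_range_y, and warp_reduce_atomic_add helpers, plus launch_kernel()/max_jobs(). As a rough standalone illustration of that pattern, the plain-CUDA sketch below computes the same row-wise dot products with explicit grid-stride loops, __shfl_down_sync(), and atomicAdd(); all names, sizes, and the launch configuration are illustrative only, not dlib's implementation:

    // Standalone approximation of the row-wise dot product pattern: the y grid
    // dimension strides over rows, the x dimension strides over columns, and
    // each warp reduces its partial sums before accumulating into out[i].
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    __global__ void row_dot_prods_add_to(float* out, const float* lhs, const float* rhs,
                                         size_t nr, size_t nc)
    {
        for (size_t i = blockIdx.y*blockDim.y + threadIdx.y; i < nr; i += gridDim.y*blockDim.y)
        {
            const float* l = lhs + i*nc;
            const float* r = rhs + i*nc;
            float temp = 0;
            for (size_t j = blockIdx.x*blockDim.x + threadIdx.x; j < nc; j += gridDim.x*blockDim.x)
                temp += l[j]*r[j];

            // reduce the partial sums within the warp, then lane 0 accumulates into out[i]
            for (int offset = warpSize/2; offset > 0; offset /= 2)
                temp += __shfl_down_sync(0xffffffff, temp, offset);
            if (threadIdx.x % warpSize == 0)
                atomicAdd(out + i, temp);
        }
    }

    int main()
    {
        const size_t nr = 4, nc = 1000;
        std::vector<float> l(nr*nc, 1.0f), r(nr*nc, 2.0f), o(nr, 0.0f);

        float *dl, *dr, *dout;
        cudaMalloc(&dl, l.size()*sizeof(float));
        cudaMalloc(&dr, r.size()*sizeof(float));
        cudaMalloc(&dout, o.size()*sizeof(float));
        cudaMemcpy(dl, l.data(), l.size()*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dr, r.data(), r.size()*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dout, o.data(), o.size()*sizeof(float), cudaMemcpyHostToDevice);

        // x block size of 32 keeps each row's reduction within full warps
        dim3 threads(32, 8), blocks(4, 2);
        row_dot_prods_add_to<<<blocks, threads>>>(dout, dl, dr, nr, nc);
        cudaMemcpy(o.data(), dout, o.size()*sizeof(float), cudaMemcpyDeviceToHost);

        for (size_t i = 0; i < nr; ++i)
            printf("out[%zu] = %f (expected %f)\n", i, o[i], 2.0f*nc);

        cudaFree(dl); cudaFree(dr); cudaFree(dout);
        return 0;
    }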
@@ -501,14 +541,15 @@ namespace dlib
             if (dest.size() == 0)
                 return;

-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            const auto bs = src1.nr()*src1.nc();
+            const auto n = src1.num_samples()*src1.k();
             if (add_to)
-                _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
-                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+                    dest.device(), src1.device(), n, src2.device(), bs, src1.k());
             else
-                _cuda_multiply_conv2<<<blocks,threads>>>(
-                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+                    dest.device(), src1.device(), n, src2.device(), bs, src1.k());
         }
     }
...
@@ -121,6 +121,13 @@ namespace dlib
             const tensor& rhs
         );

+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
         void scale_columns (
             tensor& out,
             const tensor& m,
...
@@ -69,6 +69,23 @@ namespace dlib { namespace tt
 #endif
     }

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::dot_prods(add_to, out, lhs, rhs);
+#else
+        if (add_to)
+            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        else
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+    }
+
     void scale_columns (
         tensor& out,
         const tensor& m,
...
@@ -50,6 +50,24 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    );
+    /*!
+        requires
+            - have_same_dimensions(lhs,rhs) == true
+            - out.size() == lhs.num_samples()
+            - out.k() == out.nr() == out.nc() == 1
+        ensures
+            - if (add_to) then
+                - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            - else
+                - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+    !*/
+
     void scale_columns (
         tensor& out,
         const tensor& m,
...
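
Put another way, the ensures clause above says each output element is the dot product of the corresponding rows of lhs and rhs, either assigned or accumulated. A tiny reference loop, purely illustrative and not dlib code, makes that concrete:

    // Reference semantics of dot_prods(add_to, out, lhs, rhs) on row-major data:
    // for each sample i, out[i] (+)= sum over j of lhs(i,j)*rhs(i,j).
    #include <cstddef>

    void dot_prods_reference(bool add_to, float* out, const float* lhs, const float* rhs,
                             std::size_t num_samples, std::size_t sample_size)
    {
        for (std::size_t i = 0; i < num_samples; ++i)
        {
            float temp = 0;
            for (std::size_t j = 0; j < sample_size; ++j)
                temp += lhs[i*sample_size + j]*rhs[i*sample_size + j];
            out[i] = add_to ? out[i] + temp : temp;
        }
    }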
@@ -1256,6 +1256,24 @@ namespace
             out2 = scale_rows(mat(data), mat(invnorms1));
             DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
         }
+        {
+            resizable_tensor a(123,432), b(123,432);
+            rnd.fill_gaussian(a);
+            rnd.fill_gaussian(b);
+            resizable_tensor out;
+            dot_prods(out, a,b);
+            const matrix<float> truth = sum_cols(pointwise_multiply(mat(a), mat(b)));
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+            out = 0;
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+            dot_prods(false, out, a,b);
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+            dot_prods(true, out, a,b);
+            DLIB_TEST(max(abs(mat(out)/2 - truth)) < 1e-4);
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+        }
     }

 // ----------------------------------------------------------------------------------------
...