Commit 546bdf51 authored by Davis King

Switched 2D kernels to use the new 2D launch_kernel(). Also added an overload of dot_prods() that can accumulate in addition to assign.
parent 19b16e1a
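
In short, dot_prods() computes one dot product per sample (row) of lhs and rhs. A minimal sketch of the new accumulate-or-assign overload, mirroring the test added at the end of this commit (the setup here, including the tensor_rand filler, is illustrative and assumes dlib's dnn headers and the dlib::tt namespace):

    // Sketch only: two tensors with 123 samples of 432 values each.
    resizable_tensor a(123,432), b(123,432), out;
    tensor_rand rnd;
    rnd.fill_gaussian(a);
    rnd.fill_gaussian(b);

    dot_prods(out, a, b);          // existing overload: out(i) = dot(row i of a, row i of b)
    dot_prods(false, out, a, b);   // new overload, add_to == false: plain assignment, same result
    dot_prods(true, out, a, b);    // add_to == true: accumulates, so out now holds 2x each dot product
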
@@ -142,9 +142,8 @@ namespace dlib
         )
         {
             invnorms.set_size(data.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+            launch_kernel(_cuda_inverse_norms, max_jobs(data.size()/data.num_samples(), data.num_samples()),
+                invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
         }

     // ----------------------------------------------------------------------------------------
@@ -170,16 +169,57 @@ namespace dlib
             }
         }

+        __global__ void _cuda_dot_prods_add_to(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
+        {
+            for (auto i : grid_stride_range_y(0, nr))
+            {
+                auto l = lhs + i*nc;
+                auto r = rhs + i*nc;
+                float temp = 0;
+                for (auto j : grid_stride_range(0, nc))
+                    temp += l[j]*r[j];
+
+                // and store the sum into out[i]
+                warp_reduce_atomic_add(out[i], temp);
+            }
+        }
+
         void dot_prods (
             resizable_tensor& out,
             const tensor& lhs,
             const tensor& rhs
         )
         {
             DLIB_CASSERT(have_same_dimensions(lhs,rhs));
             out.set_size(lhs.num_samples());
-            dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+
+            if (out.size() == 0)
+                return;
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+
+            launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
         }
+
+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        )
+        {
+            DLIB_CASSERT(have_same_dimensions(lhs,rhs));
+            DLIB_CASSERT(out.k() == 1 && out.nr() == 1 && out.nc() == 1);
+            DLIB_CASSERT(out.size() == lhs.num_samples());
+
+            const auto nr = lhs.num_samples();
+            const auto nc = lhs.size()/lhs.num_samples();
+            if (add_to)
+                launch_kernel(_cuda_dot_prods_add_to, max_jobs(nc,nr), out.device(), lhs.device(), rhs.device(), nr, nc);
+            else
+                launch_kernel(_cuda_dot_prods, max_jobs(nc,nr), out.device_write_only(), lhs.device(), rhs.device(), nr, nc);
+        }

     // ----------------------------------------------------------------------------------------
@@ -501,14 +541,15 @@ namespace dlib
                 if (dest.size() == 0)
                     return;

-                dim3 blocks(1,10); // x size 1 so we don't need to worry about inter-block synchronization (since only y spans blocks)
-                dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+                const auto bs = src1.nr()*src1.nc();
+                const auto n = src1.num_samples()*src1.k();
                 if (add_to)
-                    _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
-                        dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                    launch_kernel(_cuda_multiply_conv2_add_to, max_jobs(bs,n),
+                        dest.device(), src1.device(), n, src2.device(), bs, src1.k());
                 else
-                    _cuda_multiply_conv2<<<blocks,threads>>>(
-                        dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+                    launch_kernel(_cuda_multiply_conv2, max_jobs(bs,n),
+                        dest.device(), src1.device(), n, src2.device(), bs, src1.k());
             }
         }
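
For readers unfamiliar with the helpers used above: the kernels walk rows via grid_stride_range_y() and columns via grid_stride_range(), combine each row's partial sums with warp_reduce_atomic_add(), and are launched through launch_kernel() with a max_jobs(columns, rows) size hint instead of a hand-picked dim3 grid. A hedged sketch of that pattern follows; the _cuda_row_sums kernel and the pointer names are hypothetical, only the helper functions come from this diff:

    // Hypothetical kernel written against the same 2D launch pattern (assumes out[]
    // is zero-initialized, since warp_reduce_atomic_add() accumulates into it).
    __global__ void _cuda_row_sums(float* out, const float* m, size_t nr, size_t nc)
    {
        for (auto i : grid_stride_range_y(0, nr))       // this thread's share of the rows
        {
            float temp = 0;
            for (auto j : grid_stride_range(0, nc))     // threads in x split the columns of row i
                temp += m[i*nc + j];

            // combine the warp's partial sums and add the result into out[i]
            warp_reduce_atomic_add(out[i], temp);
        }
    }

    // Launch: max_jobs(nc, nr) tells launch_kernel() how much x/y work there is, e.g.
    // launch_kernel(_cuda_row_sums, max_jobs(nc, nr), out_ptr, m_ptr, nr, nc);
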
@@ -121,6 +121,13 @@ namespace dlib
             const tensor& rhs
         );

+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
         void scale_columns (
             tensor& out,
             const tensor& m,
@@ -69,6 +69,23 @@ namespace dlib { namespace tt
 #endif
     }

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::dot_prods(add_to, out, lhs, rhs);
+#else
+        if (add_to)
+            out += sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+        else
+            out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+#endif
+    }
+
     void scale_columns (
         tensor& out,
         const tensor& m,
@@ -50,6 +50,24 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/

+    void dot_prods (
+        bool add_to,
+        tensor& out,
+        const tensor& lhs,
+        const tensor& rhs
+    );
+    /*!
+        requires
+            - have_same_dimensions(lhs,rhs) == true
+            - out.size() == lhs.num_samples()
+            - out.k() == out.nr() == out.nc() == 1
+        ensures
+            - if (add_to) then
+                - #out == mat(out) + sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+            - else
+                - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
+    !*/
+
     void scale_columns (
         tensor& out,
         const tensor& m,
@@ -1256,6 +1256,24 @@ namespace
             out2 = scale_rows(mat(data), mat(invnorms1));
             DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
         }
+        {
+            resizable_tensor a(123,432), b(123,432);
+            rnd.fill_gaussian(a);
+            rnd.fill_gaussian(b);
+
+            resizable_tensor out;
+            dot_prods(out, a,b);
+            const matrix<float> truth = sum_cols(pointwise_multiply(mat(a), mat(b)));
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+
+            out = 0;
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+            dot_prods(false, out, a,b);
+            DLIB_TEST(max(abs(mat(out) - truth)) < 1e-4);
+            dot_prods(true, out, a,b);
+            DLIB_TEST(max(abs(mat(out)/2 - truth)) < 1e-4);
+            DLIB_TEST(max(abs(mat(out) - truth)) > 1e-2);
+        }
     }

 // ----------------------------------------------------------------------------------------