Commit 3586d409 authored by Davis King

Added tt::scale_columns(). Also optimized some other cuda kernel launches a little.
parent 2a2956a2
@@ -142,7 +142,9 @@ namespace dlib
         )
         {
             invnorms.set_size(data.num_samples());
-            launch_kernel(_cuda_inverse_norms,max_jobs(data.size()), invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
+            dim3 blocks(10,1);
+            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            _cuda_inverse_norms<<<blocks,threads>>>(invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
         }
 
     // ----------------------------------------------------------------------------------------
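The comment on the new launch geometry is the key detail here: warp_reduce_atomic_add() combines partial sums with warp shuffle instructions, which only operate across the 32 lanes of a single warp, so blockDim.x must equal the warp size for the reduction to fold a complete row of threads. A minimal standalone sketch of that pattern (an illustration, not dlib's actual helper, whose implementation may differ):

// Sketch of a warp-level reduce-then-atomic-add, assuming blockDim.x == 32
// so that threadIdx.x is the lane id within the warp.
__device__ void warp_reduce_atomic_add_sketch(float* dest, float val)
{
    // Fold the warp's 32 partial sums down to lane 0 with shuffles.
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    // One atomicAdd per warp instead of 32 per-thread atomics.
    if (threadIdx.x == 0)
        atomicAdd(dest, val);
}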
@@ -174,7 +176,28 @@ namespace dlib
         )
         {
             out.set_size(lhs.num_samples());
-            launch_kernel(_cuda_dot_prods, max_jobs(lhs.size()), out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
+            dim3 blocks(10,1);
+            dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
+            _cuda_dot_prods<<<blocks,threads>>>(out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
         }
 
     // ----------------------------------------------------------------------------------------
 
+        __global__ void _cuda_scale_columns(float* out, const float* m, const float* v, size_t nr, size_t nc)
+        {
+            for (auto j : grid_stride_range(0, nr*nc))
+            {
+                out[j] = m[j]*v[j%nc];
+            }
+        }
+
+        void scale_columns (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        )
+        {
+            launch_kernel(_cuda_scale_columns, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
+        }
+
+    // ----------------------------------------------------------------------------------------
......
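The new _cuda_scale_columns kernel above relies on dlib's grid_stride_range() helper. For readers unfamiliar with it, the loop is equivalent to the standard CUDA grid-stride idiom sketched below (a plain-CUDA rendering, not dlib's code), which is what lets launch_kernel() pick a fixed launch size and still cover an array of any length:

// Plain-CUDA equivalent of the grid_stride_range(0, nr*nc) loop: each
// thread starts at its global index and strides by the total number of
// threads in the grid until all nr*nc elements are covered.
__global__ void scale_columns_sketch(float* out, const float* m,
                                     const float* v, size_t nr, size_t nc)
{
    for (size_t j = blockIdx.x*blockDim.x + threadIdx.x;
         j < nr*nc;
         j += (size_t)gridDim.x*blockDim.x)
    {
        out[j] = m[j]*v[j%nc];  // element j lives in column j%nc
    }
}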
@@ -120,6 +120,12 @@ namespace dlib
             const tensor& rhs
         );
 
+        void scale_columns (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        );
+
         void scale_rows (
             tensor& out,
             const tensor& m,
......
@@ -69,6 +69,27 @@ namespace dlib { namespace tt
 #endif
     }
 
+    void scale_columns (
+        tensor& out,
+        const tensor& m,
+        const tensor& v
+    )
+    {
+        DLIB_CASSERT(have_same_dimensions(out,m));
+        DLIB_CASSERT(is_vector(v));
+        if (m.size() == 0 && v.size() == 0)
+            return;
+        DLIB_CASSERT(m.size() != 0);
+        DLIB_CASSERT(m.size()/m.num_samples() == v.size());
+
+#ifdef DLIB_USE_CUDA
+        cuda::scale_columns(out, m, v);
+#else
+        DLIB_CASSERT(false, "shouldn't be called right now");
+        out = scale_columns(mat(m), mat(v));
+#endif
+    }
+
     void scale_rows (
         tensor& out,
         const tensor& m,
@@ -76,6 +97,11 @@ namespace dlib { namespace tt
     )
     {
         DLIB_CASSERT(have_same_dimensions(out,m));
+        DLIB_CASSERT(is_vector(v));
+        if (m.size() == 0 && v.size() == 0)
+            return;
+        DLIB_CASSERT(m.size() != 0);
+        DLIB_CASSERT(m.num_samples() == v.size());
 #ifdef DLIB_USE_CUDA
         cuda::scale_rows(out, m, v);
......
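Both wrappers fall back to the long-standing dlib::matrix versions of scale_columns()/scale_rows() on the CPU path (though the scale_columns branch additionally asserts for now, per the "shouldn't be called right now" note), so the intended semantics can be checked with plain matrix objects. A small worked example with illustrative values:

#include <dlib/matrix.h>
using namespace dlib;

int main()
{
    matrix<float> m(2,3);
    m = 1, 2, 3,
        4, 5, 6;
    matrix<float,1,3> v;
    v = 10, 20, 30;

    // Column j of m is multiplied by v(j):
    // out == [10 40 90; 40 100 180]
    matrix<float> out = scale_columns(m, v);
}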
@@ -48,6 +48,20 @@ namespace dlib { namespace tt
             - #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
     !*/
 
+    void scale_columns (
+        tensor& out,
+        const tensor& m,
+        const tensor& v
+    );
+    /*!
+        requires
+            - have_same_dimensions(out,m) == true
+            - is_vector(v) == true
+            - v.size() == mat(m).nc()
+        ensures
+            - performs: out = scale_columns(mat(m),mat(v));
+    !*/
+
     void scale_rows (
         tensor& out,
         const tensor& m,
@@ -56,7 +70,7 @@ namespace dlib { namespace tt
     /*!
         requires
             - have_same_dimensions(out,m) == true
-            - is_vector(mat(v)) == true
+            - is_vector(v) == true
             - v.size() == m.num_samples()
         ensures
            - performs: out = scale_rows(mat(m),mat(v));
@@ -75,7 +89,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(out,m1) == true
            - have_same_dimensions(out,m2) == true
            - have_same_dimensions(v1,v2) == true
-           - is_vector(mat(v1)) == true
+           - is_vector(v1) == true
            - v1.size() == m1.num_samples()
        ensures
            - performs:
......
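Putting the spec together, a hedged usage sketch of the new routine (note that in this commit the non-CUDA branch of tt::scale_columns() deliberately asserts, so this assumes a DLIB_USE_CUDA build; the tensor sizes and fill values are illustrative only):

#include <dlib/dnn.h>
using namespace dlib;

int main()
{
    resizable_tensor m(2,3);   // mat(m) is 2 rows x 3 columns
    resizable_tensor v(1,3);   // one scale factor per column of mat(m)
    resizable_tensor out;
    out.copy_size(m);          // satisfies have_same_dimensions(out,m)

    m = 1;                     // set every element of m to 1
    v = 2;                     // scale every column by 2

    // out == scale_columns(mat(m), mat(v)), i.e. all elements become 2.
    tt::scale_columns(out, m, v);
}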