Commit 669a1e17 authored by Davis King

Added affine_transform_conv() and multiply_conv() as well as a CPU implementation of assign_conv_bias_gradient().
parent e44b2aa2
@@ -54,6 +54,57 @@ namespace dlib
}
}
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
auto d = dest.host();
auto s1 = src1.host();
auto s2 = src2.host();
if (have_same_dimensions(dest,src1))
{
DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),"");
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ = (*s1++)*s2[k];
}
}
}
}
}
else
{
DLIB_CASSERT(have_same_dimensions(src1,src2),"");
DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),"");
for (long k = 0; k < src1.k(); ++k)
d[k] = 0;
for (long n = 0; n < src1.num_samples(); ++n)
{
for (long k = 0; k < src1.k(); ++k)
{
for (long r = 0; r < src1.nr(); ++r)
{
for (long c = 0; c < src1.nc(); ++c)
{
d[k] += (*s1++)*(*s2++);
}
}
}
}
}
}
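A minimal usage sketch (not part of this commit) of the first multiply_conv mode, assuming dlib's resizable_tensor API and the tt:: wrapper added further down in tensor_tools: every element of src is scaled by the factor stored for its channel.

// Sketch: per-channel scaling with multiply_conv (hypothetical example, not in the commit).
#include <dlib/dnn.h>

void multiply_conv_scaling_example()
{
    using namespace dlib;
    resizable_tensor src(2,3,4,4);     // n=2 samples, k=3 channels, 4x4 planes
    resizable_tensor scales(1,3,1,1);  // one factor per channel
    resizable_tensor dest;
    dest.copy_size(src);

    src = 1;                           // fill src with ones
    scales.host()[0] = 0.5f;
    scales.host()[1] = 1.0f;
    scales.host()[2] = 2.0f;

    // dest has the same dimensions as src, so this takes the first branch above:
    // dest(n,k,r,c) = src(n,k,r,c)*scales(k).
    tt::multiply_conv(dest, src, scales);
}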
void add(
float beta,
tensor& dest,
@@ -196,6 +247,44 @@ namespace dlib
}
}
// ------------------------------------------------------------------------------------
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
)
{
DLIB_CASSERT(
grad.num_samples() == 1 &&
grad.k() >= 1 &&
grad.nr() == 1 &&
grad.nc() == 1 &&
gradient_input.k() == grad.k() &&
gradient_input.size() > 0 &&
is_same_object(grad,gradient_input) == false
,"");
auto g = grad.host();
auto gi = gradient_input.host();
for (long k = 0; k < gradient_input.k(); ++k)
g[k] = 0;
for (long n = 0; n < gradient_input.num_samples(); ++n)
{
for (long k = 0; k < gradient_input.k(); ++k)
{
for (long r = 0; r < gradient_input.nr(); ++r)
{
for (long c = 0; c < gradient_input.nc(); ++c)
{
g[k] += (*gi++);
}
}
}
}
}
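The loops above just accumulate gradient_input over samples and spatial positions, one total per channel. A hypothetical check (not part of the commit), assuming the cpu:: entry point is reachable through dlib/dnn.h:

// Sketch: grad(k) ends up holding the sum over {n,r,c} of gradient_input(n,k,r,c).
#include <dlib/dnn.h>

void conv_bias_gradient_example()
{
    using namespace dlib;
    resizable_tensor gradient_input(2,3,5,5);
    resizable_tensor grad(1,3,1,1);
    gradient_input = 1;                 // every element is 1

    cpu::assign_conv_bias_gradient(grad, gradient_input);
    // each channel sums 2*5*5 = 50 ones, so grad.host()[k] == 50 for k = 0,1,2
}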
// -----------------------------------------------------------------------------------
void affine_transform(
@@ -293,6 +382,41 @@ namespace dlib
}
}
// -----------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
)
{
DLIB_CASSERT(have_same_dimensions(dest,src),"");
DLIB_CASSERT(have_same_dimensions(A,B),"");
DLIB_CASSERT(A.num_samples() == 1 &&
A.nr() == 1 &&
A.nc() == 1 &&
A.k() == src.k(), "");
auto d = dest.host();
auto s = src.host();
const auto a = A.host();
const auto b = B.host();
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ = a[k]*(*s++) + b[k];
}
}
}
}
}
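A small usage sketch (not part of the commit): the per-channel affine y = A(k)*x + B(k) applied here is the same shape of computation a batch-norm layer performs at inference time.

// Sketch: per-channel affine transform (hypothetical example, not in the commit).
#include <dlib/dnn.h>

void affine_transform_conv_example()
{
    using namespace dlib;
    resizable_tensor src(4,8,16,16), dest;
    dest.copy_size(src);
    resizable_tensor A(1,8,1,1), B(1,8,1,1);
    A = 2;   // per-channel gain
    B = -1;  // per-channel offset
    src = 3;

    cpu::affine_transform_conv(dest, src, A, B);
    // every element of dest is now 2*3 + (-1) == 5
}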
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
@@ -1238,7 +1362,6 @@ namespace dlib
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
...
@@ -25,6 +25,12 @@ namespace dlib
const tensor& src2
);
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
);
void add(
float beta,
tensor& dest,
@@ -43,6 +49,11 @@ namespace dlib
const tensor& src2
);
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
// -----------------------------------------------------------------------------------
void affine_transform(
@@ -81,6 +92,15 @@ namespace dlib
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
...
@@ -94,6 +94,68 @@ namespace dlib
}
}
// ------------------------------------------------------------------------------------
__global__ void _cuda_multiply_conv(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
{
for (auto i : grid_stride_range(0, n))
{
auto k = (i/bs)%ks;
d[i] = s1[i]*s2[k];
}
}
__global__ void _cuda_multiply_conv2(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
{
// zero initialize d before we begin.
for (auto i : grid_stride_range(0, ks))
d[i] = 0;
__syncthreads();
// loop over all the image planes
for (auto i : grid_stride_range_y(0, n))
{
// sum all the elements in the i-th image plane
float temp = 0;
for (auto j : grid_stride_range(i*bs, (i+1)*bs))
temp += s1[j]*s2[j];
auto k = i%ks;
// and store the sum into d[k]
warp_reduce_atomic_add(d[k], temp);
}
}
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
if (have_same_dimensions(dest,src1))
{
DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),"");
if (dest.size() == 0)
return;
launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
}
else
{
DLIB_CASSERT(have_same_dimensions(src1,src2),"");
DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),"");
if (dest.size() == 0)
return;
dim3 blocks(10,1);
dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
_cuda_multiply_conv2<<<blocks,threads>>>(
dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
}
}
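Both _cuda_multiply_conv above and _cuda_affine_transform_conv below rely on the same index trick: for a row-major (n,k,r,c) tensor, bs is nr*nc and ks is k(), so the channel a flat offset falls in can be computed directly without materializing the 4-D indices. A small sketch of just that mapping (not part of the commit):

// Channel index for a flat offset into an (n,k,r,c) row-major tensor.
#include <cstddef>

inline std::size_t channel_of(std::size_t i, std::size_t bs, std::size_t ks)
{
    // same expression the kernels use: (i/bs)%ks
    return (i / bs) % ks;
}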
// ------------------------------------------------------------------------------------
__global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
@@ -302,6 +364,32 @@ namespace dlib
}
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_affine_transform_conv(float* d, const float* s, size_t n, const float* A, const float* B, size_t bs, size_t ks)
{
for (auto i : grid_stride_range(0, n))
{
auto k = (i/bs)%ks;
d[i] = A[k]*s[i] + B[k];
}
}
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
)
{
DLIB_CASSERT(have_same_dimensions(dest, src),"");
DLIB_CASSERT(have_same_dimensions(A, B),"");
DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 && A.nc() == 1 && A.k() == src.k(),"");
launch_kernel(_cuda_affine_transform_conv,max_jobs(dest.size()),
dest.device(), src.device(), src.size(), A.device(), B.device(), src.nr()*src.nc(), src.k());
}
// -----------------------------------------------------------------------------------
__global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
...
@@ -30,6 +30,12 @@ namespace dlib
const tensor& src2
);
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
);
void add (
tensor& dest,
const tensor& src1,
@@ -82,6 +88,15 @@ namespace dlib
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
void assign_bias_gradient (
...
@@ -115,6 +115,19 @@ namespace dlib { namespace tt
}
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
#ifdef DLIB_USE_CUDA
cuda::multiply_conv(dest, src1, src2);
#else
cpu::multiply_conv(dest, src1, src2);
#endif
}
// ----------------------------------------------------------------------------------------
void affine_transform(
@@ -181,6 +194,22 @@ namespace dlib { namespace tt
#endif
}
// ----------------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
)
{
#ifdef DLIB_USE_CUDA
cuda::affine_transform_conv(dest,src,A,B);
#else
cpu::affine_transform_conv(dest,src,A,B);
#endif
}
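When dlib is built with DLIB_USE_CUDA, a natural way to exercise these wrappers is to compare the tt:: (CUDA) result against the cpu:: reference, the pattern dlib's dnn tests generally use. A hedged sketch, assuming tt::tensor_rand and mat() behave as elsewhere in the library:

// Sketch: compare the dispatched implementation against the CPU reference.
#include <dlib/dnn.h>

void compare_affine_transform_conv()
{
    using namespace dlib;
    resizable_tensor src(3,5,7,7), A(1,5,1,1), B(1,5,1,1);
    tt::tensor_rand rnd;
    rnd.fill_uniform(src);
    rnd.fill_uniform(A);
    rnd.fill_uniform(B);

    resizable_tensor out1, out2;
    out1.copy_size(src);
    out2.copy_size(src);

    tt::affine_transform_conv(out1, src, A, B);   // CUDA path if available
    cpu::affine_transform_conv(out2, src, A, B);  // CPU reference

    const float err = max(abs(mat(out1) - mat(out2)));
    DLIB_CASSERT(err < 1e-5, "results should agree");
}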
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
@@ -362,8 +391,7 @@ namespace dlib { namespace tt
#ifdef DLIB_USE_CUDA
cuda::assign_conv_bias_gradient(grad,gradient_input);
#else
cpu::assign_conv_bias_gradient(grad,gradient_input);
#endif
}
...
@@ -118,6 +118,34 @@ namespace dlib { namespace tt
with num_samples()==1 which is then assigned to #dest.
!*/
void multiply_conv (
tensor& dest,
const tensor& src1,
const tensor& src2
);
/*!
requires
- if (have_same_dimensions(dest, src1) == true) then
- src2.num_samples() == 1
- src2.nr() == 1
- src2.nc() == 1
- src2.k() == src1.k()
- else
- have_same_dimensions(src1, src2) == true
- dest.num_samples() == 1
- dest.nr() == 1
- dest.nc() == 1
- dest.k() == src1.k()
ensures
- Performs #dest == src1*src2
In particular, if the elements of dest, src1, and src2 were indexed by (n,k,r,c) then
we would have:
- if (have_same_dimensions(dest,src1)) then
#dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
- else
#dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
!*/
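A worked instance of the second branch (a sketch, not part of the spec): when dest holds one value per channel, multiply_conv contracts src1 and src2 over samples and spatial positions.

// Sketch: the reduction form of multiply_conv.
#include <dlib/dnn.h>

void multiply_conv_reduction_example()
{
    using namespace dlib;
    resizable_tensor src1(2,3,2,2), src2(2,3,2,2);
    resizable_tensor dest(1,3,1,1);
    src1 = 2;
    src2 = 3;

    tt::multiply_conv(dest, src1, src2);
    // each channel sums 2*2*2 = 8 products of 2*3, so dest.host()[k] == 48
}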
// ----------------------------------------------------------------------------------------
void affine_transform(
@@ -196,6 +224,29 @@ namespace dlib { namespace tt
- #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i]
!*/
// ----------------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
/*!
requires
- have_same_dimensions(dest,src) == true
- have_same_dimensions(A, B) == true
- A.num_samples() == 1
- A.nr() == 1
- A.nc() == 1
- A.k() == src.k()
ensures
- Performs #dest == A*src + B
In particular, if the elements of dest and src were indexed by (n,k,r,c) then
we would have:
#dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k).
!*/
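One place a per-channel affine like this shows up is when batch-norm statistics are folded into a single transform for inference. A hedged sketch (the gamma/beta/mean/variance tensors and names are hypothetical, not from this commit):

// Sketch: fold batch-norm parameters into the A and B that affine_transform_conv expects,
// using A(k) = gamma(k)/sqrt(var(k)+eps) and B(k) = beta(k) - mean(k)*A(k).
#include <cmath>
#include <dlib/dnn.h>

void fold_bn_params(
    dlib::resizable_tensor& A,
    dlib::resizable_tensor& B,
    const dlib::tensor& gamma,
    const dlib::tensor& beta,
    const dlib::tensor& running_mean,
    const dlib::tensor& running_var,
    float eps = 1e-5f
)
{
    A.copy_size(gamma);
    B.copy_size(beta);
    for (size_t k = 0; k < gamma.size(); ++k)
    {
        const float scale = gamma.host()[k] / std::sqrt(running_var.host()[k] + eps);
        A.host()[k] = scale;
        B.host()[k] = beta.host()[k] - running_mean.host()[k]*scale;
    }
}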
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
...