Commit 7078cfaf authored by Davis King

Added an "add_to" option to tt:copy_tensor(). There was also a bug in the

concat layer's backward() method.  It was assigning the gradient to previous
layers instead of adding the gradient, as required by the layer interface
specification.  This change also noticeably speeds up concat layers since only
one CUDA kernel launch now happens per concat operation, rather than one
kernel launch for each sample in a tensor.
parent 89c9267e
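For a quick sense of the new semantics before the diff, here is a minimal, illustrative sketch (not part of the commit; the function name and tensor shapes are made up for illustration) showing how the add_to flag changes tt::copy_tensor() between the forward and backward paths of a concat layer:

#include <dlib/dnn.h>

void copy_tensor_add_to_sketch()
{
    using namespace dlib;
    // Hypothetical shapes, chosen only for illustration.
    resizable_tensor out(2, 8, 5, 5);    // concatenated output: 8 channels
    resizable_tensor branch(2, 3, 5, 5); // one branch: 3 channels

    // Forward/concat direction: overwrite channels [0, branch.k()) of out
    // with the branch output.
    tt::copy_tensor(false, out, 0, branch, 0, branch.k());

    // Backward/split direction: the layer interface requires gradients to be
    // ADDED to get_gradient_input(), so the slice is accumulated, not assigned.
    tt::copy_tensor(true, branch, 0, out, 0, branch.k());
}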
@@ -2023,6 +2023,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib
        for (long i = 0; i < src.num_samples(); ++i)
        {
-            ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            if (add_to)
+            {
+                for (size_t j = 0; j < block_size; ++j)
+                    dest_p[j] += src_p[j];
+            }
+            else
+            {
+                ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            }
            dest_p += dest_sample_size;
            src_p += src_sample_size;
......
@@ -445,6 +445,7 @@ namespace dlib
    // -----------------------------------------------------------------------------------
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
......
@@ -1377,12 +1377,33 @@ namespace dlib
    // ----------------------------------------------------------------------------------------
+    __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+    {
+        for(auto i : grid_stride_range(0, size))
+        {
+            size_t blk = i/block_size;
+            size_t j = i%block_size;
+            dest[blk*dest_stride + j] += src[blk*src_stride + j];
+        }
+    }
+
+    __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+    {
+        for(auto i : grid_stride_range(0, size))
+        {
+            size_t blk = i/block_size;
+            size_t j = i%block_size;
+            dest[blk*dest_stride + j] = src[blk*src_stride + j];
+        }
+    }
+
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
            size_t src_k_offset,
            size_t count_k
    )
    {
        const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
        float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
        const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
-        for (long i = 0; i < src.num_samples(); ++i)
-        {
-            CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
-            dest_p += dest_sample_size;
-            src_p += src_sample_size;
-        }
+        if (add_to)
+        {
+            launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+                          dest_p, block_size*dest.num_samples(),
+                          src_p, dest_sample_size, src_sample_size, block_size);
+        }
+        else
+        {
+            launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+                          dest_p, block_size*dest.num_samples(),
+                          src_p, dest_sample_size, src_sample_size, block_size);
+        }
    }
......
@@ -369,6 +369,7 @@ namespace dlib
    // ----------------------------------------------------------------------------------------
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
......
@@ -2604,7 +2604,7 @@ namespace dlib
        static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
            k_offset += t.k();
            concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
        }
@@ -2612,7 +2612,7 @@ namespace dlib
        static void split(const tensor& input, SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
            k_offset += t.k();
            concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
        }
@@ -2635,13 +2635,13 @@ namespace dlib
        static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
        }
        template<typename SUBNET>
        static void split(const tensor& input, SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
        }
    };
}
......
@@ -881,6 +881,7 @@ namespace dlib { namespace tt
    // ------------------------------------------------------------------------------------
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
    )
    {
#ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
#else
-        cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
#endif
    }
......
@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
    // ----------------------------------------------------------------------------------------
    void copy_tensor(
+            bool add_to,
            tensor& dest,
            size_t dest_k_offset,
            const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
            - is_same_object(dest,src) == false
            - The memory areas of src and dest do not overlap.
        ensures
-            - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-              Copies content of each sample from src in to corresponding place of sample at dest.
+            - if (add_to) then
+                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., adds content of each sample from src in to corresponding place of sample at dest.
+            - else
+                - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., copies content of each sample from src in to corresponding place of sample at dest.
    !*/
    // ----------------------------------------------------------------------------------------
......
@@ -1794,9 +1794,9 @@ namespace
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);
-        cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
        for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@ namespace
        rnd.fill_gaussian(src1);
        rnd.fill_gaussian(src2);
        rnd.fill_gaussian(src3);
-        cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
        for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@ namespace
        auto& b3o = layer<itag3>(net).get_output();
        resizable_tensor dest(10, 14, 111, 222);
-        copy_tensor(dest, 0, b1o, 0, b1o.k());
-        copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
-        copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
+        copy_tensor(false, dest, 0, b1o, 0, b1o.k());
+        copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
+        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
        DLIB_TEST(dest.size() == out.size());
        int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@ namespace
        resizable_tensor g2(10, 8, 111, 222);
        resizable_tensor g3(10, 1, 111, 222);
-        copy_tensor(g1, 0, gr, 0, g1.k());
-        copy_tensor(g2, 0, gr, g1.k(), g2.k());
-        copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
+        copy_tensor(false, g1, 0, gr, 0, g1.k());
+        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
+        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());
        DLIB_TEST(g1.size() == b1g.size());
        error = memcmp(g1.host(), b1g.host(), b1g.size());
        DLIB_TEST(error == 0);
......