Commit 7078cfaf authored by Davis King

Added an "add_to" option to tt::copy_tensor(). There was also a bug in the
concat layer's backward() method. It was assigning the gradient to previous
layers instead of adding the gradient, as required by the layer interface
specification. This change also noticeably speeds up concat layers, since only
one CUDA kernel launch now happens per concat operation rather than one
kernel launch for each sample in a tensor.
parent 89c9267e
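
For context, the sketch below is not part of the commit (the tensor sizes and values are made up) and simply illustrates why the concat layer's split() path now passes add_to == true: a tagged layer's get_gradient_input() may already hold contributions from other consumers, and the layer interface requires backward() to add to it rather than overwrite it.

// Minimal usage sketch of the new flag, assuming dlib built with this commit
// applied.  tt::copy_tensor dispatches to the CUDA or CPU backend either way.
#include <dlib/dnn.h>
#include <iostream>

int main()
{
    using namespace dlib;

    resizable_tensor grad_in(2, 4, 3, 3);      // a tagged layer's get_gradient_input()
    resizable_tensor contribution(2, 4, 3, 3); // one downstream layer's gradient

    grad_in = 1;       // pretend another consumer already deposited its gradient
    contribution = 2;

    // add_to == true accumulates; add_to == false (the old behavior of the
    // concat backward pass) would have overwritten the existing values.
    tt::copy_tensor(true, grad_in, 0, contribution, 0, contribution.k());

    std::cout << grad_in.host()[0] << std::endl;  // prints 3
    return 0;
}
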
@@ -2023,6 +2023,7 @@ namespace dlib
 // ------------------------------------------------------------------------------------
     void copy_tensor(
+        bool add_to,
         tensor& dest,
         size_t dest_k_offset,
         const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib
         for (long i = 0; i < src.num_samples(); ++i)
         {
-            ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            if (add_to)
+            {
+                for (size_t j = 0; j < block_size; ++j)
+                    dest_p[j] += src_p[j];
+            }
+            else
+            {
+                ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            }
             dest_p += dest_sample_size;
             src_p += src_sample_size;
......
@@ -445,6 +445,7 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void copy_tensor(
+        bool add_to,
         tensor& dest,
         size_t dest_k_offset,
         const tensor& src,
......
@@ -1377,12 +1377,33 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
+    __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+    {
+        for(auto i : grid_stride_range(0, size))
+        {
+            size_t blk = i/block_size;
+            size_t j = i%block_size;
+            dest[blk*dest_stride + j] += src[blk*src_stride + j];
+        }
+    }
+    __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+    {
+        for(auto i : grid_stride_range(0, size))
+        {
+            size_t blk = i/block_size;
+            size_t j = i%block_size;
+            dest[blk*dest_stride + j] = src[blk*src_stride + j];
+        }
+    }
     void copy_tensor(
-        tensor& dest,
-        size_t dest_k_offset,
-        const tensor& src,
-        size_t src_k_offset,
-        size_t count_k
+        bool add_to,
+        tensor& dest,
+        size_t dest_k_offset,
+        const tensor& src,
+        size_t src_k_offset,
+        size_t count_k
     )
     {
         const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
         float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
         const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
-        for (long i = 0; i < src.num_samples(); ++i)
+        if (add_to)
         {
-            CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
-            dest_p += dest_sample_size;
-            src_p += src_sample_size;
+            launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+                          dest_p, block_size*dest.num_samples(),
+                          src_p, dest_sample_size, src_sample_size, block_size);
         }
+        else
+        {
+            launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+                          dest_p, block_size*dest.num_samples(),
+                          src_p, dest_sample_size, src_sample_size, block_size);
+        }
     }
......
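
For readers unfamiliar with grid-stride kernels, here is a host-side reference of the index arithmetic the two kernels above perform on the GPU (illustration only, not part of the commit; the function name is hypothetical). A single flat range of block_size*num_samples elements is decomposed into a sample index and an offset within that sample's block, which is what lets one kernel launch replace the old per-sample cudaMemcpyAsync loop.

// Host-side equivalent of _cuda_copy_tensor / _cuda_copy_tensor_add_to.
#include <cstddef>

void copy_blocks_reference(
    float* dest, std::size_t size, const float* src,
    std::size_t dest_stride, std::size_t src_stride,
    std::size_t block_size, bool add_to
)
{
    for (std::size_t i = 0; i < size; ++i)
    {
        const std::size_t blk = i / block_size;  // which sample
        const std::size_t j   = i % block_size;  // position inside that sample's block
        if (add_to)
            dest[blk*dest_stride + j] += src[blk*src_stride + j];
        else
            dest[blk*dest_stride + j]  = src[blk*src_stride + j];
    }
}
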
@@ -369,6 +369,7 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
     void copy_tensor(
+        bool add_to,
         tensor& dest,
         size_t dest_k_offset,
         const tensor& src,
......
@@ -2604,7 +2604,7 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
         }
@@ -2612,7 +2612,7 @@ namespace dlib
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
         }
@@ -2635,13 +2635,13 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
         }
         template<typename SUBNET>
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
         }
     };
 }
......
@@ -881,6 +881,7 @@ namespace dlib { namespace tt
 // ------------------------------------------------------------------------------------
     void copy_tensor(
+        bool add_to,
         tensor& dest,
         size_t dest_k_offset,
         const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #else
-        cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #endif
     }
......
@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
     void copy_tensor(
+        bool add_to,
         tensor& dest,
         size_t dest_k_offset,
         const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
             - is_same_object(dest,src) == false
             - The memory areas of src and dest do not overlap.
         ensures
-            - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-              Copies content of each sample from src in to corresponding place of sample at dest.
+            - if (add_to) then
+                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., adds content of each sample from src in to corresponding place of sample at dest.
+            - else
+                - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., copies content of each sample from src in to corresponding place of sample at dest.
     !*/
 // ----------------------------------------------------------------------------------------
......
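
A small sketch of the channel-offset semantics the spec above describes (hypothetical sizes, not from the commit; assumes dlib with this change applied): stacking the k channels of two tensors into one destination, much like the concat layer's forward pass and the tests below.

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;
    resizable_tensor a(5, 3, 7, 7);                // 3 channels per sample
    resizable_tensor b(5, 4, 7, 7);                // 4 channels per sample
    resizable_tensor dest(5, a.k() + b.k(), 7, 7); // 7 channels total

    tt::copy_tensor(false, dest, 0,     a, 0, a.k()); // fills dest channels [0,3)
    tt::copy_tensor(false, dest, a.k(), b, 0, b.k()); // fills dest channels [3,7)
    return 0;
}
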
@@ -1794,9 +1794,9 @@ namespace
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
-        cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@ namespace
         rnd.fill_gaussian(src1);
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
-        cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@ namespace
         auto& b3o = layer<itag3>(net).get_output();
         resizable_tensor dest(10, 14, 111, 222);
-        copy_tensor(dest, 0, b1o, 0, b1o.k());
-        copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
-        copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
+        copy_tensor(false, dest, 0, b1o, 0, b1o.k());
+        copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
+        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
         DLIB_TEST(dest.size() == out.size());
         int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@ namespace
         resizable_tensor g2(10, 8, 111, 222);
         resizable_tensor g3(10, 1, 111, 222);
-        copy_tensor(g1, 0, gr, 0, g1.k());
-        copy_tensor(g2, 0, gr, g1.k(), g2.k());
-        copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
+        copy_tensor(false, g1, 0, gr, 0, g1.k());
+        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
+        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());
         DLIB_TEST(g1.size() == b1g.size());
         error = memcmp(g1.host(), b1g.host(), b1g.size());
         DLIB_TEST(error == 0);
......