Added an "add_to" option to tt::copy_tensor(). There was also a bug in the
concat layer's backward() method: it was assigning the gradient to previous layers instead of adding to it, as required by the layer interface specification. This change also noticeably speeds up concat layers, since only one CUDA kernel launch now happens per concat operation, rather than one kernel launch for each sample in a tensor.