Commit 8c550d4c authored by Davis E. King's avatar Davis E. King

Merge pull request #114 from e-fominov/dnn_group_layer

Concat layer
parents cbd37d56 01b3b08b
...@@ -536,7 +536,7 @@ namespace dlib ...@@ -536,7 +536,7 @@ namespace dlib
subnet_wrapper(const subnet_wrapper&) = delete; subnet_wrapper(const subnet_wrapper&) = delete;
subnet_wrapper& operator=(const subnet_wrapper&) = delete; subnet_wrapper& operator=(const subnet_wrapper&) = delete;
subnet_wrapper(T& l_) {} subnet_wrapper(T& /*l_*/) {}
// Nothing here because in this case T is one of the input layer types // Nothing here because in this case T is one of the input layer types
// that doesn't have anything in it. // that doesn't have anything in it.
}; };
...@@ -600,7 +600,7 @@ namespace dlib ...@@ -600,7 +600,7 @@ namespace dlib
struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {}; struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {};
template <typename LAYER_DETAILS, typename SUBNET> template <typename LAYER_DETAILS, typename SUBNET>
class add_layer<LAYER_DETAILS,SUBNET, class add_layer<LAYER_DETAILS,SUBNET,
typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type> typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
{ {
public: public:
......
...@@ -1783,6 +1783,36 @@ namespace dlib ...@@ -1783,6 +1783,36 @@ namespace dlib
filters_gradient += gi*temp; filters_gradient += gi*temp;
} }
} }
// ------------------------------------------------------------------------------------
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
)
{
    /*
        CPU implementation.  For every sample, copies count_k channel planes
        from src (starting at channel src_k_offset) into dest (starting at
        channel dest_k_offset).  Both tensors must have the same
        num_samples(), nr() and nc().
    */
    // per-sample strides, in floats
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
    // one channel plane is nr()*nc() floats; we move count_k contiguous planes per sample
    const size_t block_size = count_k * dest.nc() * dest.nr();

    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
        dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
    // Written additively on the size_t side: the original form
    // (dest.k() - dest_k_offset >= count_k) converts the long k() to size_t,
    // so an offset larger than k() wraps around and the check silently passes.
    DLIB_CASSERT(dest_k_offset + count_k <= static_cast<size_t>(dest.k()), "Not enough space in dest tensor");
    DLIB_CASSERT(src_k_offset + count_k <= static_cast<size_t>(src.k()), "Not enough space in src tensor");

    float* dest_p = dest.host() + dest_k_offset * dest.nc() * dest.nr();
    const float* src_p = src.host() + src_k_offset * src.nc() * src.nr();

    // num_samples() returns long, so use a signed counter to avoid a
    // signed/unsigned comparison.
    for (long i = 0; i < src.num_samples(); ++i)
    {
        ::memcpy(dest_p, src_p, block_size * sizeof(float));
        dest_p += dest_sample_size;
        src_p += src_sample_size;
    }
}
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
......
...@@ -385,6 +385,14 @@ namespace dlib ...@@ -385,6 +385,14 @@ namespace dlib
}; };
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
// Copies, for every sample, count_k channel planes from src (starting at
// channel src_k_offset) into dest (starting at channel dest_k_offset).
// Both tensors must have matching num_samples(), nr() and nc().
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
);
// -----------------------------------------------------------------------------------
} }
} }
......
...@@ -796,7 +796,38 @@ namespace dlib ...@@ -796,7 +796,38 @@ namespace dlib
grad.device(), src.device(), gradient_input.device(), grad.size(), grad.device(), src.device(), gradient_input.device(), grad.size(),
param.device(), params_grad.device()); param.device(), params_grad.device());
} }
// ----------------------------------------------------------------------------------------
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
)
{
    /*
        CUDA implementation.  For every sample, copies count_k channel planes
        from src (starting at channel src_k_offset) into dest (starting at
        channel dest_k_offset) with a device-to-device memcpy.
    */
    // per-sample strides, in floats
    const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
    const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
    // one channel plane is nr()*nc() floats; we move count_k contiguous planes per sample
    const size_t block_size = count_k * dest.nc() * dest.nr();

    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
        dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
    // Additive form avoids the unsigned wrap-around that the subtraction form
    // (k() - offset) would silently accept when the offset exceeds k().
    DLIB_CASSERT(dest_k_offset + count_k <= static_cast<size_t>(dest.k()), "Not enough space in dest tensor");
    DLIB_CASSERT(src_k_offset + count_k <= static_cast<size_t>(src.k()), "Not enough space in src tensor");

    float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
    const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();

    for (long i = 0; i < src.num_samples(); ++i)
    {
        CHECK_CUDA(cudaMemcpy(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
        dest_p += dest_sample_size;
        src_p += src_sample_size;
    }
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
} }
......
...@@ -258,6 +258,13 @@ namespace dlib ...@@ -258,6 +258,13 @@ namespace dlib
tensor& params_grad tensor& params_grad
); );
// CUDA counterpart of cpu::copy_tensor: copies count_k channel planes per
// sample from src (from channel src_k_offset) into dest (at channel
// dest_k_offset) using device memory.
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
);
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
......
...@@ -1836,6 +1836,199 @@ namespace dlib ...@@ -1836,6 +1836,199 @@ namespace dlib
template <typename SUBNET> template <typename SUBNET>
using softmax = add_layer<softmax_, SUBNET>; using softmax = add_layer<softmax_, SUBNET>;
// ----------------------------------------------------------------------------------------
namespace impl{
    // helper classes for layer concat processing
    //
    // concat_helper_impl walks the TAG_TYPES... parameter pack recursively.
    // The one-tag specialization is the base case; the multi-tag
    // specialization handles the first tag and recurses on the rest, carrying
    // a running channel offset / channel sum through the calls.
    template <template<typename> class... TAG_TYPES>
    struct concat_helper_impl {
    };

    // Base case: exactly one tag layer left in the pack.
    template <template<typename> class TAG_TYPE>
    struct concat_helper_impl<TAG_TYPE>{
        constexpr static size_t tag_count() {return 1;}

        // Sizes out to hold this tag's output plus sum_k extra channels
        // already accumulated from previously visited tags.
        template<typename SUBNET>
        static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
            out.set_size(t.num_samples(), t.k() + sum_k, t.nr(), t.nc());
        }

        // Copies this tag's output into out starting at channel k_offset.
        template<typename SUBNET>
        static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
            tt::copy_tensor(out, k_offset, t, 0, t.k());
        }

        // Copies the channel slice of input starting at k_offset into this
        // tag's gradient input (the inverse of concat()).
        template<typename SUBNET>
        static void split(const tensor& input, SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
            tt::copy_tensor(t, 0, input, k_offset, t.k());
        }
    };

    // Recursive case: process the first tag, then recurse on the remaining
    // tags with the channel offset advanced by the first tag's k().
    template <template<typename> class TAG_TYPE, template<typename> class... TAG_TYPES>
    struct concat_helper_impl<TAG_TYPE, TAG_TYPES...>{
        constexpr static size_t tag_count() {return 1 + concat_helper_impl<TAG_TYPES...>::tag_count();}

        template<typename SUBNET>
        static void resize_out(resizable_tensor& out, const SUBNET& sub, long sum_k)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
            concat_helper_impl<TAG_TYPES...>::resize_out(out, sub, sum_k + t.k());
        }
        template<typename SUBNET>
        static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_output();
            tt::copy_tensor(out, k_offset, t, 0, t.k());
            k_offset += t.k();
            concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
        }
        template<typename SUBNET>
        static void split(const tensor& input, SUBNET& sub, size_t k_offset)
        {
            auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
            tt::copy_tensor(t, 0, input, k_offset, t.k());
            k_offset += t.k();
            concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
        }
    };
}
// concat layer
//
// Concatenates the outputs of the given tagged layers along the channel (k)
// dimension.  See the abstract spec for the full contract.
template<
    template<typename> class... TAG_TYPES
    >
class concat_
{
public:
    // number of tag layers whose outputs get concatenated
    constexpr static size_t tag_count() {return impl::concat_helper_impl<TAG_TYPES...>::tag_count();};

    template <typename SUBNET>
    void setup (const SUBNET&)
    {
        // do nothing
    }

    // Builds output by stacking each tagged layer's output along k.
    template <typename SUBNET>
    void forward(const SUBNET& sub, resizable_tensor& output)
    {
        // the total depth of result is the sum of depths from all tags
        impl::concat_helper_impl<TAG_TYPES...>::resize_out(output, sub, 0);

        // copy output from each tag into a different part of the result
        impl::concat_helper_impl<TAG_TYPES...>::concat(output, sub, 0);
    }

    // Routes each channel slice of gradient_input back to the corresponding
    // tagged layer's gradient input.
    template <typename SUBNET>
    void backward(const tensor& gradient_input, SUBNET& sub, tensor&)
    {
        // The gradient is split into per-tag slices along k.
        impl::concat_helper_impl<TAG_TYPES...>::split(gradient_input, sub, 0);
    }

    const tensor& get_layer_params() const { return params; }
    tensor& get_layer_params() { return params; }

    friend void serialize(const concat_& item, std::ostream& out)
    {
        serialize("concat_", out);
        size_t count = tag_count();
        serialize(count, out);
    }

    friend void deserialize(concat_& item, std::istream& in)
    {
        std::string version;
        deserialize(version, in);
        if (version != "concat_")
            throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::concat_.");
        size_t count_tags;
        deserialize(count_tags, in);
        if (count_tags != tag_count())
            throw serialization_error("Invalid count of tags "+ std::to_string(count_tags) +", expecting " +
                std::to_string(tag_count()) +
                " found while deserializing dlib::concat_.");
    }

    friend std::ostream& operator<<(std::ostream& out, const concat_& item)
    {
        out << "concat\t ("
            << tag_count()
            << ")";
        return out;
    }

private:
    resizable_tensor params; // unused; concat has no learnable parameters
};
// concat layer definitions: convenience aliases for concatenating 1 to 5
// tagged layers.
template <template<typename> class TAG1, typename SUBNET>
using concat1 = add_layer<concat_<TAG1>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          typename SUBNET>
using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          typename SUBNET>
using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          template<typename> class TAG4,
          typename SUBNET>
using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          template<typename> class TAG4,
          template<typename> class TAG5,
          typename SUBNET>
using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;

// The inception templates below use tags internally.  If the user also used
// the ordinary tag layers there could be conflicts, so a dedicated set of
// tags (itag0..itag5) is declared specially for inceptions.
template <typename SUBNET> using itag0  = add_tag_layer< 1000 + 0, SUBNET>;
template <typename SUBNET> using itag1  = add_tag_layer< 1000 + 1, SUBNET>;
template <typename SUBNET> using itag2  = add_tag_layer< 1000 + 2, SUBNET>;
template <typename SUBNET> using itag3  = add_tag_layer< 1000 + 3, SUBNET>;
template <typename SUBNET> using itag4  = add_tag_layer< 1000 + 4, SUBNET>;
template <typename SUBNET> using itag5  = add_tag_layer< 1000 + 5, SUBNET>;
// skip to the inception input (the itag0-tagged layer)
template <typename SUBNET> using iskip  = add_skip_layer< itag0, SUBNET>;

// Templates for creating inception layer groups: each Bn block processes the
// itag0-tagged input and the branch outputs are concatenated along k.
template <template<typename>class B1,
          typename SUBNET>
using inception1 = concat1<itag1, itag1<B1<iskip< itag0<SUBNET>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          typename SUBNET>
using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          typename SUBNET>
using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3<  itag0<SUBNET>>>>>>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          template<typename>class B4,
          typename SUBNET>
using inception4 = concat4<itag1, itag2, itag3, itag4,
            itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4< itag0<SUBNET>>>>>>>>>>>>
            >;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          template<typename>class B4,
          template<typename>class B5,
          typename SUBNET>
using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
            itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4<iskip<  itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>
            >;
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
} }
......
...@@ -1652,6 +1652,116 @@ namespace dlib ...@@ -1652,6 +1652,116 @@ namespace dlib
using add_prev9_ = add_prev_<tag9>; using add_prev9_ = add_prev_<tag9>;
using add_prev10_ = add_prev_<tag10>; using add_prev10_ = add_prev_<tag10>;
// ----------------------------------------------------------------------------------------
template<
    template<typename> class... TAG_TYPES
    >
class concat_
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
            defined above.  This layer simply concatenates the outputs of the required
            tagged layers.  In particular, it copies each layer's output from TAG_TYPES
            into the corresponding place of the result tensor, thus producing the
            combined output.  The output of each tag layer is stored in a separate
            part of the final output.

            FORWARD:
                for each (tag in TAG_TYPES)
                    output[i, k + tag.k(), r, c] = layer<tag>(subnet).get_output[i, k, r, c]

            BACKWARD:
                for each (tag in TAG_TYPES)
                    layer<tag>(subnet).get_gradient_input[i, k, r, c] = input[i, k + tag.k(), r, c]

            This layer can only be used with tags inside.
            Each tagged layer must have identical num_samples, R, and C sizes.
            The output will have K = the sum of the tagged layers' k() values, and the
            output's num_samples, R, and C will be the same as the tagged layers'.
    !*/
public:
    template <typename SUBNET> void setup (const SUBNET& sub);
    template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
    template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
    const tensor& get_layer_params() const;
    tensor& get_layer_params();
    /*!
        These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
    !*/
};
// concat layer definitions: convenience aliases for concatenating 1 to 5
// tagged layers.
template <template<typename> class TAG1, typename SUBNET>
using concat1 = add_layer<concat_<TAG1>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          typename SUBNET>
using concat2 = add_layer<concat_<TAG1, TAG2>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          typename SUBNET>
using concat3 = add_layer<concat_<TAG1, TAG2, TAG3>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          template<typename> class TAG4,
          typename SUBNET>
using concat4 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4>, SUBNET>;

template <template<typename> class TAG1,
          template<typename> class TAG2,
          template<typename> class TAG3,
          template<typename> class TAG4,
          template<typename> class TAG5,
          typename SUBNET>
using concat5 = add_layer<concat_<TAG1, TAG2, TAG3, TAG4, TAG5>, SUBNET>;

// The inception templates below use tags internally.  If the user also used
// the ordinary tag layers there could be conflicts, so a dedicated set of
// tags (itag0..itag5) is declared specially for inceptions.
template <typename SUBNET> using itag0  = add_tag_layer< 1000 + 0, SUBNET>;
template <typename SUBNET> using itag1  = add_tag_layer< 1000 + 1, SUBNET>;
template <typename SUBNET> using itag2  = add_tag_layer< 1000 + 2, SUBNET>;
template <typename SUBNET> using itag3  = add_tag_layer< 1000 + 3, SUBNET>;
template <typename SUBNET> using itag4  = add_tag_layer< 1000 + 4, SUBNET>;
template <typename SUBNET> using itag5  = add_tag_layer< 1000 + 5, SUBNET>;
// skip to the inception input (the itag0-tagged layer)
template <typename SUBNET> using iskip  = add_skip_layer< itag0, SUBNET>;

// Templates for creating inception layer groups: each Bn block processes the
// itag0-tagged input and the branch outputs are concatenated along k.
template <template<typename>class B1,
          typename SUBNET>
using inception1 = concat1<itag1, itag1<B1<iskip< itag0<SUBNET>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          typename SUBNET>
using inception2 = concat2<itag1, itag2, itag1<B1<iskip< itag2<B2< itag0<SUBNET>>>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          typename SUBNET>
using inception3 = concat3<itag1, itag2, itag3, itag1<B1<iskip< itag2<B2<iskip< itag3<B3<  itag0<SUBNET>>>>>>>>>>;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          template<typename>class B4,
          typename SUBNET>
using inception4 = concat4<itag1, itag2, itag3, itag4,
            itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4< itag0<SUBNET>>>>>>>>>>>>
            >;

template <template<typename>class B1,
          template<typename>class B2,
          template<typename>class B3,
          template<typename>class B4,
          template<typename>class B5,
          typename SUBNET>
using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
            itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip<  itag4<B4<iskip<  itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>
            >;
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
} }
......
...@@ -678,6 +678,23 @@ namespace dlib { namespace tt ...@@ -678,6 +678,23 @@ namespace dlib { namespace tt
#endif #endif
} }
// ------------------------------------------------------------------------------------
// Dispatches to the CUDA implementation when dlib was built with CUDA
// support, otherwise falls back to the CPU implementation.  See the
// declaration's spec comment for the full contract.
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
)
{
#ifdef DLIB_USE_CUDA
    cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#else
    cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#endif
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
}} }}
......
...@@ -1232,6 +1232,27 @@ namespace dlib { namespace tt ...@@ -1232,6 +1232,27 @@ namespace dlib { namespace tt
resizable_tensor accum_buffer; resizable_tensor accum_buffer;
}; };
// ----------------------------------------------------------------------------------------
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
);
/*!
    requires
        - dest.nc() == src.nc()
        - dest.nr() == src.nr()
        - dest.num_samples() == src.num_samples()
        - dest.k() - dest_k_offset >= count_k
        - src.k() - src_k_offset >= count_k
        - is_same_object(dest,src) == false
    ensures
        - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0, count_k)
          i.e. copies the content of each sample from src into the corresponding place of the sample at dest
!*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -11,8 +11,7 @@ ...@@ -11,8 +11,7 @@
#include "tester.h" #include "tester.h"
namespace
namespace
{ {
using namespace test; using namespace test;
...@@ -1405,6 +1404,174 @@ namespace ...@@ -1405,6 +1404,174 @@ namespace
DLIB_TEST(count == pnet.num_computational_layers); DLIB_TEST(count == pnet.num_computational_layers);
} }
// Reads the element at (sample i, channel k, row r, column c) from the
// host-side data of t, using dlib's N x K x R x C memory layout.
float tensor_read_cpu(const tensor& t, long i, long k, long r, long c)
{
    const long idx = ((i * t.k() + k) * t.nr() + r) * t.nc() + c;
    return t.host()[idx];
}
// Verifies cpu::copy_tensor by assembling dest from three sources and then
// checking every element against the source it should have come from.
void test_copy_tensor_cpu()
{
    using namespace dlib::tt;
    print_spinner();
    resizable_tensor dest(10, 9, 7, 15);
    resizable_tensor src1(10, 3, 7, 15);
    resizable_tensor src2(10, 3, 7, 15);
    resizable_tensor src3(10, 9, 7, 15);

    // Fill each tensor from its OWN dimensions and with a distinct seed.
    // (The original seeded src2/src3 from src1.num_samples() — a copy-paste
    // slip that only worked because all tensors share num_samples() — and
    // gave src1 and src2 the same seed, making their contents identical and
    // the test unable to detect channels copied from the wrong source.)
    dest = matrix_cast<float>(gaussian_randm(dest.num_samples(), dest.k() * dest.nr() * dest.nc(), 1));
    src1 = matrix_cast<float>(gaussian_randm(src1.num_samples(), src1.k() * src1.nr() * src1.nc(), 0));
    src2 = matrix_cast<float>(gaussian_randm(src2.num_samples(), src2.k() * src2.nr() * src2.nc(), 2));
    src3 = matrix_cast<float>(gaussian_randm(src3.num_samples(), src3.k() * src3.nr() * src3.nc(), 3));

    cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
    cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
    cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the remaining part of dest

    for (long i = 0; i < dest.num_samples(); ++i)
    {
        for (long k = 0; k < dest.k(); ++k)
        {
            for (long r = 0; r < dest.nr(); ++r)
            {
                for (long c = 0; c < dest.nc(); ++c)
                {
                    float dest_value = tensor_read_cpu(dest, i, k, r, c);
                    // first part is from src1
                    if (k < src1.k())
                    {
                        float src_value = tensor_read_cpu(src1, i, k, r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                    // second part is from src2
                    else if (k < src1.k() + src2.k())
                    {
                        float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                    // third part is from src3 (channels 3..5 of src3)
                    else
                    {
                        float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                }
            }
        }
    }
}
#ifdef DLIB_USE_CUDA
// Same checks as test_copy_tensor_cpu, but exercising cuda::copy_tensor.
void test_copy_tensor_gpu()
{
    using namespace dlib::tt;
    print_spinner();
    resizable_tensor dest(10, 9, 7, 15);
    resizable_tensor src1(10, 3, 7, 15);
    resizable_tensor src2(10, 3, 7, 15);
    resizable_tensor src3(10, 9, 7, 15);

    // Fill each tensor from its OWN dimensions and with a distinct seed (the
    // original reused src1.num_samples() and seed 0 for all sources, so src1
    // and src2 held identical data and mixups between them were undetectable).
    dest = matrix_cast<float>(gaussian_randm(dest.num_samples(), dest.k() * dest.nr() * dest.nc(), 1));
    src1 = matrix_cast<float>(gaussian_randm(src1.num_samples(), src1.k() * src1.nr() * src1.nc(), 0));
    src2 = matrix_cast<float>(gaussian_randm(src2.num_samples(), src2.k() * src2.nr() * src2.nc(), 2));
    src3 = matrix_cast<float>(gaussian_randm(src3.num_samples(), src3.k() * src3.nr() * src3.nc(), 3));

    cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
    cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
    cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the remaining part of dest

    for (long i = 0; i < dest.num_samples(); ++i)
    {
        for (long k = 0; k < dest.k(); ++k)
        {
            for (long r = 0; r < dest.nr(); ++r)
            {
                for (long c = 0; c < dest.nc(); ++c)
                {
                    float dest_value = tensor_read_cpu(dest, i, k, r, c);
                    // first part is from src1
                    if (k < src1.k())
                    {
                        float src_value = tensor_read_cpu(src1, i, k, r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                    // second part is from src2
                    else if (k < src1.k() + src2.k())
                    {
                        float src_value = tensor_read_cpu(src2, i, k - src1.k(), r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                    // third part is from src3 (channels 3..5 of src3)
                    else
                    {
                        float src_value = tensor_read_cpu(src3, i, k - src1.k() - src2.k() + 3, r, c);
                        DLIB_TEST(src_value == dest_value);
                    }
                }
            }
        }
    }
}
#endif//DLIB_USE_CUDA
// Building blocks for the concat/inception test network below: two small
// convolutions (5 and 8 output channels) and a 3x3 max-pool branch, joined
// by inception3 so the concatenated output has 5 + 8 + 1 = 14 channels.
template <typename SUBNET> using concat_block1 = con<5,1,1,1,1,SUBNET>;
template <typename SUBNET> using concat_block2 = con<8,3,3,1,1,SUBNET>;
template <typename SUBNET> using concat_block3 = max_pool<3,3,1,1,SUBNET>;
template <typename SUBNET> using concat_incept = inception3<concat_block1,concat_block2,concat_block3,SUBNET>;
// Checks that the concat layer's forward pass equals a manual channel-wise
// concatenation of the branch outputs, and that backward() routes the
// gradient slices back to the right branches.
void test_concat()
{
    using namespace dlib::tt;
    print_spinner();

    using net_type = concat_incept<input<matrix<float>>>;
    resizable_tensor data(10, 1, 111, 222);
    data = matrix_cast<float>(gaussian_randm(data.num_samples(), data.k() * data.nr() * data.nc(), 1));

    net_type net;

    auto& out = net.forward(data);

    auto& b1o = layer<itag1>(net).get_output();
    auto& b2o = layer<itag2>(net).get_output();
    auto& b3o = layer<itag3>(net).get_output();

    resizable_tensor dest(10, 14, 111, 222);
    copy_tensor(dest, 0, b1o, 0, b1o.k());
    copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
    copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());

    DLIB_TEST(dest.size() == out.size());
    // memcmp() takes a BYTE count while size() is an element count, so scale
    // by sizeof(float).  (The original passed size() directly and therefore
    // compared only the first quarter of each buffer.)
    int error = memcmp(dest.host(), out.host(), dest.size() * sizeof(float));
    DLIB_TEST(error == 0);

    // now let's go backward
    resizable_tensor gr(10, 14, 111, 222);
    gr = matrix_cast<float>(gaussian_randm(gr.num_samples(), gr.k() * gr.nr() * gr.nc(), 1));

    resizable_tensor params;
    net.layer_details().backward(gr, net, params);

    auto& b1g = layer<itag1>(net).subnet().get_gradient_input();
    auto& b2g = layer<itag2>(net).subnet().get_gradient_input();
    auto& b3g = layer<itag3>(net).subnet().get_gradient_input();

    resizable_tensor g1(10, 5, 111, 222);
    resizable_tensor g2(10, 8, 111, 222);
    resizable_tensor g3(10, 1, 111, 222);

    copy_tensor(g1, 0, gr, 0, g1.k());
    copy_tensor(g2, 0, gr, g1.k(), g2.k());
    copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());

    DLIB_TEST(g1.size() == b1g.size());
    error = memcmp(g1.host(), b1g.host(), b1g.size() * sizeof(float));
    DLIB_TEST(error == 0);

    DLIB_TEST(g2.size() == b2g.size());
    error = memcmp(g2.host(), b2g.host(), b2g.size() * sizeof(float));
    DLIB_TEST(error == 0);

    DLIB_TEST(g3.size() == b3g.size());
    error = memcmp(g3.host(), b3g.host(), b3g.size() * sizeof(float));
    DLIB_TEST(error == 0);
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
class dnn_tester : public tester class dnn_tester : public tester
...@@ -1433,6 +1600,7 @@ namespace ...@@ -1433,6 +1600,7 @@ namespace
compare_bn_conv_gpu_and_cpu(); compare_bn_conv_gpu_and_cpu();
test_add(); test_add();
compare_adam(); compare_adam();
test_copy_tensor_gpu();
#endif #endif
test_max_pool(1,1,2,3,0,0); test_max_pool(1,1,2,3,0,0);
test_max_pool(3,3,1,1,0,0); test_max_pool(3,3,1,1,0,0);
...@@ -1466,9 +1634,10 @@ namespace ...@@ -1466,9 +1634,10 @@ namespace
test_basic_tensor_ops(); test_basic_tensor_ops();
test_layers(); test_layers();
test_visit_funcions(); test_visit_funcions();
test_copy_tensor_cpu();
test_concat();
} }
} a; } a;
} }
...@@ -33,6 +33,7 @@ ENDMACRO() ...@@ -33,6 +33,7 @@ ENDMACRO()
if (COMPILER_CAN_DO_CPP_11) if (COMPILER_CAN_DO_CPP_11)
add_example(dnn_mnist_ex) add_example(dnn_mnist_ex)
add_example(dnn_mnist_advanced_ex) add_example(dnn_mnist_advanced_ex)
add_example(dnn_inception_ex)
endif() endif()
#here we apply our macros #here we apply our macros
......
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
This is an example illustrating the use of the deep learning tools from the
dlib C++ Library. I'm assuming you have already read the dnn_mnist_ex.cpp
example. So in this example program I'm going to go over a number of more
advanced parts of the API, including:
        - Using grouped (concat/inception) layers for constructing an inception layer
    An inception layer is a kind of NN architecture that runs several convolution types
    on the same input area and joins all the convolution results into one output.
    For further reading refer to http://www.cs.unc.edu/~wliu/papers/GoogLeNet.pdf
*/
#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>
using namespace std;
using namespace dlib;
// An inception layer runs several different convolutions on the same input.
// Here we define the branch blocks (convolutions with different kernel sizes)
// that we will use inside the inception layers.
template <typename SUBNET> using block_a1 = relu<con<10,1,1,1,1,SUBNET>>;
template <typename SUBNET> using block_a2 = relu<con<10,3,3,1,1,relu<con<16,1,1,1,1,SUBNET>>>>;
template <typename SUBNET> using block_a3 = relu<con<10,5,5,1,1,relu<con<16,1,1,1,1,SUBNET>>>>;
template <typename SUBNET> using block_a4 = relu<con<10,1,1,1,1,max_pool<3,3,1,1,SUBNET>>>;

// Here is the first inception layer definition.  It runs the four blocks
// above on the same input and returns their combined (concatenated) output.
template <typename SUBNET> using incept_a = inception4<block_a1,block_a2,block_a3,block_a4, SUBNET>;

// A network can contain inception layers of different structure.  Here are
// blocks with different convolutions for a second, smaller inception layer.
template <typename SUBNET> using block_b1 = relu<con<4,1,1,1,1,SUBNET>>;
template <typename SUBNET> using block_b2 = relu<con<4,3,3,1,1,SUBNET>>;
template <typename SUBNET> using block_b3 = relu<con<4,1,1,1,1,max_pool<3,3,1,1,SUBNET>>>;

// The second inception layer, built from the three blocks above.
template <typename SUBNET> using incept_b = inception3<block_b1,block_b2,block_b3,SUBNET>;

// and then the network type is
using net_type = loss_multiclass_log<
        fc<10,
        relu<fc<32,
        max_pool<2,2,2,2,incept_b<
        max_pool<2,2,2,2,tag1<incept_a<
        input<matrix<unsigned char>>
        >>>>>>>>>;
int main(int argc, char** argv) try
{
    // This example is going to run on the MNIST dataset.
    if (argc != 2)
    {
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder. Then give that folder as input to this program." << endl;
        return 1;
    }

    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);

    // Create network of predefined type.
    net_type net;

    // Now let's print the details of the net to the screen and inspect it.
    cout << "The net has " << net.num_layers << " layers in it." << endl;
    cout << net << endl;

    // we can access inner layers with the layer<> function:
    // with tags
    auto& in_b = layer<tag1>(net);
    cout << "Found inception B layer: " << endl << in_b << endl;
    // and we can access layers inside inceptions with itags
    auto& in_b_1 = layer<itag1>(in_b);
    cout << "Found inception B/1 layer: " << endl << in_b_1 << endl;
    // or this is identical to
    auto& in_b_1_a = layer<tag1,2>(net);
    cout << "Found inception B/1 layer alternative way: " << endl << in_b_1_a << endl;

    // fixed typo: message previously read "Traning NN..."
    cout << "Training NN..." << endl;

    // The rest of the sample is identical to dnn_mnist_ex.
    // And then train it using the MNIST data. The code below uses mini-batch stochastic
    // gradient descent with an initial learning rate of 0.01 to accomplish this.
    dnn_trainer<net_type> trainer(net);
    trainer.set_learning_rate(0.01);
    trainer.set_min_learning_rate(0.00001);
    trainer.set_mini_batch_size(128);
    trainer.be_verbose();
    // Since DNN training can take a long time, we can ask the trainer to save its state to
    // a file named "inception_sync" every 20 seconds.  This way, if we kill this program and
    // start it again it will begin where it left off rather than restarting the training
    // from scratch. This is because, when the program restarts, this call to
    // set_synchronization_file() will automatically reload the settings from inception_sync if
    // the file exists.
    trainer.set_synchronization_file("inception_sync", std::chrono::seconds(20));
    // Finally, this line begins training. By default, it runs SGD with our specified
    // learning rate until the loss stops decreasing. Then it reduces the learning rate by
    // a factor of 10 and continues running until the loss stops decreasing again. It will
    // keep doing this until the learning rate has dropped below the min learning rate
    // defined above or the maximum number of epochs as been executed (defaulted to 10000).
    trainer.train(training_images, training_labels);

    // At this point our net object should have learned how to classify MNIST images. But
    // before we try it out let's save it to disk. Note that, since the trainer has been
    // running images through the network, net will have a bunch of state in it related to
    // the last batch of images it processed (e.g. outputs from each layer). Since we
    // don't care about saving that kind of stuff to disk we can tell the network to forget
    // about that kind of transient data so that our file will be smaller. We do this by
    // "cleaning" the network before saving it.
    net.clean();
    serialize("mnist_network_inception.dat") << net;
    // Now if we later wanted to recall the network from disk we can simply say:
    // deserialize("mnist_network_inception.dat") >> net;

    // Now let's run the training images through the network. This statement runs all the
    // images through it and asks the loss layer to convert the network's raw output into
    // labels. In our case, these labels are the numbers between 0 and 9.
    std::vector<unsigned long> predicted_labels = net(training_images);
    int num_right = 0;
    int num_wrong = 0;
    // And then let's see if it classified them correctly.
    for (size_t i = 0; i < training_images.size(); ++i)
    {
        if (predicted_labels[i] == training_labels[i])
            ++num_right;
        else
            ++num_wrong;
    }
    cout << "training num_right: " << num_right << endl;
    cout << "training num_wrong: " << num_wrong << endl;
    cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

    // Let's also see if the network can correctly classify the testing images. Since
    // MNIST is an easy dataset, we should see at least 99% accuracy.
    predicted_labels = net(testing_images);
    num_right = 0;
    num_wrong = 0;
    for (size_t i = 0; i < testing_images.size(); ++i)
    {
        if (predicted_labels[i] == testing_labels[i])
            ++num_right;
        else
            ++num_wrong;
    }
    cout << "testing num_right: " << num_right << endl;
    cout << "testing num_wrong: " << num_wrong << endl;
    cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment