Cleaned up documentation for conv_. Also removed unnecessary tensor reallocation and copying inside conv_'s backward pass. Doing this required adding an add_to_output boolean option to the methods of tensor_conv.

Cleaned up documentation for conv_. Also removed unnecessary tensor
reallocation and copying inside conv_'s backward pass. Doing this required adding an add_to_output boolean option to the methods of tensor_conv.
31bcddd5 · Davis King · b3d5dbd3 · 31bcddd5 · 31bcddd5 · 31bcddd5
Commit 31bcddd5 authored Jun 27, 2017 by Davis King
8 changed files
--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -1739,31 +1739,52 @@ namespace dlib
            }
        }

-
        void tensor_conv::operator() (
+            const bool add_to_output,
            resizable_tensor& output,
            const tensor& data,
            const tensor& filters
        )
+        {
+            DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
+            output.set_size(data.num_samples(),
+                            filters.num_samples(),
+                            1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y,
+                            1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
+            (*this)(add_to_output, static_cast<tensor&>(output),data,filters);
+        }
+
+        void tensor_conv::operator() (
+            const bool add_to_output,
+            tensor& output,
+            const tensor& data,
+            const tensor& filters
+        )
        {
            DLIB_CASSERT(is_same_object(output,data) == false);
            DLIB_CASSERT(is_same_object(output,filters) == false);
            DLIB_CASSERT(filters.k() == data.k());
+            DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
            DLIB_CASSERT(filters.nr() <= data.nr() + 2*last_padding_y,
                "Filter windows must be small enough to fit into the padded image.");
            DLIB_CASSERT(filters.nc() <= data.nc() + 2*last_padding_x,
                "Filter windows must be small enough to fit into the padded image.");

-            output.set_size(data.num_samples(),
-                            filters.num_samples(),
-                            1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y,
-                            1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
+            DLIB_CASSERT(output.num_samples() == data.num_samples());
+            DLIB_CASSERT(output.k() == filters.num_samples());
+            DLIB_CASSERT(output.nr() == 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y);
+            DLIB_CASSERT(output.nc() == 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
+

            matrix<float> temp;
            for (long n = 0; n < data.num_samples(); ++n)
            {
                img2col(temp, data, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
-                output.set_sample(n, mat(filters)*trans(temp));
+
+                if (add_to_output)
+                    output.add_to_sample(n, mat(filters)*trans(temp));
+                else 
+                    output.set_sample(n, mat(filters)*trans(temp));
            }
        }

@@ -1771,12 +1792,15 @@ namespace dlib

        void tensor_conv::
        get_gradient_for_data (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& filters,
            tensor& data_gradient
        )
        {
            matrix<float> temp;
+            if (!add_to_output)
+                data_gradient = 0;
            for (long n = 0; n < gradient_input.num_samples(); ++n)
            {
                auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n,
@@ -1793,6 +1817,7 @@ namespace dlib

        void tensor_conv::
        get_gradient_for_filters (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& data,
            tensor& filters_gradient
@@ -1808,9 +1833,16 @@ namespace dlib

                img2col(temp, data, n, filters_gradient.nr(), filters_gradient.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
                if (n == 0)
-                    filters_gradient = gi*temp;
+                {
+                    if (add_to_output)
+                        filters_gradient += gi*temp;
+                    else
+                        filters_gradient = gi*temp;
+                }
                else
+                {
                    filters_gradient += gi*temp;
+                }
            }
        }
     // ------------------------------------------------------------------------------------

--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -388,18 +388,28 @@ namespace dlib
            }

             void operator() (
+                const bool add_to_output,
                resizable_tensor& output,
                const tensor& data,
                const tensor& filters
            );

+             void operator() (
+                const bool add_to_output,
+                tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+
            void get_gradient_for_data (
+                const bool add_to_output,
                const tensor& gradient_input, 
                const tensor& filters,
                tensor& data_gradient
            );

            void get_gradient_for_filters (
+                const bool add_to_output,
                const tensor& gradient_input, 
                const tensor& data,
                tensor& filters_gradient
@@ -407,10 +417,10 @@ namespace dlib

        private:

-            long last_stride_y;
-            long last_stride_x;
-            long last_padding_y;
-            long last_padding_x;
+            long last_stride_y = 0;
+            long last_stride_x = 0;
+            long last_padding_y = 0;
+            long last_padding_x = 0;
        };

    // -----------------------------------------------------------------------------------

--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -951,15 +951,29 @@ namespace dlib
        }

        void tensor_conv::operator() (
+            const bool add_to_output,
            resizable_tensor& output,
            const tensor& data,
            const tensor& filters
        )
+        {
+            DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
+
+            output.set_size(out_num_samples, out_k, out_nr, out_nc);
+            (*this)(add_to_output, static_cast<tensor&>(output), data, filters);
+        }
+
+        void tensor_conv::operator() (
+            const bool add_to_output,
+            tensor& output,
+            const tensor& data,
+            const tensor& filters
+        )
        {
            DLIB_CASSERT(is_same_object(output,data) == false);
            DLIB_CASSERT(is_same_object(output,filters) == false);
            DLIB_CASSERT(filters.k() == data.k());
-            DLIB_CASSERT(stride_y > 0 && stride_x > 0);
+            DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
            DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x,
                "Filter windows must be small enough to fit into the padded image."
                << "\n\t filters.nc(): " << filters.nc() 
@@ -974,17 +988,15 @@ namespace dlib
                );


-            output.set_size(out_num_samples, out_k, out_nr, out_nc);
-
-            DLIB_ASSERT(output.num_samples() == data.num_samples(),out_num_samples << "  " << data.num_samples());
-            DLIB_ASSERT(output.k() == filters.num_samples());
-            DLIB_ASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y);
-            DLIB_ASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x);
+            DLIB_CASSERT(output.num_samples() == data.num_samples(),out_num_samples << "  " << data.num_samples());
+            DLIB_CASSERT(output.k() == filters.num_samples());
+            DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y);
+            DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x);



            const float alpha = 1;
-            const float beta = 0;
+            const float beta = add_to_output ? 1 : 0;
            CHECK_CUDNN(cudnnConvolutionForward(
                    context(),
                    &alpha,
@@ -1002,13 +1014,14 @@ namespace dlib
        }

        void tensor_conv::get_gradient_for_data (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& filters,
            tensor& data_gradient
        )
        {
            const float alpha = 1;
-            const float beta = 1;
+            const float beta = add_to_output ? 1 : 0;


            CHECK_CUDNN(cudnnConvolutionBackwardData(context(),
@@ -1028,13 +1041,14 @@ namespace dlib

        void tensor_conv::
        get_gradient_for_filters (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& data,
            tensor& filters_gradient
        )
        {
            const float alpha = 1;
-            const float beta = 0;
+            const float beta = add_to_output ? 1 : 0;
            CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(),
                                                    &alpha,
                                                    descriptor(data),

--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -203,68 +203,32 @@ namespace dlib
            );

            void operator() (
+                const bool add_to_output,
+                tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+
+            void operator() (
+                const bool add_to_output,
                resizable_tensor& output,
                const tensor& data,
                const tensor& filters
            );
-            /*!
-                requires
-                    - stride_y > 0
-                    - stride_x > 0
-                    - 0 <= padding_y < filters.nr()
-                    - 0 <= padding_x < filters.nc()
-                    - is_same_object(output,data) == false
-                    - is_same_object(output,filters) == false
-                ensures
-                    - convolves filters over data.  
-                    - filters contains filters.num_samples() filters. 
-                    - #output.num_samples() == data.num_samples()
-                    - #output.k() == filters.num_samples()
-                    - #output.nr() == 1+(data.nr()-filters.nr()%2)/stride_y
-                    - #output.nc() == 1+(data.nc()-filters.nc()%2)/stride_x
-            !*/

            void get_gradient_for_data (
+                const bool add_to_output,
                const tensor& gradient_input, 
                const tensor& filters,
                tensor& data_gradient
            );
-            /*!
-                requires
-                    - filters has the same dimensions as the filters object give to the 
-                      last call to operator().
-                    - data_gradient has the same dimensions as the data object give to the
-                      last call to operator().
-                    - gradient_input has the same dimensions as the output of operator().
-                    - is_same_object(data_gradient,filters) == false
-                    - is_same_object(data_gradient,gradient_input) == false
-                ensures
-                    - let OUT be the output of (*this)(OUT,data,filters).
-                    - let f(data,filters) == dot(OUT, gradient_input)
-                    - This function finds the gradient of f() with respect to data
-                      and adds this gradient to data_gradient.
-            !*/

            void get_gradient_for_filters (
+                const bool add_to_output,
                const tensor& gradient_input, 
                const tensor& data,
                tensor& filters_gradient
            );
-            /*!
-                requires
-                    - filters_gradient has the same dimensions as the filters object give
-                      to the last call to operator().
-                    - data has the same dimensions as the data object give to the last call
-                      to operator().
-                    - gradient_input has the same dimensions as the output of operator().
-                    - is_same_object(filters_gradient,data) == false
-                    - is_same_object(filters_gradient,gradient_input) == false
-                ensures
-                    - let OUT be the output of (*this)(OUT,data,filters).
-                    - let f(data,filters) == dot(OUT, gradient_input)
-                    - This function finds the gradient of f() with respect to filters 
-                      and assigns this gradient to filters_gradient.
-            !*/

           void setup(
                const tensor& data,
@@ -277,15 +241,6 @@ namespace dlib

        private:

-            /*!
-                requires
-                    - filters.k() == data.k()
-                    - stride_y > 0
-                    - stride_x > 0
-                    - 0 <= padding_y < filters.nr()
-                    - 0 <= padding_x < filters.nc()
-            !*/
-
            // These variables record the type of data given to the last call to setup().
            int stride_y;
            int stride_x;

--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -142,6 +142,7 @@ namespace dlib

            // set the initial bias values to zero
            biases(params,filters.size()) = 0;
+
        }

        template <typename SUBNET>
@@ -153,8 +154,7 @@ namespace dlib
                       _stride_x,
                       padding_y_,
                       padding_x_);
-            
-            conv(output,
+            conv(false, output,
                sub.get_output(),
                filters(params,0));

@@ -164,12 +164,12 @@ namespace dlib
        template <typename SUBNET>
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
        {
-            conv.get_gradient_for_data (gradient_input, filters(params,0), sub.get_gradient_input());
+            conv.get_gradient_for_data (true, gradient_input, filters(params,0), sub.get_gradient_input());
            // no point computing the parameter gradients if they won't be used.
            if (learning_rate_multiplier != 0)
            {
                auto filt = filters(params_grad,0);
-                conv.get_gradient_for_filters (gradient_input, sub.get_output(), filt);
+                conv.get_gradient_for_filters (false, gradient_input, sub.get_output(), filt);
                auto b = biases(params_grad, filters.size());
                tt::assign_conv_bias_gradient(b, gradient_input);
            }
@@ -443,26 +443,21 @@ namespace dlib
            unsigned int gnsamps = sub.get_output().num_samples();
            unsigned int gk = filt.k();
            output.set_size(gnsamps,gk,gnr,gnc);
-            output = 0;
            conv.setup(output,filt,_stride_y,_stride_x,padding_y_,padding_x_);
-            conv.get_gradient_for_data(sub.get_output(),filt,output);            
+            conv.get_gradient_for_data(false, sub.get_output(),filt,output);            
            tt::add(1,output,1,biases(params,filters.size()));
        } 

        template <typename SUBNET>
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
        {
-            resizable_tensor temp;
-            temp.copy_size(sub.get_gradient_input());
            auto filt = filters(params,0);           
-            conv(temp,gradient_input, filt);
-            // need to add the new gradients on top of the previous ones
-            tt::add(1,sub.get_gradient_input(),1,temp);
+            conv(true, sub.get_gradient_input(),gradient_input, filt);
            // no point computing the parameter gradients if they won't be used.
            if (learning_rate_multiplier != 0)
            {
                auto filt = filters(params_grad,0);                
-                conv.get_gradient_for_filters (sub.get_output(),gradient_input, filt);
+                conv.get_gradient_for_filters (false, sub.get_output(),gradient_input, filt);
                auto b = biases(params_grad, filters.size());
                tt::assign_conv_bias_gradient(b, gradient_input);
            }
@@ -566,7 +561,7 @@ namespace dlib
                << " padding_y='"<<item.padding_y_<<"'"
                << " padding_x='"<<item.padding_x_<<"'"
                << " learning_rate_mult='"<<item.learning_rate_multiplier<<"'"
-                << " weight_decay_46mult='"<<item.weight_decay_multiplier<<"'"
+                << " weight_decay_mult='"<<item.weight_decay_multiplier<<"'"
                << " bias_learning_rate_mult='"<<item.bias_learning_rate_multiplier<<"'"
                << " bias_weight_decay_mult='"<<item.bias_weight_decay_multiplier<<"'>\n";
            out << mat(item.params);

--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -864,17 +864,21 @@ namespace dlib

            WHAT THIS OBJECT REPRESENTS
                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
-                defined above.  In particular, it defines a transposed convolution layer 
-                that takes an input tensor (nominally representing an image) and 
-                transpose convolves (deconvolves) it with a set of filters and then outputs the results. 
-                This is basically a convolutional layer with reversed forward/backward passes
+                defined above.  In particular, it defines a transposed convolution layer
+                that takes an input tensor and transpose convolves (sometimes called
+                "deconvolution") it with a set of filters and then outputs the results. 
+
+                This is essentially a convolutional layer that allows fractional strides.
+                Therefore, you can make output tensors that are larger than the input
+                tensors using this layer type. 
+
                
                The dimensions of the tensors output by this layer are as follows (letting
                IN be the input tensor and OUT the output tensor):
                    - OUT.num_samples() == IN.num_samples()
                    - OUT.k()  == num_filters()
-                    - OUT.nr() == stride_y * (IN.nr() -1) + nr) - 2*padding_y
-                    - OUT.nc() == stride_x * (IN.nc() -1) + nc) - 2*padding_x
+                    - OUT.nr() == stride_y()*(IN.nr()-1) + nr() - 2*padding_y()
+                    - OUT.nc() == stride_x()*(IN.nc()-1) + nc() - 2*padding_x()
        !*/

    public:
@@ -923,8 +927,8 @@ namespace dlib
        /*!
            ensures
                - returns the vertical stride used when convolving the filters over an
-                  image.  That is, each filter will be moved stride_y() pixels down at a
-                  time when it moves over the image.
+                  image.  That is, each filter will be moved 1.0/stride_y() pixels down at
+                  a time when it moves over the image.
        !*/

        long stride_x(
@@ -932,8 +936,8 @@ namespace dlib
        /*!
            ensures
                - returns the horizontal stride used when convolving the filters over an
-                  image.  That is, each filter will be moved stride_x() pixels right at a
-                  time when it moves over the image.
+                  image.  That is, each filter will be moved 1.0/stride_x() pixels right at
+                  a time when it moves over the image.
        !*/

        long padding_y(

--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -877,23 +877,50 @@ namespace dlib { namespace tt
        ) { impl.clear(); }

        void operator() (
+            const bool add_to_output,
+            tensor& output,
+            const tensor& data,
+            const tensor& filters
+        ) { impl(add_to_output,output,data,filters); }
+        /*!
+            requires
+                - setup() has been called.  Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
+                - is_same_object(output,data) == false
+                - is_same_object(output,filters) == false
+                - filters.k() == data.k()
+                - filters.nr() <= data.nr() + 2*padding_y
+                - filters.nc() <= data.nc() + 2*padding_x
+                - #output.num_samples() == data.num_samples()
+                - #output.k() == filters.num_samples()
+                - #output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                - #output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+            ensures
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.
+                - filters contains filters.num_samples() filters. 
+        !*/
+
+        void operator() (
+            const bool add_to_output,
            resizable_tensor& output,
            const tensor& data,
            const tensor& filters
-        ) { impl(output,data,filters); }
+        ) { impl(add_to_output,output,data,filters); }
        /*!
            requires
-                - stride_y > 0
-                - stride_x > 0
-                - 0 <= padding_y < filters.nr()
-                - 0 <= padding_x < filters.nc()
+                - setup() has been called.  Specifically, setup() has been called like this:
+                    this->setup(data, filters, stride_y, stride_x, padding_y, padding_x);
                - is_same_object(output,data) == false
                - is_same_object(output,filters) == false
                - filters.k() == data.k()
                - filters.nr() <= src.nr() + 2*padding_y
                - filters.nc() <= src.nc() + 2*padding_x
            ensures
-                - convolves filters over data.  
+                - Convolves filters over data.  If add_to_output==true then we add the
+                  results to output, otherwise we assign to output, overwriting the
+                  previous values in output.  
                - filters contains filters.num_samples() filters. 
                - #output.num_samples() == data.num_samples()
                - #output.k() == filters.num_samples()
@@ -902,49 +929,77 @@ namespace dlib { namespace tt
        !*/

        void get_gradient_for_data (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& filters,
            tensor& data_gradient
-        ) { impl.get_gradient_for_data(gradient_input,filters,data_gradient); }
+        ) { impl.get_gradient_for_data(add_to_output,gradient_input,filters,data_gradient); }
        /*!
            requires
-                - filters has the same dimensions as the filters object given to the last
-                  call to operator().
-                - data_gradient has the same dimensions as the data object given to the last
-                  call to operator().
-                - gradient_input has the same dimensions as the last output of operator().
+                - One of the following must be true:
+                    - filters has the same dimensions as the filters object given to the
+                      last call to operator().  Also, data_gradient has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                      this->setup(data_gradient, filters, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data_gradient.num_samples()
+                    - gradient_input.k() == filters.num_samples()
+                    - gradient_input.nr() == 1+(data_gradient.nr() + 2*padding_y - filters.nr())/stride_y
+                    - gradient_input.nc() == 1+(data_gradient.nc() + 2*padding_x - filters.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().  
                - is_same_object(data_gradient,filters) == false
                - is_same_object(data_gradient,gradient_input) == false
            ensures
                - let OUT be the output of (*this)(OUT,data,filters,sx,sy).
                - let f(data,filters) == dot(OUT, gradient_input)
-                - This function finds the gradient of f() with respect to data and adds
-                  this gradient to data_gradient.
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to data and adds
+                      this gradient to data_gradient.
+                - else
+                    - This function finds the gradient of f() with respect to data and
+                      assigns this gradient to data_gradient, overwriting the previous
+                      values in data_gradient.
        !*/

        void get_gradient_for_filters (
+            const bool add_to_output,
            const tensor& gradient_input, 
            const tensor& data,
            tensor& filters_gradient
-        ) { impl.get_gradient_for_filters(gradient_input,data,filters_gradient); }
+        ) { impl.get_gradient_for_filters(add_to_output,gradient_input,data,filters_gradient); }
        /*!
            requires
-                - filters_gradient has the same dimensions as the filters object given to
-                  the last call to operator().
-                - data has the same dimensions as the data object given to the last call to
-                  operator().
-                - gradient_input has the same dimensions as the last output of operator().
+                - One of the following must be true:
+                    - filters_gradient has the same dimensions as the filters object given
+                      to the last call to operator().  Also, data has the same dimensions
+                      as the data object given to the last call to operator().
+                    - setup() has been called.  Specifically, setup() has been called like this:
+                      this->setup(data, filters_gradient, stride_y, stride_x, padding_y, padding_x);
+                - gradient_input has the following dimensions:
+                    - gradient_input.num_samples() == data.num_samples()
+                    - gradient_input.k() == filters_gradient.num_samples()
+                    - gradient_input.nr() == 1+(data.nr() + 2*padding_y - filters_gradient.nr())/stride_y
+                    - gradient_input.nc() == 1+(data.nc() + 2*padding_x - filters_gradient.nc())/stride_x
+                    - NOTE, these dimensions are what you would obtain if gradient_input
+                      has the same dimensions as the last output of operator().  
                - is_same_object(filters_gradient,data) == false
                - is_same_object(filters_gradient,gradient_input) == false
            ensures
                - let OUT be the output of (*this)(OUT,data,filters,sx,sy).
                - let f(data,filters) == dot(OUT, gradient_input)
-                - This function finds the gradient of f() with respect to filters and assigns 
-                  this gradient to filters_gradient.
+                - if (add_to_output) then
+                    - This function finds the gradient of f() with respect to filters and
+                      adds this gradient to filters_gradient.
+                - else 
+                    - This function finds the gradient of f() with respect to filters and
+                      assigns this gradient to filters_gradient, overwriting the previous
+                      values in filters_gradient.
        !*/

 
-       void setup(
+        void setup(
            const tensor& data,
            const tensor& filters,
            int stride_y,
@@ -952,6 +1007,26 @@ namespace dlib { namespace tt
            int padding_y,
            int padding_x
        ) {impl.setup(data,filters,stride_y,stride_x,padding_y,padding_x); }
+        /*!
+            requires
+                - filters.k() == data.k()
+                - stride_y > 0
+                - stride_x > 0
+                - 0 <= padding_y < filters.nr()
+                - 0 <= padding_x < filters.nc()
+            ensures
+                - When operator() is called, the output tensor will have these dimensions:
+                    - output.nr() == 1+(data.nr() + 2*padding_y - filters.nr())/stride_y
+                    - output.nc() == 1+(data.nc() + 2*padding_x - filters.nc())/stride_x
+                    - output.num_samples() == data.num_samples()
+                    - output.k() == filters.num_samples()
+                - The point of setup() is to allow this object to gather information about
+                  all the tensor sizes and filter layouts involved in the computation.  In
+                  particular, the reason the tensors are input into setup() is just to
+                  observe their sizes.  setup() doesn't do anything with the contents of
+                  the tensors, or store any kind of references to the data or filter
+                  tensors. 
+        !*/
       
    private:
 #ifdef DLIB_USE_CUDA

--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -806,9 +806,17 @@ namespace
            if (!(filters.nc() <= data.nc() + 2*padding_x))
                padding_x = (filters.nc()-data.nc()+1)/2;
            conv1.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
-            conv1(output1, data, filters);
+            conv1(false, output1, data, filters);
            conv2.setup(data,filters,stride_y,stride_x,padding_y,padding_x);
-            conv2(output2, data, filters);
+            conv2(false, output2, data, filters);
+            dlog << LINFO << "forward error: "<< max(abs(mat(output1)-mat(output2)));
+            DLIB_TEST_MSG(max(abs(mat(output1)-mat(output2))) < 1e-3, max(abs(mat(output1)-mat(output2)))
+                 <<"\n\t padding_y: "<< padding_y 
+                 <<"\n\t padding_x: "<< padding_x 
+                 );
+
+            conv1(true, output1, data, filters);
+            conv2(true, output2, data, filters);
            dlog << LINFO << "forward error: "<< max(abs(mat(output1)-mat(output2)));
            DLIB_TEST_MSG(max(abs(mat(output1)-mat(output2))) < 1e-3, max(abs(mat(output1)-mat(output2)))
                 <<"\n\t padding_y: "<< padding_y 
@@ -826,8 +834,14 @@ namespace
            data_gradient1 = 1;
            data_gradient2 = 1;

-            conv1.get_gradient_for_data(gi, filters, data_gradient1);
-            conv2.get_gradient_for_data(gi, filters, data_gradient2);
+            conv1.get_gradient_for_data(true, gi, filters, data_gradient1);
+            conv2.get_gradient_for_data(true, gi, filters, data_gradient2);
+
+            dlog << LINFO << "data gradient error: "<< max(abs(mat(data_gradient1)-mat(data_gradient2)));
+            DLIB_TEST(max(abs(mat(data_gradient1)-mat(data_gradient2))) < 1e-3);
+
+            conv1.get_gradient_for_data(false, gi, filters, data_gradient1);
+            conv2.get_gradient_for_data(false, gi, filters, data_gradient2);

            dlog << LINFO << "data gradient error: "<< max(abs(mat(data_gradient1)-mat(data_gradient2)));
            DLIB_TEST(max(abs(mat(data_gradient1)-mat(data_gradient2))) < 1e-3);
@@ -842,8 +856,15 @@ namespace
            filter_gradient1 = 1;
            filter_gradient2 = 1;

-            conv1.get_gradient_for_filters(gi, data, filter_gradient1);
-            conv2.get_gradient_for_filters(gi, data, filter_gradient2);
+            conv1.get_gradient_for_filters(false, gi, data, filter_gradient1);
+            conv2.get_gradient_for_filters(false, gi, data, filter_gradient2);
+
+            dlog << LINFO << "filter gradient error: "<< max(abs(mat(filter_gradient1)-mat(filter_gradient2)));
+            DLIB_TEST_MSG(max(abs(mat(filter_gradient1)-mat(filter_gradient2))) < 1e-3, max(abs(mat(filter_gradient1)-mat(filter_gradient2))));
+
+
+            conv1.get_gradient_for_filters(true, gi, data, filter_gradient1);
+            conv2.get_gradient_for_filters(true, gi, data, filter_gradient2);

            dlog << LINFO << "filter gradient error: "<< max(abs(mat(filter_gradient1)-mat(filter_gradient2)));
            DLIB_TEST_MSG(max(abs(mat(filter_gradient1)-mat(filter_gradient2))) < 1e-3, max(abs(mat(filter_gradient1)-mat(filter_gradient2))));
@@ -1475,6 +1496,12 @@ namespace
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
+        {
+            print_spinner();
+            cont_<3,3,3,2,2,0,0> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
        {
            print_spinner();
            cont_<3,3,3,2,2> l;
@@ -1487,6 +1514,12 @@ namespace
            auto res = test_layer(l);
            DLIB_TEST_MSG(res, res);
        }
+        {
+            print_spinner();
+            cont_<3,3,3,1,1,0,0> l;
+            auto res = test_layer(l);
+            DLIB_TEST_MSG(res, res);
+        }
        {
            print_spinner();
            cont_<3,2,2,2,2> l;