Commit 93ab80c7 authored by Davis King

Made the affine_ layer support being constructed from bn_ layers. Also added
unit tests for the routines supporting this feature.
parent 669a1e17
......@@ -418,27 +418,31 @@ namespace dlib
// ----------------------------------------------------------------------------------------
enum batch_normalization_mode
enum layer_mode
{
BATCH_NORM_CONV = 0,
BATCH_NORM_FC = 1
CONV_MODE = 0,
FC_MODE = 1
};
class bn_
{
public:
bn_() : num_updates(0), running_stats_window_size(1000), mode(BATCH_NORM_FC)
bn_() : num_updates(0), running_stats_window_size(1000), mode(FC_MODE)
{}
explicit bn_(batch_normalization_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
explicit bn_(layer_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
{}
batch_normalization_mode get_mode() const { return mode; }
bn_(layer_mode mode_, unsigned long window_size) : num_updates(0), running_stats_window_size(window_size), mode(mode_)
{}
layer_mode get_mode() const { return mode; }
unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
......@@ -473,14 +477,14 @@ namespace dlib
const double decay = 1.0 - num_updates/(num_updates+1.0);
if (num_updates <running_stats_window_size)
++num_updates;
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_invstds);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_invstds);
......@@ -493,7 +497,7 @@ namespace dlib
auto g = gamma(params,0);
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
else
tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
......@@ -534,18 +538,20 @@ namespace dlib
deserialize(item.running_stats_window_size, in);
int mode;
deserialize(mode, in);
item.mode = (batch_normalization_mode)mode;
item.mode = (layer_mode)mode;
}
private:
friend class affine_;
resizable_tensor params;
alias_tensor gamma, beta;
resizable_tensor means, running_means;
resizable_tensor invstds, running_invstds;
unsigned long num_updates;
unsigned long running_stats_window_size;
batch_normalization_mode mode;
layer_mode mode;
};
template <typename SUBNET>
......@@ -770,17 +776,53 @@ namespace dlib
{
public:
affine_(
)
) : mode(FC_MODE)
{
}
explicit affine_(
layer_mode mode_
) : mode(mode_)
{
}
affine_(
const bn_& item
)
{
gamma = item.gamma;
beta = item.beta;
mode = item.mode;
params.copy_size(item.params);
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
resizable_tensor temp(item.params);
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), mat(item.running_invstds));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
layer_mode get_mode() const { return mode; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
gamma = alias_tensor(1,
sub.get_output().k(),
sub.get_output().nr(),
sub.get_output().nc());
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
sub.get_output().nr(),
sub.get_output().nc());
}
else
{
gamma = alias_tensor(1, sub.get_output().k());
}
beta = gamma;
params.set_size(gamma.size()+beta.size());
......@@ -793,7 +835,10 @@ namespace dlib
{
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
tt::affine_transform(output, input, g, b);
if (mode == FC_MODE)
tt::affine_transform(output, input, g, b);
else
tt::affine_transform_conv(output, input, g, b);
}
void backward_inplace(
......@@ -809,10 +854,18 @@ namespace dlib
auto b_grad = beta(params_grad,gamma.size());
// We are computing the gradient of dot(gradient_input, computed_output*g + b)
tt::multiply(data_grad, gradient_input, g);
tt::multiply(g_grad, gradient_input, computed_output);
tt::assign_bias_gradient(b_grad, gradient_input);
if (mode == FC_MODE)
{
tt::multiply(data_grad, gradient_input, g);
tt::multiply(g_grad, gradient_input, computed_output);
tt::assign_bias_gradient(b_grad, gradient_input);
}
else
{
tt::multiply_conv(data_grad, gradient_input, g);
tt::multiply_conv(g_grad, gradient_input, computed_output);
tt::assign_conv_bias_gradient(b_grad, gradient_input);
}
}
const tensor& get_layer_params() const { return params; }
......@@ -824,6 +877,7 @@ namespace dlib
serialize(item.params, out);
serialize(item.gamma, out);
serialize(item.beta, out);
serialize((int)item.mode, out);
}
friend void deserialize(affine_& item, std::istream& in)
......@@ -835,11 +889,15 @@ namespace dlib
deserialize(item.params, in);
deserialize(item.gamma, in);
deserialize(item.beta, in);
int mode;
deserialize(mode, in);
item.mode = (layer_mode)mode;
}
private:
resizable_tensor params;
alias_tensor gamma, beta;
layer_mode mode;
};
template <typename SUBNET>
......
......@@ -552,29 +552,89 @@ namespace dlib
// ----------------------------------------------------------------------------------------
class affine_
enum layer_mode
{
CONV_MODE = 0, // convolutional mode
FC_MODE = 1 // fully connected mode
};
class bn_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_LAYER_ interface defined above.
In particular, it applies a simple pointwise linear transformation to an
input tensor. You can think of it as having two parameter tensors, A and
B, that each have the same dimensionality as the input tensor (except their
num_samples() dimensions are 1). If the input tensor is called INPUT
then the output of this layer is simply:
A*INPUT+B
where all operations are performed element wise and each sample in the
INPUT tensor is processed separately.
In particular, it defines a batch normalization layer that implements the
method described in the paper:
Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
In particular, this layer produces output tensors with the same
dimensionality as the input tensors, except that the mean and variances of
the elements have been standardized to 0 and 1 respectively.
It should also be noted that when tensors with a num_samples() dimension of
1 are passed to this layer it doesn't perform batch normalization.
Instead, it runs in "inference mode" where the learned linear normalizing
transformation is used to transform the tensor.
Finally, after you finish training a batch normalized network, it is a good
idea to replace each bn_ layer with an affine_ layer because the affine_
layer is faster and will never surprise you by performing batch
normalization on tensors that have a num_samples() dimension > 1. This allows
you to run large mini-batches of samples through your final network without
batch normalization executing at all.
!*/
public:
bn_(
);
/*!
ensures
- #get_mode() == FC_MODE
- get_running_stats_window_size() == 1000
!*/
affine_(
explicit bn_(
layer_mode mode
);
/*!
ensures
- #get_mode() == mode
- get_running_stats_window_size() == 1000
!*/
layer_mode get_mode(
) const;
/*!
ensures
- returns the mode of this layer, either CONV_MODE or FC_MODE.
If the mode is FC_MODE then the normalization is applied across the
samples in a tensor (i.e. k()*nr()*nc() different things will be
normalized). Otherwise, normalization is applied across everything
except for the k() dimension, resulting in there being only k()
normalization equations that are applied spatially over the tensor.
Therefore, if you are putting batch normalization after a fully connected
layer you should use FC_MODE. Otherwise, if you are putting batch
normalization after a convolutional layer you should use CONV_MODE.
!*/
unsigned long get_running_stats_window_size (
) const;
/*!
ensures
- Just as recommended in the batch normalization paper, this object keeps a
running average of the mean and standard deviations of the features.
These averages are used during "inference mode" so you can run a single
object through a batch normalized network. They are also what is used to
initialize an affine_ layer that is constructed from a bn_ layer. This
function returns the effective number of recent samples used to compute
the running average.
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......@@ -582,73 +642,83 @@ namespace dlib
!*/
};
void serialize(const affine_& item, std::ostream& out);
void deserialize(affine_& item, std::istream& in);
void serialize(const bn_& item, std::ostream& out);
void deserialize(bn_& item, std::istream& in);
/*!
provides serialization support
!*/
template <typename SUBNET>
using affine = add_layer<affine_, SUBNET>;
using bn = add_layer<bn_, SUBNET>;
// ----------------------------------------------------------------------------------------
enum batch_normalization_mode
{
BATCH_NORM_CONV = 0,
BATCH_NORM_FC = 1
};
class bn_
class affine_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_LAYER_ interface defined above.
In particular, it defines a batch normalization layer that implements the
method described in the paper:
Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
In particular, this layer produces output tensors with the same
dimensionality as the input tensors, except that the mean and variances of
the elements have been standardized.
In particular, it applies a simple pointwise linear transformation to an
input tensor. You can think of it as having two parameter tensors, A and
B. If the input tensor is called INPUT then the output of this layer is:
A*INPUT+B
where all operations are performed element wise and each sample in the
INPUT tensor is processed separately.
Moreover, this object has two modes that effect the dimensionalities of A
and B and how they are applied to compute A*INPUT+B. If
get_mode()==FC_MODE then A and B each have the same dimensionality as the
input tensor, except their num_samples() dimensions are 1. If
get_mode()==CONV_MODE then A and B have all their dimensions set to 1
except for k(), which is equal to INPUT.k().
In either case, the computation of A*INPUT+B is performed pointwise over all
the elements of INPUT using either:
OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c)
or
OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1)
as appropriate.
!*/
public:
bn_(
affine_(
);
/*!
ensures
- #get_mode() == BATCH_NORM_FC
- #get_mode() == FC_MODE
!*/
explicit bn_(
batch_normalization_mode mode
affine_(
const bn_& layer
);
/*!
ensures
- Constructs affine_ so that it performs the same transformation as the
supplied batch normalization layer. You would want to do this after you
finish training a network with bn_ layers because the affine_ layer will
execute faster.
- #get_mode() == layer.get_mode()
!*/
explicit affine_(
layer_mode mode
);
/*!
ensures
- #get_mode() == mode
!*/
batch_normalization_mode get_mode(
layer_mode get_mode(
) const;
/*!
ensures
- returns the mode of this layer, either BATCH_NORM_CONV or BATCH_NORM_FC.
If the mode is BATCH_NORM_FC then the normalization is applied across the
samples in a tensor (i.e. k()*nr()*nc() different things will be
normalized). Otherwise, normalization is applied across everything
except for the k() dimension, resulting in there being only k()
normalization equations that are applied spatially over the tensor.
Therefore, if you are putting batch normalization after a fully connected
layer you should use BATCH_NORM_FC. Otherwise, if you are putting batch
normalization after a convolutional layer you should use BATCH_NORM_CONV.
- returns the mode of this layer, either CONV_MODE or FC_MODE.
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......@@ -656,14 +726,14 @@ namespace dlib
!*/
};
void serialize(const bn_& item, std::ostream& out);
void deserialize(bn_& item, std::istream& in);
void serialize(const affine_& item, std::ostream& out);
void deserialize(affine_& item, std::istream& in);
/*!
provides serialization support
!*/
template <typename SUBNET>
using bn = add_layer<bn_, SUBNET>;
using affine = add_layer<affine_, SUBNET>;
// ----------------------------------------------------------------------------------------
......
......@@ -763,7 +763,101 @@ namespace
DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
}
#endif
// Cross-checks the CPU and CUDA implementations of the conv-mode tensor
// routines (multiply_conv, affine_transform_conv, assign_conv_bias_gradient)
// against each other on randomly sized tensors.  These are the primitives the
// affine_ layer uses when running in CONV_MODE, so agreement between the two
// backends is what this test pins down.
void test_more_ops2()
{
dlib::rand rnd;
tt::tensor_rand trand;
// Mode 1 of multiply_conv: src2 has shape (1,k,1,1) and is broadcast over
// src1; dest has the same shape as src1.  Then the reverse "reduction"
// mode where dest has shape (1,k,1,1) instead.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src1, src2;
// random tensor dimensions, each in [1,30]
src1.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src1);
dest2.copy_size(src1);
src2.set_size(1,src1.k(),1,1);
// dest tensors are also filled with random values so the test would
// notice if an implementation failed to overwrite its output.
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
// now try it using the other mode of multiply_conv
src2.copy_size(src1);
dest1.set_size(1,src1.k(),1,1);
dest2.set_size(1,src1.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
// The reduction mode sums many float products, so compare with relative
// (scaled) tolerances rather than the absolute 1e-5 used above.
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
// affine_transform_conv: per-channel A and B, shape (1,k,1,1), applied
// pointwise over src.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src, A, B;
src.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src);
dest2.copy_size(src);
A.set_size(1,src.k(),1,1);
B.set_size(1,src.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src);
trand.fill_uniform(A);
trand.fill_uniform(B);
cpu::affine_transform_conv(dest1, src, A, B);
cuda::affine_transform_conv(dest2, src, A, B);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
}
// assign_conv_bias_gradient: reduces a full gradient tensor g down to a
// per-channel (1,k,1,1) bias gradient.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, g;
g.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.set_size(1,g.k(),1,1);
dest2.set_size(1,g.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(g);
cpu::assign_conv_bias_gradient(dest1, g);
cuda::assign_conv_bias_gradient(dest2, g);
// again a summing reduction, so use relative tolerances
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
}
#endif // DLIB_USE_CUDA
// ----------------------------------------------------------------------------------------
......@@ -883,12 +977,22 @@ namespace
}
{
print_spinner();
affine_ l;
affine_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
affine_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l;
bn_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
......@@ -953,7 +1057,7 @@ namespace
template <typename T> using rcon = max_pool<relu<bn<con<T>>>>;
std::tuple<max_pool_,relu_,bn_,con_> rcon_ (unsigned long n)
{
return std::make_tuple(max_pool_(2,2,2,2),relu_(),bn_(BATCH_NORM_CONV),con_(n,5,5));
return std::make_tuple(max_pool_(2,2,2,2),relu_(),bn_(CONV_MODE),con_(n,5,5));
}
template <typename T> using rfc = relu<bn<fc<T>>>;
......@@ -996,6 +1100,7 @@ namespace
{
test_tagging();
#ifdef DLIB_USE_CUDA
test_more_ops2();
test_more_ops(1,1);
test_more_ops(3,4);
test_more_ops(4,3);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment