Commit b2faad0d authored by Davis King

Added l2normalize_ layer.

parent a390f109
@@ -108,6 +108,135 @@ namespace dlib
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
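// A note on the reduction pattern used by _cuda_inverse_norms and
// _cuda_dot_prods below: threads stride over the rows of the data in the
// grid's y dimension and over the columns of one row in its x dimension.
// Each thread accumulates a private partial sum for its row, and the partial
// sums are then combined with warp_reduce_atomic_add().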
__global__ void _cuda_inverse_norms(float* invnorms, const float* data, size_t nr, size_t nc, const float eps)
{
// initialize invnorms before we begin.
for (auto i : grid_stride_range(0, nr))
invnorms[i] = eps;
__syncthreads();
for (auto i : grid_stride_range_y(0, nr))
{
auto p = data + i*nc;
float temp = 0;
for (auto j : grid_stride_range(0, nc))
temp += p[j]*p[j];
// and store the sum into invnorms[i]
warp_reduce_atomic_add(invnorms[i], temp);
}
__syncthreads();
for (auto j : grid_stride_range(0, nr))
{
invnorms[j] = 1.0/std::sqrt(invnorms[j]);
}
}
void inverse_norms (
resizable_tensor& invnorms,
const tensor& data,
const double eps
)
{
invnorms.set_size(data.num_samples());
launch_kernel(_cuda_inverse_norms,max_jobs(data.size()), invnorms.device(), data.device(), data.num_samples(), data.size()/data.num_samples(), eps);
}
// ----------------------------------------------------------------------------------------
__global__ void _cuda_dot_prods(float* out, const float* lhs, const float* rhs, size_t nr, size_t nc)
{
// initialize out before we begin.
for (auto i : grid_stride_range(0, nr))
out[i] = 0;
__syncthreads();
for (auto i : grid_stride_range_y(0, nr))
{
auto l = lhs + i*nc;
auto r = rhs + i*nc;
float temp = 0;
for (auto j : grid_stride_range(0, nc))
temp += l[j]*r[j];
// and store the sum into out[i]
warp_reduce_atomic_add(out[i], temp);
}
}
void dot_prods (
resizable_tensor& out,
const tensor& lhs,
const tensor& rhs
)
{
out.set_size(lhs.num_samples());
launch_kernel(_cuda_dot_prods, max_jobs(lhs.size()), out.device(), lhs.device(), rhs.device(), lhs.num_samples(), lhs.size()/lhs.num_samples());
}
// ----------------------------------------------------------------------------------------
__global__ void _cuda_scale_rows(float* out, const float* m, const float* v, size_t nr, size_t nc)
{
for (auto j : grid_stride_range(0, nr*nc))
{
out[j] = m[j]*v[j/nc];
}
}
void scale_rows (
tensor& out,
const tensor& m,
const tensor& v
)
{
launch_kernel(_cuda_scale_rows, max_jobs(m.size()), out.device(), m.device(), v.device(), m.num_samples(), m.size()/m.num_samples());
}
// ----------------------------------------------------------------------------------------
__global__ void _cuda_scale_rows2(float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
{
for (auto j : grid_stride_range(0, nr*nc))
{
out[j] = (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
}
}
__global__ void _cuda_scale_rows2_beta(const float beta, float* out, const float* m1, const float* m2, const float* v1, const float* v2, size_t nr, size_t nc)
{
for (auto j : grid_stride_range(0, nr*nc))
{
out[j] = beta*out[j] + (m1[j] - m2[j]*v1[j/nc]) * v2[j/nc];
}
}
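// scale_rows2() below dispatches to a separate kernel when beta == 0 so that
// out is only ever written, never read; otherwise beta*out[j] could turn an
// uninitialized (possibly NaN) output buffer into NaNs.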
void scale_rows2 (
float beta,
tensor& out,
const tensor& m1,
const tensor& m2,
const tensor& v1,
const tensor& v2
)
{
if (beta == 0)
{
launch_kernel(_cuda_scale_rows2, max_jobs(m1.size()), out.device(),
m1.device(), m2.device(), v1.device(), v2.device(), m1.num_samples(),
m1.size()/m1.num_samples());
}
else
{
launch_kernel(_cuda_scale_rows2_beta, max_jobs(m1.size()), beta,
out.device(), m1.device(), m2.device(), v1.device(), v2.device(),
m1.num_samples(), m1.size()/m1.num_samples());
}
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
......
@@ -108,6 +108,35 @@ namespace dlib
// -----------------------------------------------------------------------------------
void inverse_norms (
resizable_tensor& invnorms,
const tensor& data,
const double eps
);
void dot_prods (
resizable_tensor& out,
const tensor& lhs,
const tensor& rhs
);
void scale_rows (
tensor& out,
const tensor& m,
const tensor& v
);
void scale_rows2 (
float beta,
tensor& out,
const tensor& m1,
const tensor& m2,
const tensor& v1,
const tensor& v2
);
// ------------------------------------------------------------------------------------
void multiply (
bool add_to,
tensor& dest,
......
@@ -2277,6 +2277,98 @@ namespace dlib
typename SUBNET>
using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
const double DEFAULT_L2_NORM_EPS = 1e-5;
class l2normalize_
{
public:
explicit l2normalize_(
double eps_ = DEFAULT_L2_NORM_EPS
) :
eps(eps_)
{
}
double get_eps() const { return eps; }
template <typename SUBNET>
void setup (const SUBNET& /*sub*/)
{
}
void forward_inplace(const tensor& input, tensor& output)
{
tt::inverse_norms(norm, input, eps);
tt::scale_rows(output, input, norm);
}
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor& /*params_grad*/
)
{
if (is_same_object(gradient_input, data_grad))
{
tt::dot_prods(temp, gradient_input, computed_output);
tt::scale_rows2(0, data_grad, gradient_input, computed_output, temp, norm);
}
else
{
tt::dot_prods(temp, gradient_input, computed_output);
tt::scale_rows2(1, data_grad, gradient_input, computed_output, temp, norm);
}
}
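// A sketch of why the gradient takes this form: writing one input row as x,
// s = 1/sqrt(||x||^2 + eps), and the corresponding output row as y = s*x, the
// chain rule gives, for an incoming gradient row g,
//     dL/dx = s*(g - y*dot(g,y))
// which is exactly what scale_rows2(beta, data_grad, g, y, dot(g,y), s)
// computes.  beta selects whether data_grad is overwritten (beta==0, used when
// gradient_input and data_grad are the same tensor) or accumulated into
// (beta==1), following the usual backward_inplace() convention.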
const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }
friend void serialize(const l2normalize_& item, std::ostream& out)
{
serialize("l2normalize_", out);
serialize(item.eps, out);
}
friend void deserialize(l2normalize_& item, std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "l2normalize_")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::l2normalize_.");
deserialize(item.eps, in);
}
friend std::ostream& operator<<(std::ostream& out, const l2normalize_& item)
{
out << "l2normalize";
out << " eps="<<item.eps;
return out;
}
friend void to_xml(const l2normalize_& item, std::ostream& out)
{
out << "<l2normalize";
out << " eps='"<<item.eps<<"'";
out << "/>\n";
}
private:
double eps;
resizable_tensor params; // unused
// Here only to avoid reallocation and as a cache between forward/backward
// functions.
resizable_tensor norm;
resizable_tensor temp;
};
template <typename SUBNET>
using l2normalize = add_layer<l2normalize_, SUBNET>;
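// For example, an l2normalize layer is typically placed at the top of a
// network that outputs embedding vectors, so that every descriptor it
// produces has unit length.  A minimal sketch (the layer sizes here are
// purely illustrative, not taken from dlib's examples):
//
//   using toy_embedder = l2normalize<fc<128,relu<fc<256,input<matrix<float>>>>>>;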
// ----------------------------------------------------------------------------------------
}
......
@@ -1930,6 +1930,56 @@ namespace dlib
using inception5 = concat5<itag1, itag2, itag3, itag4, itag5,
itag1<B1<iskip< itag2<B2<iskip< itag3<B3<iskip< itag4<B4<iskip< itag5<B5< itag0<SUBNET>>>>>>>>>>>>>>>>;
// ----------------------------------------------------------------------------------------
const double DEFAULT_L2_NORM_EPS = 1e-5;
class l2normalize_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
defined above. It takes tensors as input and L2 normalizes them. In particular,
it has the following properties:
- The output tensors from this layer have the same dimensions as the
input tensors.
- If you think of each input tensor as a set of tensor::num_samples()
vectors, then the output tensor contains the same vectors except they
have been length normalized so that their L2 norms are all 1. I.e.
for each vector v we will have ||v||==1.
!*/
public:
explicit l2normalize_(
double eps = tt::DEFAULT_L2_NORM_EPS
);
/*!
requires
- eps > 0
ensures
- #get_eps() == eps
!*/
double get_eps(
) const;
/*!
ensures
- When we normalize a vector we divide it by its L2 norm. However, the
get_eps() value is added to the squared norm prior to division to avoid
ever dividing by zero.
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
!*/
};
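// To make the normalization concrete: if one sample of the input tensor,
// viewed as a vector, is (3,4,0,...,0) then its L2 norm is 5, so with a
// negligible eps the corresponding output sample is (0.6,0.8,0,...,0), which
// has unit length.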
// ----------------------------------------------------------------------------------------
}
......
@@ -41,6 +41,74 @@ namespace dlib
namespace dlib { namespace tt
{
// ----------------------------------------------------------------------------------------
void inverse_norms (
resizable_tensor& invnorms,
const tensor& data,
const double eps
)
{
#ifdef DLIB_USE_CUDA
cuda::inverse_norms(invnorms, data, eps);
#else
invnorms = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
#endif
}
void dot_prods (
resizable_tensor& out,
const tensor& lhs,
const tensor& rhs
)
{
#ifdef DLIB_USE_CUDA
cuda::dot_prods(out, lhs, rhs);
#else
out = sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
#endif
}
void scale_rows (
tensor& out,
const tensor& m,
const tensor& v
)
{
DLIB_CASSERT(have_same_dimensions(out,m));
#ifdef DLIB_USE_CUDA
cuda::scale_rows(out, m, v);
#else
out = scale_rows(mat(m), mat(v));
#endif
}
void scale_rows2 (
float beta,
tensor& out,
const tensor& m1,
const tensor& m2,
const tensor& v1,
const tensor& v2
)
{
DLIB_CASSERT(have_same_dimensions(out,m1));
DLIB_CASSERT(have_same_dimensions(out,m2));
DLIB_CASSERT(have_same_dimensions(v1,v2));
DLIB_CASSERT(is_vector(mat(v1)));
DLIB_CASSERT(v1.size() == m1.num_samples());
#ifdef DLIB_USE_CUDA
cuda::scale_rows2(beta, out, m1, m2, v1, v2);
#else
if (beta == 0)
out = scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
else
out = beta*mat(out) + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
#endif
}
// ----------------------------------------------------------------------------------------
void gemm (
......
@@ -22,6 +22,66 @@ namespace dlib
namespace dlib { namespace tt
{
// ----------------------------------------------------------------------------------------
void inverse_norms (
resizable_tensor& invnorms,
const tensor& data,
const double eps
);
/*!
ensures
- #invnorms == reciprocal(sqrt(sum_cols(squared(mat(data))) + eps))
!*/
void dot_prods (
resizable_tensor& out,
const tensor& lhs,
const tensor& rhs
);
/*!
requires
- have_same_dimensions(lhs,rhs) == true
ensures
- #out.num_samples() == lhs.num_samples()
- #out.k() == #out.nr() == #out.nc() == 1
- #out == sum_cols(pointwise_multiply(mat(lhs), mat(rhs)));
!*/
void scale_rows (
tensor& out,
const tensor& m,
const tensor& v
);
/*!
requires
- have_same_dimensions(out,m) == true
- is_vector(mat(v)) == true
- v.size() == m.num_samples()
ensures
- performs: out = scale_rows(mat(m),mat(v));
!*/
void scale_rows2 (
float beta,
tensor& out,
const tensor& m1,
const tensor& m2,
const tensor& v1,
const tensor& v2
);
/*!
requires
- have_same_dimensions(out,m1) == true
- have_same_dimensions(out,m2) == true
- have_same_dimensions(v1,v2) == true
- is_vector(mat(v1)) == true
- v1.size() == m1.num_samples()
ensures
- performs:
out = beta*out + scale_rows(mat(m1) - scale_rows(mat(m2),mat(v1)), mat(v2));
!*/
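// Taken together, these four routines are all a row-wise L2 normalization
// layer needs.  A rough sketch of how they compose, assuming x is the input
// tensor, g the incoming gradient, and y and dx have already been sized to
// match x (the names here are illustrative):
//
//   resizable_tensor s, d;
//   inverse_norms(s, x, eps);        // s(i) == 1/sqrt(||x_i||^2 + eps)
//   scale_rows(y, x, s);             // forward:  y_i == x_i * s(i)
//   dot_prods(d, g, y);              // d(i) == dot(g_i, y_i)
//   scale_rows2(0, dx, g, y, d, s);  // backward: dx_i == (g_i - y_i*d(i)) * s(i)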
// ----------------------------------------------------------------------------------------
void gemm (
......
@@ -927,7 +927,6 @@ namespace
A = dest;
B = dest;
tensor_rand rnd;
rnd.fill_uniform(dest);
rnd.fill_uniform(A);
rnd.fill_uniform(B);
@@ -960,6 +959,23 @@ namespace
cuda::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
}
{
resizable_tensor invnorms1, invnorms2;
resizable_tensor data(4,5), out1, out2;
rnd.fill_uniform(data);
const double eps = 0.1;
invnorms2 = reciprocal(sqrt(sum_cols(squared(mat(data))) + eps));
tt::inverse_norms(invnorms1, data, eps);
DLIB_TEST(max(abs(mat(invnorms1)-mat(invnorms2))) < 1e-6);
out1.copy_size(data);
tt::scale_rows(out1, data, invnorms1);
out2 = scale_rows(mat(data), mat(invnorms1));
DLIB_TEST(max(abs(mat(out1)-mat(out2))) < 1e-6);
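// A similar spot check for the remaining helpers could be added here,
// reusing data, out1, and invnorms1 from above (a sketch only):
//
//   resizable_tensor d1, d2, out3;
//   tt::dot_prods(d1, data, out1);
//   d2 = sum_cols(pointwise_multiply(mat(data), mat(out1)));
//   DLIB_TEST(max(abs(mat(d1)-mat(d2))) < 1e-6);
//   out3.copy_size(data);
//   tt::scale_rows2(0, out3, data, out1, d1, invnorms1);
//   DLIB_TEST(max(abs(mat(out3)-scale_rows(mat(data)-scale_rows(mat(out1),mat(d1)),mat(invnorms1)))) < 1e-6);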
}
}
// ----------------------------------------------------------------------------------------
@@ -1302,6 +1318,12 @@ namespace
void test_layers()
{
{
print_spinner();
l2normalize_ l;
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
{
print_spinner();
multiply_ l;
......