Commit 168574bd authored by Davis King

Added visit_layer_parameter_gradients() and also fixed a silly synchronization
error in the multi-gpu training code.
parent d31723ff
@@ -3049,6 +3049,73 @@ namespace dlib
        impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
// ----------------------------------------------------------------------------------------
    namespace impl
    {
        template <size_t i, size_t num>
        struct vlpg_loop
        {
            template <typename T, typename U>
            static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
            {
                // intentionally left empty
            }

            template <typename T, typename U>
            static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
            {
                v(comp_i, l.get_parameter_gradient());
                ++comp_i;
            }

            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t comp_i,
                net_type& net,
                visitor&& v
            )
            {
                invoke_functor(v, comp_i, layer<i>(net));
                vlpg_loop<i+1, num>::visit(comp_i, net, v);
            }
        };

        template <size_t num>
        struct vlpg_loop<num,num>
        {
            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t,
                net_type&,
                visitor&&
            )
            {
                // Base case of recursion.  Don't do anything.
            }
        };
    }

    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameter_gradients(
        net_type& net,
        visitor v
    )
    {
        size_t comp_i = 0;
        impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
// ----------------------------------------------------------------------------------------

}
...
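
The new visit_layer_parameter_gradients() mirrors the existing visit_layer_parameters(), but hands the visitor each computational layer's parameter gradient tensor instead of its parameters. A minimal usage sketch follows; the network type is purely illustrative and not part of this commit, and gradients are only non-empty after a trainer (or a manual forward/backward pass) has filled them in:

    #include <dlib/dnn.h>
    #include <iostream>

    using namespace dlib;

    // An arbitrary small network, purely for illustration.
    using net_type = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

    int main()
    {
        net_type net;

        // Sum the squared gradient norms over all computational layers.  Empty
        // tensors (layers whose gradients haven't been computed yet) are skipped,
        // just like the trainer code in this commit does.
        double grad_norm_sq = 0;
        visit_layer_parameter_gradients(net, [&grad_norm_sq](size_t idx, tensor& grad)
        {
            if (grad.size() != 0)
                grad_norm_sq += sum(pointwise_multiply(mat(grad), mat(grad)));
            std::cout << "layer " << idx << " gradient elements: " << grad.size() << std::endl;
        });
        std::cout << "sum of squared gradient norms: " << grad_norm_sq << std::endl;
    }
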
@@ -1348,6 +1348,40 @@ namespace dlib
            - When v() is called, the first argument is always < net_type::num_computational_layers.
    !*/
// ----------------------------------------------------------------------------------------
    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameter_gradients(
        net_type& net,
        visitor v
    );
    /*!
        requires
            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
              add_tag_layer.
            - v is a function object with a signature equivalent to:
                v(size_t idx, tensor& t)
        ensures
            - Loops over all the computational layers (i.e. layers with parameters, as
              opposed to loss, tag, or input layers) in net and passes their parameter
              gradients to v().  To be specific, this function essentially performs the
              following:

                size_t computational_layer_idx = 0;
                for (size_t i = 0; i < net_type::num_layers; ++i)
                {
                    if (layer<i>(net) is a computational layer)
                    {
                        v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
                        ++computational_layer_idx;
                    }
                }
            - When v() is called, the first argument is always < net_type::num_computational_layers.
    !*/
// ----------------------------------------------------------------------------------------

    struct layer_test_results
...
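
The guarantee above that the visitor's index always stays below net_type::num_computational_layers is what makes it safe to index directly into a pre-sized per-layer buffer, which is exactly the pattern the trainer change below relies on. A small sketch of that pattern; the helper name accumulate_gradients is illustrative, not a dlib API:

    #include <dlib/dnn.h>
    #include <vector>

    using namespace dlib;

    // Add every layer's parameter gradient into a per-layer host-side buffer.
    // buffer[idx] is always a valid element because idx < num_computational_layers.
    template <typename net_type>
    void accumulate_gradients(net_type& net, std::vector<matrix<float>>& buffer)
    {
        buffer.resize(net_type::num_computational_layers);
        visit_layer_parameter_gradients(net, [&buffer](size_t idx, tensor& grad)
        {
            if (grad.size() != 0)
            {
                if (buffer[idx].size() == 0)
                    buffer[idx] = mat(grad);   // first contribution defines the size
                else
                    buffer[idx] += mat(grad);  // later contributions are summed
            }
        });
    }
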
@@ -501,9 +501,12 @@ namespace dlib
std::vector<std::future<double>> losses(devices.size());
std::vector<std::future<void>> update_futs(devices.size());
std::vector<matrix<float>> param_buffer(net_type::num_computational_layers);
std::vector<matrix<float>> param_grad_buffer(net_type::num_computational_layers);

size_t iteration = 0;
while(job_pipe.dequeue(next_job))
{
    ++iteration;
    // Call compute_parameter_gradients() and update_parameters() but pick the
    // right version for unsupervised or supervised training based on the type
    // of label_type.
@@ -517,17 +520,45 @@
    // gradient updates between devices.  So we do that now.
    if (devices.size() > 1)
    {
        for (auto&& p : param_grad_buffer)
            p = 0;
        // now average all the parameter gradients
        for (size_t i = 0; i < devices.size(); ++i)
        {
            visit_layer_parameter_gradients(devices[i]->net, [&param_grad_buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                    param_grad_buffer[j] += mat(t);
            });
        }
        // and then assign the parameter gradients back to all the networks
        const float scale = 1.0f/devices.size();
        for (size_t i = 0; i < devices.size(); ++i)
        {
            visit_layer_parameter_gradients(devices[i]->net, [scale,&param_grad_buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                {
                    t = param_grad_buffer[j]*scale;
                    t.async_copy_to_device();
                }
            });
        }
        // Every now and then force all the parameters to be the same just to
        // make sure they aren't drifting apart due to any non-deterministic
        // behavior on the GPU.
        if (iteration%5000 == 1)
        {
            for (auto&& p : param_buffer)
                p = 0;
            // now average all the parameters
            for (size_t i = 0; i < devices.size(); ++i)
            {
                visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
                    if (t.size() != 0)
                        param_buffer[j] += mat(t);
                });
            }
            // and then assign the parameters back to all the networks.
            const float scale = 1.0f/devices.size();
            for (size_t i = 0; i < devices.size(); ++i)
            {
@@ -540,6 +571,7 @@ namespace dlib
                });
            }
        }
    }

    // Now apply all the updates to each device.
...
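
The trainer fix above does two things. Each iteration the parameter gradients are now averaged across devices via the new visit_layer_parameter_gradients() (the old code summed the parameters into param_buffer here, despite the comment saying it was averaging gradients, which is the synchronization error mentioned in the commit message). In addition, every 5000 iterations the parameters themselves are averaged as a safety net, so that non-deterministic GPU arithmetic can't let the replicas drift apart; because iteration is incremented before the check, iteration%5000 == 1 also triggers this sync on the very first pass through the loop. The gradient-averaging pattern, distilled from the diff into a standalone helper (the function name and the raw-pointer container are illustrative, not dlib APIs):

    #include <dlib/dnn.h>
    #include <vector>

    using namespace dlib;

    // Illustrative helper (not a dlib API): average the parameter gradients of
    // several replicas of the same network and write the average back into each
    // replica, so every device applies an identical update step.
    template <typename net_type>
    void average_parameter_gradients(std::vector<net_type*>& replicas)
    {
        std::vector<matrix<float>> buffer(net_type::num_computational_layers);
        for (auto&& b : buffer)
            b = 0;

        // Sum each layer's gradient over all replicas.
        for (auto* r : replicas)
        {
            visit_layer_parameter_gradients(*r, [&buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                    buffer[j] += mat(t);
            });
        }

        // Write the averaged gradient back into every replica.
        const float scale = 1.0f/replicas.size();
        for (auto* r : replicas)
        {
            visit_layer_parameter_gradients(*r, [scale,&buffer](size_t j, tensor& t) {
                if (t.size() != 0)
                {
                    t = buffer[j]*scale;
                    t.async_copy_to_device();
                }
            });
        }
    }

Averaging the gradients keeps the replicas taking the same step every iteration, while the periodic parameter average is only a backstop against slow numerical drift.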