Commit 95cb5697 authored by Davis King

Cleaned up the trainer API and made the verbose output include information about how much progress is being made at the current step size.
parent c5f83cbe
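
For reference, each verbose status line now ends with the new progress counter. The line below is purely illustrative (made-up numbers), following the format assembled by the streaming code in this diff:

    step#: 1200   step size: 1        average loss: 0.32017   steps without apparent progress: 350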
@@ -142,7 +142,8 @@ namespace dlib
last_time = now_time;
std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
clear_average_loss();
}
@@ -167,7 +168,8 @@ namespace dlib
last_time = now_time;
std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
clear_average_loss();
}
@@ -207,7 +209,8 @@ namespace dlib
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
}
}
@@ -229,7 +232,8 @@ namespace dlib
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
}
}
@@ -270,7 +274,8 @@ namespace dlib
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
}
}
@@ -290,7 +295,8 @@ namespace dlib
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
}
}
@@ -359,17 +365,17 @@ namespace dlib
return min_step_size;
}
void set_iterations_between_step_size_adjust (
unsigned long min_iter
void set_iterations_without_progress_threshold (
unsigned long thresh
)
{
iter_between_step_size_adjust = min_iter;
iter_without_progress_thresh = thresh;
}
unsigned long get_iterations_between_step_size_adjust (
unsigned long get_iterations_without_progress_threshold (
) const
{
return iter_between_step_size_adjust;
return iter_without_progress_thresh;
}
void set_step_size_shrink_amount (
@@ -396,15 +402,16 @@ namespace dlib
void record_loss(double loss)
{
// Say that we will check if the gradient is bad 200 times during each
// iter_between_step_size_adjust interval of network updates. This kind of
// iter_without_progress_thresh interval of network updates. This kind of
// budgeting causes our gradient checking to use a fixed amount of
// computational resources, regardless of the size of
// iter_between_step_size_adjust.
// iter_without_progress_thresh.
gradient_check_budget += 200;
rs.add(loss);
previous_loss_values.push_back(loss);
if (previous_loss_values.size() > iter_between_step_size_adjust)
// discard really old loss values.
while (previous_loss_values.size() > iter_without_progress_thresh)
previous_loss_values.pop_front();
}
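
As an aside, the budgeting scheme in record_loss() keeps the cost of progress checking fixed: the expensive test runs only on the order of 200 times per threshold-sized window of mini-batches, no matter how large the threshold is. Below is a minimal standalone sketch of that counting (not dlib code; the names merely mirror the members above):

    #include <iostream>

    int main()
    {
        const unsigned long thresh = 2000;   // plays the role of iter_without_progress_thresh
        unsigned long budget = 0;
        unsigned long checks = 0;
        for (unsigned long step = 0; step < thresh; ++step)
        {
            budget += 200;                   // record_loss() adds 200 per recorded mini-batch loss
            if (budget > thresh)             // same test the training thread applies
            {
                budget = 0;
                ++checks;                    // this is where the expensive progress test would run
            }
        }
        // Prints a value on the order of 200, independent of how large thresh is made.
        std::cout << "progress checks per window: " << checks << "\n";
        return 0;
    }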
@@ -417,7 +424,7 @@ namespace dlib
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_wich_run_update;
no_label_type pick_which_run_update;
double loss = net.update(next_job.t, make_sstack(solvers), step_size);
record_loss(loss);
}
@@ -427,26 +434,28 @@ namespace dlib
// Make sure this thread uses the same cuda device as the thread that created
// the dnn_trainer object.
dlib::cuda::set_device(cuda_device_id);
label_type pick_wich_run_update;
label_type pick_which_run_update;
job_t next_job;
while(job_pipe.dequeue(next_job))
{
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
run_update(next_job, pick_wich_run_update);
run_update(next_job, pick_which_run_update);
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the step size. Note that we
// have a "budget" that prevents us from calling
// probability_gradient_greater_than() every iteration. We do this because
// count_steps_without_decrease() every iteration. We do this because
// it can be expensive to compute when previous_loss_values is large.
if (previous_loss_values.size() >= iter_between_step_size_adjust &&
gradient_check_budget > previous_loss_values.size())
if (gradient_check_budget > iter_without_progress_thresh)
{
gradient_check_budget = 0;
if (probability_gradient_greater_than(previous_loss_values, 0) > 0.49)
steps_without_progress = count_steps_without_decrease(previous_loss_values);
if (steps_without_progress >= iter_without_progress_thresh)
{
// optimization has flattened out, so drop the learning rate.
step_size = step_size_shrink*step_size;
steps_without_progress = 0;
previous_loss_values.clear();
}
}
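
The pick_which_run_update variable above is never read; it exists only so overload resolution selects the right run_update() overload, letting supervised and unsupervised training call the appropriate net.update() without any runtime branching. A small sketch of that tag-dispatch pattern with hypothetical stand-in types (not the real dlib definitions):

    #include <iostream>

    struct no_label_type {};            // stand-in for the unsupervised "no label" tag
    using label_type = unsigned long;   // stand-in for a real label type

    void run_update(const no_label_type&) { std::cout << "unsupervised update\n"; }
    void run_update(const label_type&)    { std::cout << "supervised update\n"; }

    int main()
    {
        label_type pick_which_run_update{};  // value never used, only its type matters
        run_update(pick_which_run_update);   // the compiler picks the supervised overload
        return 0;
    }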
@@ -475,7 +484,8 @@ namespace dlib
cuda_device_id = dlib::cuda::get_device();
step_size = 1;
min_step_size = 1e-3;
iter_between_step_size_adjust = 2000;
iter_without_progress_thresh = 2000;
steps_without_progress = 0;
step_size_shrink = 0.1;
epoch_iteration = 0;
epoch_pos = 0;
@@ -491,7 +501,7 @@ namespace dlib
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 4;
int version = 5;
serialize(version, out);
size_t nl = dnn_trainer::num_layers;
@@ -505,7 +515,8 @@ namespace dlib
serialize(item.solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.iter_between_step_size_adjust.load(), out);
serialize(item.iter_without_progress_thresh.load(), out);
serialize(item.steps_without_progress.load(), out);
serialize(item.step_size_shrink.load(), out);
serialize(item.epoch_iteration, out);
serialize(item.epoch_pos, out);
@@ -516,7 +527,7 @@ namespace dlib
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 4)
if (version != 5)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
size_t num_layers = 0;
@@ -540,7 +551,8 @@ namespace dlib
deserialize(item.solvers, in);
deserialize(dtemp, in); item.step_size = dtemp;
deserialize(item.min_step_size, in);
deserialize(ltemp, in); item.iter_between_step_size_adjust = ltemp;
deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp;
deserialize(ltemp, in); item.steps_without_progress = ltemp;
deserialize(dtemp, in); item.step_size_shrink = dtemp;
deserialize(item.epoch_iteration, in);
deserialize(item.epoch_pos, in);
@@ -592,7 +604,8 @@ namespace dlib
std::vector<solver_type> solvers;
std::atomic<double> step_size;
double min_step_size;
std::atomic<unsigned long> iter_between_step_size_adjust;
std::atomic<unsigned long> iter_without_progress_thresh;
std::atomic<unsigned long> steps_without_progress;
std::atomic<double> step_size_shrink;
std::chrono::time_point<std::chrono::system_clock> last_sync_time;
std::string sync_filename;
@@ -61,7 +61,7 @@ namespace dlib
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-3
- #get_iterations_between_step_size_adjust() == 2000
- #get_iterations_without_progress_threshold() == 2000
- #get_step_size_shrink() == 0.1
!*/
@@ -193,27 +193,30 @@ namespace dlib
training will terminate.
!*/
void set_iterations_between_step_size_adjust (
unsigned long min_iter
void set_iterations_without_progress_threshold (
unsigned long thresh
);
/*!
ensures
- #get_iterations_between_step_size_adjust() == min_iter
- #get_iterations_without_progress_threshold() == thresh
!*/
unsigned long get_iterations_between_step_size_adjust (
unsigned long get_iterations_without_progress_threshold (
) const;
/*!
ensures
- This object monitors the progress of training and estimates if the
training error is being reduced. It does this by looking at
get_iterations_between_step_size_adjust() mini-batch results and applying
the statistical test defined by the running_gradient object to see if the
training error is getting smaller.
Therefore, get_iterations_between_step_size_adjust() should always be set
to something sensibly large so that this test can be done with reasonably
high confidence.
training error is being reduced. It does this by looking at the previous
get_iterations_without_progress_threshold() mini-batch results and
applying the statistical test defined by the running_gradient object to
see if the training error is getting smaller. If it isn't being reduced
then get_step_size() is made smaller by a factor of get_step_size_shrink().
Therefore, get_iterations_without_progress_threshold() should always be
set to something sensibly large so that this test can be done with
reasonably high confidence. Think of this test as saying "if the loss
hasn't been reduced for the previous get_iterations_without_progress_threshold()
mini-batches then shrink the step size".
!*/
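
To make the renamed knobs concrete, a configuration sketch follows. It assumes trainer is an already-constructed dnn_trainer for some network and that training_images/training_labels hold the dataset; those names are placeholders rather than part of this diff:

    trainer.be_verbose();                                     // print the status lines shown in this commit
    trainer.set_iterations_without_progress_threshold(4000);  // tolerate 4000 flat mini-batches before shrinking
    trainer.set_step_size_shrink_amount(0.1);                 // multiply the step size by 0.1 when progress stalls
    trainer.train(training_images, training_labels);          // runs until the step size drops below get_min_step_size()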
void set_step_size_shrink_amount (