Commit 68412221 authored by Davis King

Improved the dnn_trainer. In particular, it no longer makes a copy of the
network (which would needlessly double VRAM usage).  I also added a
set_synchronization_file() method so you can tell it to automatically
synchronize itself to disk every so often during training.  This makes resuming
an interrupted training session trivially easy.
parent 4189386d
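
A minimal usage sketch of the reworked interface.  The network definition below is only a placeholder (layer names follow later dlib releases and may differ from the code at this commit), and the file name and data setup are illustrative rather than part of the commit:

    #include <dlib/dnn.h>
    #include <chrono>
    #include <vector>

    using namespace dlib;

    // Placeholder network; substitute whatever net_type is actually being trained.
    using net_type = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

    int main()
    {
        std::vector<matrix<float>> samples;    // filled in by the application
        std::vector<unsigned long> labels;

        net_type net;
        // The trainer now holds a reference to net rather than a copy of it.
        dnn_trainer<net_type> trainer(net);

        // Save the complete trainer state (including net) roughly every 5 minutes.
        // If trainer_state.dat already exists from an earlier run, this call
        // reloads it, so the train() call below resumes where that run stopped.
        trainer.set_synchronization_file("trainer_state.dat", std::chrono::minutes(5));

        trainer.train(samples, labels);

        // net itself now holds the trained parameters; trainer.get_net() refers
        // to the same object.
    }
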
@@ -8,6 +8,8 @@
#include "solvers.h"
#include "../statistics.h"
#include <chrono>
#include <fstream>
#include <sstream>
#include "../serialize.h"
#include "../pipe.h"
@@ -15,6 +17,7 @@
#include "cuda_dlib.h"
#include "../statistics/running_gradient.h"
#include <atomic>
#include <cstdio>
namespace dlib
{
@@ -34,22 +37,20 @@ namespace dlib
typedef typename net_type::label_type label_type;
typedef typename net_type::input_type input_type;
const static size_t num_layers = net_type::num_layers;
dnn_trainer(
) : job_pipe(0), solvers(net_type::num_layers)
{
init();
}
dnn_trainer() = delete;
dnn_trainer(const dnn_trainer&) = delete;
explicit dnn_trainer(const net_type& net_) : job_pipe(0), net(net_), solvers(net_type::num_layers)
explicit dnn_trainer(net_type& net_) : job_pipe(0), net(net_), solvers(num_layers)
{
init();
}
dnn_trainer(
const net_type& net_,
net_type& net_,
const solver_type& solver_
) : job_pipe(0), net(net_), solvers(net_type::num_layers, solver_)
) : job_pipe(0), net(net_), solvers(num_layers, solver_)
{
init();
}
@@ -62,27 +63,19 @@ namespace dlib
wait();
}
const net_type& get_net (
net_type& get_net (
) const
{
wait_for_thread_to_pause();
return net;
}
void set_net (
const net_type& net_
)
{
wait_for_thread_to_pause();
return net = net_;
}
void set_solver (
const solver_type& solver_
)
{
wait_for_thread_to_pause();
solvers = std::vector<solver_type>(net_type::num_layers, solver_);
solvers = std::vector<solver_type>(num_layers, solver_);
}
unsigned long get_mini_batch_size (
@@ -140,6 +133,7 @@ namespace dlib
const std::vector<label_type>& labels
)
{
sync_to_disk();
job.labels = labels;
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
@@ -149,32 +143,39 @@ namespace dlib
const std::vector<input_type>& data
)
{
sync_to_disk();
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
}
const net_type& train (
void train (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
)
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
for (unsigned long epoch_iteration = 0;
bool updated_the_network = false;
// The reason these two loops don't initialize their counter variables but
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
for (; epoch_pos < data.size() && step_size >= min_step_size; epoch_pos += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
sync_to_disk();
net.to_tensor(data.begin()+epoch_pos,
data.begin()+std::min(epoch_pos+mini_batch_size,data.size()),
job.t);
job.labels.assign(labels.begin()+i,
labels.begin()+std::min(i+mini_batch_size,data.size()));
job.labels.assign(labels.begin()+epoch_pos,
labels.begin()+std::min(epoch_pos+mini_batch_size,data.size()));
job_pipe.enqueue(job);
updated_the_network = true;
if (verbose)
@@ -183,14 +184,16 @@ namespace dlib
if (now_time-last_time > seconds(20))
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
}
epoch_pos = 0;
if (verbose)
{
@@ -202,10 +205,12 @@ namespace dlib
<< std::endl;
}
}
return get_net();
wait_for_thread_to_pause();
// if we modified the network at all then be sure to sync the final result.
sync_to_disk(updated_the_network);
}
const net_type& train (
void train (
const std::vector<input_type>& data
)
{
@@ -215,19 +220,25 @@ namespace dlib
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
for (unsigned long epoch_iteration = 0;
bool updated_the_network = false;
// The reason these two loops don't initialize their counter variables but
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
for (; epoch_pos < data.size() && step_size >= min_step_size; epoch_pos += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
sync_to_disk();
net.to_tensor(data.begin()+epoch_pos,
data.begin()+std::min(epoch_pos+mini_batch_size,data.size()),
job.t);
job_pipe.enqueue(job);
updated_the_network = true;
if (verbose)
@@ -236,7 +247,7 @@ namespace dlib
if (now_time-last_time > seconds(20))
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
@@ -244,6 +255,7 @@ namespace dlib
}
}
}
epoch_pos = 0;
if (verbose)
{
@@ -255,48 +267,34 @@ namespace dlib
<< std::endl;
}
}
return get_net();
wait_for_thread_to_pause();
// if we modified the network at all then be sure to sync the final result.
sync_to_disk(updated_the_network);
}
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 3;
serialize(version, out);
serialize(item.rs, out);
serialize(item.rg, out);
serialize(item.max_num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.iter_between_step_size_adjust.load(), out);
serialize(item.step_size_shrink.load(), out);
}
friend void deserialize(dnn_trainer& item, std::istream& in)
{
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 3)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
double temp;
deserialize(item.rs, in);
deserialize(item.rg, in);
deserialize(item.max_num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.solvers, in);
deserialize(temp, in); item.step_size = temp;
deserialize(item.min_step_size, in);
deserialize(temp, in); item.iter_between_step_size_adjust = temp;
deserialize(temp, in); item.step_size_shrink = temp;
}
void set_synchronization_file (
const std::string& filename,
std::chrono::seconds time_between_syncs_ = std::chrono::minutes(15)
)
{
last_sync_time = std::chrono::system_clock::now();
sync_filename = filename;
time_between_syncs = time_between_syncs_;
// check if the sync file already exists, if it does we should load it. We
// first check for a .tmp version since that would be the newest if it existed.
// If it doesn't exist we check the canonical sync file.
std::ifstream fin(filename+".tmp", std::ios::binary);
if (fin)
{
deserialize(*this, fin);
}
else
{
std::ifstream fin(filename, std::ios::binary);
if (fin)
deserialize(*this, fin);
}
}
double get_average_loss (
@@ -442,9 +440,102 @@ namespace dlib
min_step_size = 1e-4;
iter_between_step_size_adjust = 2000;
step_size_shrink = 0.1;
epoch_iteration = 0;
epoch_pos = 0;
start();
}
// serialize and deserialize are private because we hold net by reference so
// allowing someone to serialize this training object is weird and will likely
// result in user errors. However, we use these functions as part of the automatic
// sync code in this object.
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 3;
serialize(version, out);
size_t nl = dnn_trainer::num_layers;
serialize(nl, out);
serialize(item.rs, out);
serialize(item.rg, out);
serialize(item.max_num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.iter_between_step_size_adjust.load(), out);
serialize(item.step_size_shrink.load(), out);
serialize(item.epoch_iteration, out);
serialize(item.epoch_pos, out);
}
friend void deserialize(dnn_trainer& item, std::istream& in)
{
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 3)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
size_t num_layers = 0;
deserialize(num_layers, in);
if (num_layers != dnn_trainer::num_layers)
{
std::ostringstream sout;
sout << "Error deserializing dlib::dnn_trainer. The saved sync file is for a network with " << std::endl;
sout << "a different number of layers. We expected the number of layers to be " << dnn_trainer::num_layers << " but" << std::endl;
sout << "instead the file contains " << num_layers << " layers." << std::endl;
throw serialization_error(sout.str());
}
double dtemp; long ltemp;
deserialize(item.rs, in);
deserialize(item.rg, in);
deserialize(item.max_num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.solvers, in);
deserialize(dtemp, in); item.step_size = dtemp;
deserialize(item.min_step_size, in);
deserialize(ltemp, in); item.iter_between_step_size_adjust = ltemp;
deserialize(dtemp, in); item.step_size_shrink = dtemp;
deserialize(item.epoch_iteration, in);
deserialize(item.epoch_pos, in);
}
void sync_to_disk (
bool do_it_now = false
)
{
// If the sync file isn't set then don't do anything.
if (sync_filename.size() == 0)
return;
// Only sync if it has been long enough since the last sync or we are being
// explicitly forced to do it.
if (std::chrono::system_clock::now() - last_sync_time > time_between_syncs ||
do_it_now)
{
// save our state to a temp file
std::string tempfile = sync_filename + ".tmp";
std::ofstream fout(tempfile, std::ios::binary);
serialize(*this, fout);
fout.close();
// Now that we know the state is safely saved to disk, delete the old sync
// file and move the .tmp file to it.
std::remove(sync_filename.c_str());
std::rename(tempfile.c_str(), sync_filename.c_str());
last_sync_time = std::chrono::system_clock::now();
if (verbose)
std::cout << "Saved state to " << sync_filename << std::endl;
}
}
dlib::pipe<job_t> job_pipe;
running_stats<double> rs;
@@ -453,12 +544,17 @@ namespace dlib
size_t mini_batch_size;
bool verbose;
int cuda_device_id;
net_type net;
net_type& net;
std::vector<solver_type> solvers;
std::atomic<double> step_size;
double min_step_size;
std::atomic<long> iter_between_step_size_adjust;
std::atomic<double> step_size_shrink;
std::chrono::time_point<std::chrono::system_clock> last_sync_time;
std::string sync_filename;
std::chrono::seconds time_between_syncs;
unsigned long epoch_iteration;
unsigned long epoch_pos;
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
......
@@ -6,6 +6,7 @@
#include "core_abstract.h"
#include "solvers_abstract.h"
#include <vector>
#include <chrono>
namespace dlib
@@ -39,43 +40,21 @@ namespace dlib
typedef typename net_type::label_type label_type;
typedef typename net_type::input_type input_type;
const static size_t num_layers = net_type::num_layers;
dnn_trainer(
);
/*!
ensures
- #get_net() == a default initialized net_type object.
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
explicit dnn_trainer(
const net_type& net
);
/*!
ensures
- #get_net() == net
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
dnn_trainer() = delete;
dnn_trainer(const dnn_trainer&) = delete;
dnn_trainer(
const net_type& net,
const solver_type& solver
net_type& net,
const solver_type& solver = solver_type()
);
/*!
ensures
- #get_net() == net
- &#get_net() == &net
(i.e. The dnn_trainer holds a reference to net, it does not copy it.
Therefore, you must ensure net has a lifetime at least as long as the
dnn_trainer).
- #get_solvers() == a set of solvers that are all initialized with the
provided solver instance.
- #get_max_num_epochs() == 10000
@@ -86,20 +65,15 @@ namespace dlib
- #get_step_size_shrink() == 0.1
!*/
const net_type& get_net (
net_type& get_net (
) const;
/*!
ensures
- returns the neural network object in this trainer. This is the network
that is optimized when you call train().
!*/
void set_net (
const net_type& net
);
/*!
ensures
- #get_net() == net
- returns the neural network object used by this trainer. This is the
network that is optimized when you call train() or train_one_step().
Recall that the dnn_trainer doesn't contain the net_type object but
simply holds a reference to an external network which was provided to the
dnn_trainer's constructor.
!*/
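/*
    Since get_net() hands back a reference to the externally owned network, saving
    the trained model amounts to serializing that object directly.  A rough sketch,
    assuming net is the object passed to the trainer's constructor and the file
    name is arbitrary:

        trainer.train(data, labels);
        std::ofstream fout("trained_net.dat", std::ios::binary);
        serialize(net, fout);   // same object as trainer.get_net()
*/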
void set_solver (
@@ -275,7 +249,23 @@ namespace dlib
- This object will not print anything to standard out
!*/
const net_type& train (
void set_synchronization_file (
const std::string& filename,
std::chrono::seconds time_between_syncs = std::chrono::minutes(15)
);
/*!
ensures
- While training is running, either via train() or repeated calls to
train_one_step(), this object will save its entire state, including the
state of get_net(), to disk in the file named filename every
time_between_syncs seconds.
- if the filename file already exists then the state of this trainer will
be loaded from that file by this call to set_synchronization_file().
This allows you to resume a training session which was previously
interrupted.
!*/
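/*
    A rough usage sketch of the contract above, assuming a trainer and net set up
    as in the constructor spec; the mini-batch helpers are hypothetical stand-ins
    for whatever data source the application uses:

        trainer.set_synchronization_file("state.dat", std::chrono::minutes(15));
        while (more_mini_batches_available())             // hypothetical helper
        {
            auto samples = next_mini_batch_samples();     // hypothetical helper
            auto labels  = next_mini_batch_labels();      // hypothetical helper
            // Each call below checks the sync timer and rewrites state.dat once
            // 15 minutes have passed since the last save.
            trainer.train_one_step(samples, labels);
        }
*/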
void train (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
);
@@ -292,22 +282,17 @@ namespace dlib
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)
- Each call to train DOES NOT reinitialize the state of get_net() or
get_solvers(). That is, the state of the solvers and network contained
inside this trainer is the starting point for the optimization each time
train() is called. For example, calling train() 1 time and having it
execute 100 epochs of training is equivalent to calling train() 10 times
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
get_solvers(). That is, the existing state of the solvers and network is
the starting point for the optimization each time train() is called. In
particular, if you use the set_synchronization_file() method you can
resume an interrupted train() call by simply calling train() again and it
will pick up from the last synchronization point.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
const net_type& train (
void train (
const std::vector<input_type>& data
);
/*!
@@ -322,17 +307,12 @@ namespace dlib
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)
- Each call to train DOES NOT reinitialize the state of get_net() or
get_solvers(). That is, the state of the solvers and network contained
inside this trainer is the starting point for the optimization each time
train() is called. For example, calling train() 1 time and having it
execute 100 epochs of training is equivalent to calling train() 10 times
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
get_solvers(). That is, the existing state of the solvers and network is
the starting point for the optimization each time train() is called. In
particular, if you use the set_synchronization_file() method you can
resume an interrupted train() call by simply calling train() again and it
will pick up from the last synchronization point.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
@@ -398,14 +378,6 @@ namespace dlib
};
template <typename T, typename U>
void serialize(const dnn_trainer<T,U>& item, std::ostream& out);
template <typename T, typename U>
void deserialize(dnn_trainer<T,U>& item, std::istream& in);
/*!
provides serialization support
!*/
// ----------------------------------------------------------------------------------------
}
......