Commit 3b75b335 authored by Davis King

Gave dnn_trainer the ability to train on out-of-core data by adding the
train_one_step() member function.  Also improved how host-to-device transfers
are overlapped with kernel computation.
parent adec3eef
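For illustration, a rough sketch of the intended out-of-core workflow (the network type and the data-loading helper below are placeholders, not part of this commit):

    // Convert and enqueue one mini-batch per call; the trainer's worker thread
    // runs the actual update.  While that update executes on the device, the
    // caller is free to load and convert the next mini-batch on the host, which
    // is where the transfer/compute overlap comes from.
    dnn_trainer<my_net_type> trainer(net);
    while (load_next_mini_batch_from_disk(samples, labels))   // placeholder helper
        trainer.train_one_step(samples, labels);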
......@@ -1284,8 +1284,9 @@ namespace dlib
// "no label". So here we make the constructor private with the exception that
// add_loss_layer objects can make it (again, just to simplify add_loss_layer's
// implementation).
no_label_type()=default;
no_label_type(){};
template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
template < typename net_type, typename solver_type > friend class dnn_trainer;
};
// ----------------------------------------------------------------------------------------
......
......@@ -10,6 +10,23 @@ namespace dlib
namespace cuda
{
// -----------------------------------------------------------------------------------
void set_device (
int dev
)
{
CHECK_CUDA(cudaSetDevice(dev));
}
int get_device (
)
{
int dev = 0;
CHECK_CUDA(cudaGetDevice(&dev));
return dev;
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_multiply(float* d, const float* s, size_t n)
......
......@@ -3,7 +3,6 @@
#ifndef DLIB_DNN_CuDA_H_
#define DLIB_DNN_CuDA_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
......@@ -12,6 +11,17 @@ namespace dlib
namespace cuda
{
#ifdef DLIB_USE_CUDA
// ----------------------------------------------------------------------------------------
void set_device (
int dev
);
int get_device (
);
// -----------------------------------------------------------------------------------
void multiply (
......@@ -120,11 +130,24 @@ namespace dlib
);
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
#else // if DLIB_USE_CUDA NOT DEFINED
inline void set_device (
int dev
){}
inline int get_device (
){ return 0; }
#endif // DLIB_USE_CUDA
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_H_
......@@ -11,6 +11,10 @@
#include <chrono>
#include "../serialize.h"
#include "../pipe.h"
#include "../threads.h"
#include "cuda_dlib.h"
namespace dlib
{
......@@ -20,7 +24,7 @@ namespace dlib
typename net_type,
typename solver_type = sgd
>
class dnn_trainer
class dnn_trainer : private threaded_object
{
public:
......@@ -31,12 +35,12 @@ namespace dlib
typedef typename net_type::input_type input_type;
dnn_trainer(
)
) : job_pipe(0)
{
init();
}
explicit dnn_trainer(const net_type& net_) : net(net_)
explicit dnn_trainer(const net_type& net_) : job_pipe(0), net(net_)
{
init();
}
......@@ -44,18 +48,31 @@ namespace dlib
dnn_trainer(
const net_type& net_,
const solver_type& solver_
) : net(net_), solvers(solver_)
) : job_pipe(0), net(net_), solvers(solver_)
{
init();
}
~dnn_trainer(
)
{
job_pipe.disable();
stop();
wait();
}
const net_type& get_net (
) const { return net; }
) const
{
wait_for_thread_to_pause();
return net;
}
void set_net (
const net_type& net_
)
{
wait_for_thread_to_pause();
net = net_;
}
......@@ -63,6 +80,7 @@ namespace dlib
const solver_type& solver_
)
{
wait_for_thread_to_pause();
solvers = solver_;
}
......@@ -102,10 +120,37 @@ namespace dlib
const sstack<solver_type,net_type::num_layers>& get_solvers (
) const { return solvers; }
) const
{
wait_for_thread_to_pause();
return solvers;
}
sstack<solver_type,net_type::num_layers>& get_solvers (
) { return solvers; }
)
{
wait_for_thread_to_pause();
return solvers;
}
void train_one_step (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
)
{
job.labels = labels;
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
}
void train_one_step (
const std::vector<input_type>& data
)
{
net.to_tensor(data.begin(), data.end(), job.t);
job_pipe.enqueue(job);
}
const net_type& train (
const std::vector<input_type>& data,
......@@ -114,57 +159,22 @@ namespace dlib
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
resizable_tensor t1, t2;
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
{
running_stats<double> rs;
size_t j = 0;
// Load two tensors worth of data at once so we can overlap the computation
// and data transfer between the host and the device.
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
size_t i = 0;
using namespace std::chrono;
auto last_time = system_clock::now();
while (i < data.size())
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
{
rs.add(net.update(t1, labels.begin()+i, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
job.t);
job.labels.assign(labels.begin()+i,
labels.begin()+std::min(i+mini_batch_size,data.size()));
job_pipe.enqueue(job);
if (i < data.size())
{
rs.add(net.update(t2, labels.begin()+i, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
}
if (verbose)
{
......@@ -174,7 +184,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
}
......@@ -186,12 +196,12 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
}
}
return net;
return get_net();
}
const net_type& train (
......@@ -204,55 +214,20 @@ namespace dlib
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
resizable_tensor t1, t2;
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
{
running_stats<double> rs;
size_t j = 0;
// Load two tensors worth of data at once so we can overlap the computation
// and data transfer between the host and the device.
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
size_t i = 0;
using namespace std::chrono;
auto last_time = system_clock::now();
while (i < data.size())
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
{
rs.add(net.update(t1, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t1);
j += mini_batch_size;
}
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
job.t);
job_pipe.enqueue(job);
if (i < data.size())
{
rs.add(net.update(t2, solvers));
i += mini_batch_size;
if (j < data.size())
{
net.to_tensor(data.begin()+j,
data.begin()+std::min(j+mini_batch_size,data.size()), t2);
j += mini_batch_size;
}
}
if (verbose)
{
......@@ -262,7 +237,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
}
......@@ -274,18 +249,20 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(rs.mean()),string_pad) << " ";
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
}
}
return net;
return get_net();
}
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 1;
serialize(version, out);
serialize(item.rs, out);
serialize(item.num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
......@@ -295,10 +272,12 @@ namespace dlib
friend void deserialize(dnn_trainer& item, std::istream& in)
{
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
deserialize(item.rs, in);
deserialize(item.num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
......@@ -306,7 +285,58 @@ namespace dlib
deserialize(item.solvers, in);
}
double get_average_loss (
) const
{
wait_for_thread_to_pause();
return rs.mean();
}
void clear_average_loss (
)
{
wait_for_thread_to_pause();
rs.clear();
}
private:
struct job_t
{
std::vector<label_type> labels;
resizable_tensor t;
};
template <typename T>
void run_update(job_t& next_job, const T&)
{
rs.add(net.update(next_job.t, next_job.labels.begin(), solvers));
}
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_which_run_update;
rs.add(net.update(next_job.t, solvers));
}
void thread()
{
// Make sure this thread uses the same cuda device as the thread that created
// the dnn_trainer object.
dlib::cuda::set_device(cuda_device_id);
label_type pick_which_run_update;
job_t next_job;
while(job_pipe.dequeue(next_job))
{
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
run_update(next_job, pick_which_run_update);
}
}
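// Note: job_pipe was constructed with a maximum size of 0, so the worker
// thread can only be blocked in dequeue() once it has finished every job
// handed to it so far.  Waiting for one blocked dequeue therefore means the
// thread is idle and it is safe to touch net, solvers, and rs.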
void wait_for_thread_to_pause() const
{
job_pipe.wait_for_num_blocked_dequeues(1);
}
const static long string_pad = 10;
......@@ -315,11 +345,20 @@ namespace dlib
num_epochs = 300;
mini_batch_size = 11;
verbose = false;
cuda_device_id = dlib::cuda::get_device();
start();
}
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
dlib::pipe<job_t> job_pipe;
running_stats<double> rs;
unsigned long num_epochs;
size_t mini_batch_size;
bool verbose;
int cuda_device_id;
net_type net;
sstack<solver_type,net_type::num_layers> solvers;
......
......@@ -192,6 +192,8 @@ namespace dlib
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
const net_type& train (
......@@ -218,6 +220,67 @@ namespace dlib
and having it execute 10 epochs of training during each call. This also
means you can serialize a trainer to disk and then, at a later date,
deserialize it and resume training your network where you left off.
- You can obtain the average loss value during the final training epoch by
calling get_average_loss().
!*/
void train_one_step (
const std::vector<input_type>& data,
const std::vector<label_type>& labels
);
/*!
requires
- data.size() == labels.size()
- net_type uses a supervised loss.
i.e. net_type::label_type != no_label_type.
ensures
- Performs one stochastic gradient update step based on the mini-batch of
data and labels supplied to this function. In particular, calling
train_one_step() in a loop is equivalent to calling the train() method
defined above. However, train_one_step() allows you to stream data from
disk into the training process while train() requires you to first load
all the training data into RAM. Otherwise, these training methods are
equivalent.
- You can observe the current average loss value by calling get_average_loss().
!*/
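// A minimal usage sketch (not part of the library): stream mini-batches from
// disk with a hypothetical reader and feed them to the trainer.  The names
// my_net_type, mini_batch_reader, and read_next() are illustrative only.
//
//     my_net_type net;
//     dnn_trainer<my_net_type> trainer(net);
//     mini_batch_reader reader("train.dat");
//     std::vector<input_type> samples;
//     std::vector<label_type> labels;
//     while (reader.read_next(samples, labels))
//         trainer.train_one_step(samples, labels);
//     std::cout << "final average loss: " << trainer.get_average_loss() << std::endl;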
void train_one_step (
const std::vector<input_type>& data
);
/*!
requires
- net_type uses an unsupervised loss.
i.e. net_type::label_type == no_label_type.
ensures
- Performs one stochastic gradient update step based on the mini-batch of
data supplied to this function. In particular, calling train_one_step()
in a loop is equivalent to calling the train() method defined above.
However, train_one_step() allows you to stream data from disk into the
training process while train() requires you to first load all the
training data into RAM. Otherwise, these training methods are
equivalent.
- You can observe the current average loss value by calling get_average_loss().
!*/
double get_average_loss (
) const;
/*!
ensures
- returns the average loss value observed during previous calls to
train_one_step() or train(). That is, the average output of
net_type::update() during the previous mini-batch updates.
!*/
void clear_average_loss (
);
/*!
ensures
- #get_average_loss() == 0
- get_average_loss() uses a dlib::running_stats object to keep a running
average of the loss values seen during the previous mini-batch updates
applied during training. Calling clear_average_loss() resets the
running_stats object so it forgets about all previous loss values
observed.
!*/
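// Usage sketch (illustrative only): report and reset the running loss every
// so often while streaming data with train_one_step().
//
//     for (unsigned long i = 0; reader.read_next(samples, labels); ++i)
//     {
//         trainer.train_one_step(samples, labels);
//         if (i%100 == 99)
//         {
//             std::cout << "average loss: " << trainer.get_average_loss() << std::endl;
//             trainer.clear_average_loss();
//         }
//     }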
};
......