Commit 66166c67 authored by Davis King's avatar Davis King

Changed the solver interface to take the learning rate and the layer details

object as inputs.  This allows the solvers to exhibit a more complex behavior
that depends on the specific layer.  It also removes the learning rate from the
solver's parameter set and pushes it entirely into the core training code.
This also removes the need for the separate "step size", which was previously
multiplied by the output of the solvers.

Most of the code is still the same, and in the core and trainer the step_size
variables have just been renamed to learning_rate.  The dnn_trainer's relevant
member functions have also been renamed.

The examples have been updated to reflect these API changes.  I also cleaned up
the resnet definition and added better downsampling.
parent 8421f213
...@@ -825,16 +825,16 @@ namespace dlib ...@@ -825,16 +825,16 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size) void update_parameters(sstack<solver_type> solvers, double learning_rate)
{ {
DLIB_CASSERT(solvers.size()>=num_computational_layers,""); DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any. // Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0) if (params_grad.size() != 0)
{ {
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad)); const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(1,details.get_layer_params(), step_size, step); tt::add(details.get_layer_params(), details.get_layer_params(), step);
} }
subnetwork->update_parameters(solvers.pop(), step_size); subnetwork->update_parameters(solvers.pop(), learning_rate);
} }
const tensor& get_parameter_gradient( const tensor& get_parameter_gradient(
...@@ -1175,13 +1175,14 @@ namespace dlib ...@@ -1175,13 +1175,14 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size) void update_parameters(sstack<solver_type> solvers, double learning_rate)
{ {
DLIB_CASSERT(solvers.size()>=num_computational_layers,""); DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any. // Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0) { if (params_grad.size() != 0)
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad)); {
tt::add(1,details.get_layer_params(), step_size, step); const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(details.get_layer_params(), details.get_layer_params(), step);
} }
} }
...@@ -1401,9 +1402,9 @@ namespace dlib ...@@ -1401,9 +1402,9 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size) void update_parameters(sstack<solver_type> solvers, double learning_rate)
{ {
subnetwork.update_parameters(solvers, step_size); subnetwork.update_parameters(solvers, learning_rate);
} }
const tensor& get_parameter_gradient( const tensor& get_parameter_gradient(
...@@ -1687,11 +1688,11 @@ namespace dlib ...@@ -1687,11 +1688,11 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size) void update_parameters(sstack<solver_type> solvers, double learning_rate)
{ {
for (size_t i = 0; i < details.size(); ++i) for (size_t i = 0; i < details.size(); ++i)
details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size); details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate);
subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size); subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate);
} }
const subnet_type& subnet() const { return subnetwork; } const subnet_type& subnet() const { return subnetwork; }
...@@ -1905,7 +1906,7 @@ namespace dlib ...@@ -1905,7 +1906,7 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/) void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/)
{ {
// nothing to do // nothing to do
} }
...@@ -2248,10 +2249,10 @@ namespace dlib ...@@ -2248,10 +2249,10 @@ namespace dlib
template <typename solver_type> template <typename solver_type>
void update_parameters ( void update_parameters (
sstack<solver_type> solvers, sstack<solver_type> solvers,
double step_size double learning_rate
) )
{ {
subnetwork.update_parameters(solvers, step_size); subnetwork.update_parameters(solvers, learning_rate);
} }
const subnet_type& subnet() const { return subnetwork; } const subnet_type& subnet() const { return subnetwork; }
...@@ -2542,9 +2543,9 @@ namespace dlib ...@@ -2542,9 +2543,9 @@ namespace dlib
} }
template <typename solver_type> template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size) void update_parameters(sstack<solver_type> solvers, double learning_rate)
{ {
subnetwork.update_parameters(solvers, step_size); subnetwork.update_parameters(solvers, learning_rate);
} }
const tensor& get_parameter_gradient( const tensor& get_parameter_gradient(
......
...@@ -506,7 +506,7 @@ namespace dlib ...@@ -506,7 +506,7 @@ namespace dlib
template <typename solver_type> template <typename solver_type>
void update_parameters( void update_parameters(
sstack<solver_type> solvers, sstack<solver_type> solvers,
double step_size double learning_rate
); );
/*! /*!
requires requires
...@@ -517,13 +517,14 @@ namespace dlib ...@@ -517,13 +517,14 @@ namespace dlib
if you want to call update_parameters() on some other neural network if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object. object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers - solvers.size() >= num_computational_layers
- 0 < step_size <= 1 - 0 < learning_rate <= 1
ensures ensures
- Updates all the parameters in the network. In particular, we pass each - Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add solver object. This produces a parameter delta vector which we add to
step_size times that vector to the layer's parameters. the layer's parameters.
- The solvers use the given learning rate.
!*/ !*/
void clean( void clean(
...@@ -944,7 +945,7 @@ namespace dlib ...@@ -944,7 +945,7 @@ namespace dlib
template <typename solver_type> template <typename solver_type>
void update_parameters ( void update_parameters (
sstack<solver_type> solvers, sstack<solver_type> solvers,
double step_size double learning_rate
); );
/*! /*!
requires requires
...@@ -955,13 +956,14 @@ namespace dlib ...@@ -955,13 +956,14 @@ namespace dlib
is, if you want to call update_parameters() on some other neural network is, if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object. object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers - solvers.size() >= num_computational_layers
- 0 < step_size <= 1 - 0 < learning_rate <= 1
ensures ensures
- Updates all the parameters in the network. In particular, we pass each - Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add solver object. This produces a parameter delta vector which we add to
step_size times that vector to the layer's parameters. the layer's parameters.
- The solvers use the given learning rate.
!*/ !*/
// ------------- // -------------
......
...@@ -14,30 +14,34 @@ namespace dlib ...@@ -14,30 +14,34 @@ namespace dlib
public: public:
sgd( sgd(
float learning_rate_ = 0.01, float weight_decay_,
float weight_decay_ = 0.0005, float momentum_
float momentum_ = 0.9
) )
{ {
weight_decay = weight_decay_; weight_decay = weight_decay_;
learning_rate = learning_rate_;
momentum = momentum_; momentum = momentum_;
} }
sgd(
) : sgd(0.0005, 0.9)
{
}
float get_momentum ( float get_momentum (
) const { return momentum; } ) const { return momentum; }
float get_weight_decay ( float get_weight_decay (
) const { return weight_decay; } ) const { return weight_decay; }
float get_learning_rate ( template <typename layer_type>
) const { return learning_rate; }
const tensor& operator() ( const tensor& operator() (
const tensor& params, const float learning_rate,
const layer_type& l,
const tensor& params_grad const tensor& params_grad
) )
{ {
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,""); DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0) if (v.size() == 0)
{ {
...@@ -54,10 +58,9 @@ namespace dlib ...@@ -54,10 +58,9 @@ namespace dlib
friend void serialize(const sgd& item, std::ostream& out) friend void serialize(const sgd& item, std::ostream& out)
{ {
serialize("sgd", out); serialize("sgd2", out);
serialize(item.v, out); serialize(item.v, out);
serialize(item.weight_decay, out); serialize(item.weight_decay, out);
serialize(item.learning_rate, out);
serialize(item.momentum, out); serialize(item.momentum, out);
} }
...@@ -65,18 +68,16 @@ namespace dlib ...@@ -65,18 +68,16 @@ namespace dlib
{ {
std::string version; std::string version;
deserialize(version, in); deserialize(version, in);
if (version != "sgd") if (version != "sgd2")
throw serialization_error("Unexpected version found while deserializing dlib::sgd."); throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
deserialize(item.v, in); deserialize(item.v, in);
deserialize(item.weight_decay, in); deserialize(item.weight_decay, in);
deserialize(item.learning_rate, in);
deserialize(item.momentum, in); deserialize(item.momentum, in);
} }
private: private:
resizable_tensor v; resizable_tensor v;
float weight_decay; float weight_decay;
float learning_rate;
float momentum; float momentum;
}; };
...@@ -87,19 +88,21 @@ namespace dlib ...@@ -87,19 +88,21 @@ namespace dlib
public: public:
adam( adam(
float learning_rate_ = 0.001, float weight_decay_,
float weight_decay_ = 0.0005, float momentum1_,
float momentum1_ = 0.9, float momentum2_
float momentum2_ = 0.999
) )
{ {
weight_decay = weight_decay_; weight_decay = weight_decay_;
learning_rate = learning_rate_;
momentum1 = momentum1_; momentum1 = momentum1_;
momentum2 = momentum2_; momentum2 = momentum2_;
t = 0; t = 0;
} }
adam(
) : adam(0.0005, 0.9, 0.999)
{}
float get_momentum1 ( float get_momentum1 (
) const { return momentum1; } ) const { return momentum1; }
...@@ -109,14 +112,14 @@ namespace dlib ...@@ -109,14 +112,14 @@ namespace dlib
float get_weight_decay ( float get_weight_decay (
) const { return weight_decay; } ) const { return weight_decay; }
float get_learning_rate ( template <typename layer_type>
) const { return learning_rate; }
const tensor& operator() ( const tensor& operator() (
const tensor& params, const float learning_rate,
const layer_type& l,
const tensor& params_grad const tensor& params_grad
) )
{ {
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,""); DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0) if (v.size() == 0)
{ {
...@@ -136,12 +139,11 @@ namespace dlib ...@@ -136,12 +139,11 @@ namespace dlib
friend void serialize(const adam& item, std::ostream& out) friend void serialize(const adam& item, std::ostream& out)
{ {
serialize("adam", out); serialize("adam2", out);
serialize(item.m, out); serialize(item.m, out);
serialize(item.v, out); serialize(item.v, out);
serialize(item.s, out); serialize(item.s, out);
serialize(item.weight_decay, out); serialize(item.weight_decay, out);
serialize(item.learning_rate, out);
serialize(item.momentum1, out); serialize(item.momentum1, out);
serialize(item.momentum2, out); serialize(item.momentum2, out);
serialize(item.t, out); serialize(item.t, out);
...@@ -151,13 +153,12 @@ namespace dlib ...@@ -151,13 +153,12 @@ namespace dlib
{ {
std::string version; std::string version;
deserialize(version, in); deserialize(version, in);
if (version != "adam") if (version != "adam2")
throw serialization_error("Unexpected version found while deserializing dlib::adam."); throw serialization_error("Unexpected version found while deserializing dlib::adam.");
deserialize(item.m, in); deserialize(item.m, in);
deserialize(item.v, in); deserialize(item.v, in);
deserialize(item.s, in); deserialize(item.s, in);
deserialize(item.weight_decay, in); deserialize(item.weight_decay, in);
deserialize(item.learning_rate, in);
deserialize(item.momentum1, in); deserialize(item.momentum1, in);
deserialize(item.momentum2, in); deserialize(item.momentum2, in);
deserialize(item.t, in); deserialize(item.t, in);
...@@ -168,7 +169,6 @@ namespace dlib ...@@ -168,7 +169,6 @@ namespace dlib
resizable_tensor v; resizable_tensor v;
resizable_tensor s; resizable_tensor s;
float weight_decay; float weight_decay;
float learning_rate;
float momentum1; float momentum1;
float momentum2; float momentum2;
float t; float t;
......
...@@ -33,22 +33,28 @@ namespace dlib ...@@ -33,22 +33,28 @@ namespace dlib
EXAMPLE_SOLVER( EXAMPLE_SOLVER(
); );
template <typename layer_type>
const tensor& operator() ( const tensor& operator() (
const tensor& params, const float learning_rate,
const layer_type& l,
const tensor& params_grad const tensor& params_grad
) )
/*! /*!
requires requires
- params.size() != 0 - l.get_layer_params().size() != 0
- have_same_dimensions(params, params_grad) == true. - have_same_dimensions(l.get_layer_params(), params_grad) == true.
- When this function is invoked on a particular solver instance, it is - When this function is invoked on a particular solver instance, it is
always supplied with parameters from the same layer instance. That is, always supplied with the same layer instance, l. That is, the solver is
the solver is allowed to remember things from one invocation to another allowed to remember things from one invocation to another and to assume
and to assume that it is being serially applied to optimize the same that it is being serially applied to optimize the same layer's
parameters. parameters.
ensures ensures
- Returns a step vector V that is intended to be used to update the - Returns a step vector V that is intended to be used to update the
parameters by adding V to params. parameters by adding V to l.get_layer_params().
- This function will use the given "learning rate" to compute V. How the
learning rate is used is solver dependent. But in general the learning
rate should be used to select the step size, i.e. to somehow determine
the magnitude of V.
!*/ !*/
}; };
...@@ -68,32 +74,34 @@ namespace dlib ...@@ -68,32 +74,34 @@ namespace dlib
WHAT THIS OBJECT REPRESENTS WHAT THIS OBJECT REPRESENTS
This object implements the EXAMPLE_SOLVER interface defined above. It is a This object implements the EXAMPLE_SOLVER interface defined above. It is a
basic stochastic gradient descent solver which uses momentum and weight basic stochastic gradient descent solver which uses momentum and weight
decay. In particular, it performs the following update each time the decay. In particular, it computes the update vector V according to:
solver is invoked: V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
v = momentum*v - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad; Here V is a momentum term that is remembered by the solver from one
l.get_layer_params() += v; invocation of operator() to the next.
Here v is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
!*/ !*/
public: public:
sgd( sgd(
float learning_rate = 0.01, );
float weight_decay = 0.0005, /*!
float momentum = 0.9 ensures
- #get_weight_decay() == 0.0005
- #get_momentum() == 0.9
!*/
sgd(
float weight_decay,
float momentum
); );
/*! /*!
requires requires
- learning_rate > 0
- weight_decay >= 0 - weight_decay >= 0
- momentum >= 0 - momentum >= 0
ensures ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay - #get_weight_decay() == weight_decay
- #get_momentum() == momentum - #get_momentum() == momentum
!*/ !*/
float get_learning_rate () const;
float get_weight_decay () const; float get_weight_decay () const;
float get_momentum () const; float get_momentum () const;
}; };
...@@ -120,25 +128,30 @@ namespace dlib ...@@ -120,25 +128,30 @@ namespace dlib
public: public:
adam( adam(
float learning_rate = 0.001, );
float weight_decay = 0.0005, /*!
float momentum1 = 0.9, ensures
float momentum2 = 0.999 - #get_weight_decay() == 0.0005
- #get_momentum1() == 0.9
- #get_momentum2() == 0.999
!*/
adam(
float weight_decay,
float momentum1,
float momentum2
); );
/*! /*!
requires requires
- learning_rate > 0
- weight_decay >= 0 - weight_decay >= 0
- 0 <= momentum1 < 1 - 0 <= momentum1 < 1
- 0 <= momentum2 < 1 - 0 <= momentum2 < 1
ensures ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay - #get_weight_decay() == weight_decay
- #get_momentum1() == momentum1 - #get_momentum1() == momentum1
- #get_momentum2() == momentum2 - #get_momentum2() == momentum2
!*/ !*/
float get_learning_rate () const;
float get_weight_decay () const; float get_weight_decay () const;
float get_momentum1 () const; float get_momentum1 () const;
float get_momentum2 () const; float get_momentum2 () const;
......
This diff is collapsed.
...@@ -68,10 +68,10 @@ namespace dlib ...@@ -68,10 +68,10 @@ namespace dlib
provided solver instance. provided solver instance.
- #get_max_num_epochs() == 10000 - #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128 - #get_mini_batch_size() == 128
- #get_step_size() == 1 - #get_learning_rate() == 1e-2
- #get_min_step_size() == 1e-3 - #get_min_learning_rate() == 1e-5
- #get_iterations_without_progress_threshold() == 2000 - #get_iterations_without_progress_threshold() == 2000
- #get_step_size_shrink() == 0.1 - #get_learning_rate_shrink() == 0.1
- if (cuda_extra_devices.size() > 0) then - if (cuda_extra_devices.size() > 0) then
- This object will use multiple graphics cards to run the learning - This object will use multiple graphics cards to run the learning
algorithms. In particular, it will always use whatever device is algorithms. In particular, it will always use whatever device is
...@@ -102,6 +102,8 @@ namespace dlib ...@@ -102,6 +102,8 @@ namespace dlib
get_net(). In particular, the first layer's solver is get_net(). In particular, the first layer's solver is
get_solvers()[0], the second layer's solver is get_solvers()[0], the second layer's solver is
get_solvers()[1], and so on. get_solvers()[1], and so on.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/ !*/
unsigned long get_mini_batch_size ( unsigned long get_mini_batch_size (
...@@ -142,54 +144,51 @@ namespace dlib ...@@ -142,54 +144,51 @@ namespace dlib
- #get_max_num_epochs() == num - #get_max_num_epochs() == num
!*/ !*/
void set_step_size ( void set_learning_rate (
double ss double lr
); );
/*! /*!
requires requires
- ss > 0 - lr > 0
ensures ensures
- #get_step_size() == ss - #get_learning_rate() == lr
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/ !*/
double get_step_size( double get_learning_rate(
) const; ) const;
/*! /*!
ensures ensures
- During each training step, a solver tells us how to modify the parameters - During each training step, a solver tells us how to modify the parameters
of each layer in the network. It does this by outputting a step vector, of each layer in the network. It does this by outputting a step vector
that when added to the parameters, will hopefully result in improved that, when added to the parameters, will hopefully result in improved
network performance. In our case, at each step, we multiply the step network performance. The learning rate is one of the inputs to the
vector from the solver by get_step_size() before adding it to the solver and influences the size of this step vector.
parameters. Therefore, get_step_size() controls the "learning rate" used
during training.
It should be emphasized that this learning rate applied by dnn_trainer is
independent from any learning rate scheduling a solver might itself apply
to the step vector it outputs. That is, the dnn_trainer doesn't know
what the solver is doing. It just takes the output from a solver and
multiplies it by get_step_size() before applying the step vector.
!*/ !*/
void set_min_step_size ( void set_min_learning_rate (
double ss double lr
); );
/*! /*!
requires requires
- ss > 0 - lr > 0
ensures ensures
- #get_min_step_size() == ss - #get_min_learning_rate() == lr
!*/ !*/
double get_min_step_size ( double get_min_learning_rate (
) const; ) const;
/*! /*!
ensures ensures
- During training, this object will test if progress is still being made - During training via this->train(), this object will test if progress is
and if it isn't then it will reduce get_step_size() by setting it to still being made and if it isn't then it will reduce get_learning_rate()
get_step_size()*get_step_size_shrink(). However, it will not reduce it by setting it to get_learning_rate()*get_learning_rate_shrink().
below get_min_step_size(). Once this minimum step size is crossed the However, it will not reduce it below get_min_learning_rate(). Once this
training will terminate. minimum learning rate is crossed the training will terminate.
- get_min_learning_rate() doesn't apply if you are using train_one_step().
You can keep calling train_one_step() as many times as you want and the
learning rate will drop infinitely close to 0 if you run long enough.
!*/ !*/
void set_iterations_without_progress_threshold ( void set_iterations_without_progress_threshold (
...@@ -209,33 +208,33 @@ namespace dlib ...@@ -209,33 +208,33 @@ namespace dlib
get_iterations_without_progress_threshold() mini-batch results and get_iterations_without_progress_threshold() mini-batch results and
applying the statistical test defined by the running_gradient object to applying the statistical test defined by the running_gradient object to
see if the training error is getting smaller. If it isn't being reduced see if the training error is getting smaller. If it isn't being reduced
then get_step_size() is made smaller by a factor of get_step_size_shrink(). then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink().
Therefore, get_iterations_without_progress_threshold() should always be Therefore, get_iterations_without_progress_threshold() should always be
set to something sensibly large so that this test can be done with set to something sensibly large so that this test can be done with
reasonably high confidence. Think of this test as saying "if the loss reasonably high confidence. Think of this test as saying "if the loss
hasn't decreased for the previous get_iterations_without_progress_threshold() hasn't decreased for the previous get_iterations_without_progress_threshold()
then shrink the step size". then shrink the learning rate".
!*/ !*/
void set_step_size_shrink_amount ( void set_learning_rate_shrink_amount (
double shrink double shrink
); );
/*! /*!
requires requires
- 0 < shrink && shrink <= 1 - 0 < shrink && shrink <= 1
ensures ensures
- #get_step_size_shrink() == shrink - #get_learning_rate_shrink() == shrink
!*/ !*/
double get_step_size_shrink ( double get_learning_rate_shrink (
) const; ) const;
/*! /*!
ensures ensures
- Whenever the training routine thinks it isn't making progress anymore it - Whenever the training routine thinks it isn't making progress anymore it
will reduce get_step_size() by multiplying it by get_step_size_shrink(). will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink().
- You can disable the automatic step size reduction by setting - You can disable the automatic learning rate reduction by setting
get_step_size_shrink() to 1. get_learning_rate_shrink() to 1.
!*/ !*/
void be_verbose ( void be_verbose (
...@@ -283,8 +282,8 @@ namespace dlib ...@@ -283,8 +282,8 @@ namespace dlib
- Trains a supervised neural network based on the given training data. - Trains a supervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end(), labels.begin()). get_net().compute_loss(data.begin(), data.end(), labels.begin()).
- The optimizer will run until get_step_size() < get_min_step_size() or - The optimizer will run until get_learning_rate() < get_min_learning_rate()
get_max_num_epochs() training epochs have been executed. or get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver - Each layer in the network will be optimized by its corresponding solver
in get_solvers(). in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or - Each call to train DOES NOT reinitialize the state of get_net() or
...@@ -309,8 +308,8 @@ namespace dlib ...@@ -309,8 +308,8 @@ namespace dlib
- Trains an unsupervised neural network based on the given training data. - Trains an unsupervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end()). get_net().compute_loss(data.begin(), data.end()).
- The optimizer will run until get_step_size() < get_min_step_size() or - The optimizer will run until get_learning_rate() < get_min_learning_rate()
get_max_num_epochs() training epochs have been executed. or get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver - Each layer in the network will be optimized by its corresponding solver
in get_solvers(). in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or - Each call to train DOES NOT reinitialize the state of get_net() or
...@@ -381,6 +380,8 @@ namespace dlib ...@@ -381,6 +380,8 @@ namespace dlib
- Note that, if be_verbose() has been called, then this object will - Note that, if be_verbose() has been called, then this object will
automatically call clear_average_loss() periodically when it logs the automatically call clear_average_loss() periodically when it logs the
loss to the console. loss to the console.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/ !*/
void clear_average_loss ( void clear_average_loss (
...@@ -393,6 +394,8 @@ namespace dlib ...@@ -393,6 +394,8 @@ namespace dlib
applied during training. Calling clear_average_loss() resets the applied during training. Calling clear_average_loss() resets the
running_stats object so it forgets about all previous loss values running_stats object so it forgets about all previous loss values
observed. observed.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/ !*/
}; };
......
...@@ -33,23 +33,27 @@ using namespace dlib; ...@@ -33,23 +33,27 @@ using namespace dlib;
// It exists solely so other layers can refer to it. In this case, the // It exists solely so other layers can refer to it. In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and // add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer. This combination allows us to // add it to the input of the add_prev1 layer. This combination allows us to
// implement skip and residual style networks. // implement skip and residual style networks. We have also made base_res
template <int stride, typename SUBNET> // parameterized by BN, which will let us insert different batch normalization
using base_res = relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>; // layers.
template <template <typename> class BN, typename SUBNET>
// Let's also define the same block but with all the batch normalization layers using base_res = relu<add_prev1<BN<con<8,3,3,1,1,relu<BN<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
// replaced with affine transform layers. We will use this type of construction
// when testing our networks. // We also want a residual block that begins by doing downsampling. We can
template <int stride, typename SUBNET> // reuse base_res to define it like this:
using base_ares = relu<add_prev1<affine<con<8,3,3,1,1,relu<affine<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>; template <template <typename> class BN, typename SUBNET>
using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// And of course we can define more alias templates based on previously defined
// alias templates. The _down versions downsample the inputs by a factor of 2 // Now we can define 4 different residual blocks we will use in this example.
// while the res and ares layer types don't. // The first two are non-downsampling residual blocks while the last two
template <typename SUBNET> using res = base_res<1,SUBNET>; // downsample. Also, res and res_down use batch normalization while ares and
template <typename SUBNET> using res_down = base_res<2,SUBNET>; // ares_down have had the batch normalization replaced with simple affine
template <typename SUBNET> using ares = base_ares<1,SUBNET>; // layers. We will use the affine version of the layers when testing our
template <typename SUBNET> using ares_down = base_ares<2,SUBNET>; // networks.
template <typename SUBNET> using res = base_res<bn_con,SUBNET>;
template <typename SUBNET> using ares = base_res<affine,SUBNET>;
template <typename SUBNET> using res_down = base_res_down<bn_con,SUBNET>;
template <typename SUBNET> using ares_down = base_res_down<affine,SUBNET>;
...@@ -141,37 +145,39 @@ int main(int argc, char** argv) try ...@@ -141,37 +145,39 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's // These print statements will output this (I've truncated it since it's
// long, but you get the idea): // long, but you get the idea):
/* /*
The pnet has 125 layers in it. The pnet has 127 layers in it.
layer<0> loss_multiclass_log layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10) layer<1> fc (num_outputs=10)
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0) layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2) layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev layer<4> add_prev
layer<5> bn_con layer<5> bn_con
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1) layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<7> prelu (initial_param_value=0.25) layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con layer<8> bn_con
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1) layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<10> tag1 layer<10> tag1
... ...
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2) layer<33> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<34> tag1 layer<34> tag1
layer<35> tag4 layer<35> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<36> prelu (initial_param_value=0.3) layer<36> tag4
layer<37> add_prev layer<37> prelu (initial_param_value=0.3)
layer<38> bn_con layer<38> add_prev
layer<39> bn_con
... ...
layer<114> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2) layer<115> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<115> tag1 layer<116> tag1
layer<116> relu layer<117> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<117> add_prev layer<118> relu
layer<118> bn_con layer<119> add_prev
layer<119> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1) layer<120> bn_con
layer<120> relu layer<121> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<121> bn_con layer<122> relu
layer<122> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1) layer<123> bn_con
layer<123> tag1 layer<124> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<124> input<matrix> layer<125> tag1
layer<126> input<matrix>
*/ */
// Now that we know the index numbers for each layer, we can access them // Now that we know the index numbers for each layer, we can access them
...@@ -189,7 +195,7 @@ int main(int argc, char** argv) try ...@@ -189,7 +195,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also // parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately // index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say: // after tag4 you can say:
layer<tag4,1>(pnet); // Equivalent to layer<35+1>(pnet). layer<tag4,1>(pnet); // Equivalent to layer<36+1>(pnet).
// Or to access the layer 2 layers after tag4: // Or to access the layer 2 layers after tag4:
layer<tag4,2>(pnet); layer<tag4,2>(pnet);
...@@ -203,23 +209,26 @@ int main(int argc, char** argv) try ...@@ -203,23 +209,26 @@ int main(int argc, char** argv) try
// talk about training networks! // talk about training networks!
// The dnn_trainer will use SGD by default, but you can tell it to use // The dnn_trainer will use SGD by default, but you can tell it to use
// different solvers like adam. // different solvers like adam with a weight decay of 0.0005 and the given
dnn_trainer<net_type,adam> trainer(net,adam(0.001)); // momentum parameters.
dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
// Also, if you have multiple graphics cards you can tell the trainer to use // Also, if you have multiple graphics cards you can tell the trainer to use
// them together to make the training faster. For example, replacing the // them together to make the training faster. For example, replacing the
// above constructor call with this one would cause it to use GPU cards 0 // above constructor call with this one would cause it to use GPU cards 0
// and 1. // and 1.
//dnn_trainer<net_type,adam> trainer(net,adam(0.001), {0,1}); //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});
trainer.be_verbose(); trainer.be_verbose();
trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100)); trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
// While the trainer is running it keeps an eye on the training error. If // While the trainer is running it keeps an eye on the training error. If
// it looks like the error hasn't decreased for the last 2000 iterations it // it looks like the error hasn't decreased for the last 2000 iterations it
// will automatically reduce the step size by 0.1. You can change these // will automatically reduce the learning rate by 0.1. You can change these
// default parameters to some other values by calling these functions. Or // default parameters to some other values by calling these functions. Or
// disable them entirely by setting the shrink amount to 1. // disable the automatic shrinking entirely by setting the shrink amount to 1.
trainer.set_iterations_without_progress_threshold(2000); trainer.set_iterations_without_progress_threshold(2000);
trainer.set_step_size_shrink_amount(0.1); trainer.set_learning_rate_shrink_amount(0.1);
// The learning rate will start at 1e-3.
trainer.set_learning_rate(1e-3);
// Now, what if your training dataset is so big it doesn't fit in RAM? You // Now, what if your training dataset is so big it doesn't fit in RAM? You
...@@ -230,10 +239,10 @@ int main(int argc, char** argv) try ...@@ -230,10 +239,10 @@ int main(int argc, char** argv) try
std::vector<matrix<unsigned char>> mini_batch_samples; std::vector<matrix<unsigned char>> mini_batch_samples;
std::vector<unsigned long> mini_batch_labels; std::vector<unsigned long> mini_batch_labels;
dlib::rand rnd(time(0)); dlib::rand rnd(time(0));
// Loop until the trainer's automatic shrinking has shrunk the step size by // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
// 1e-3. For the default shrinks amount of 0.1 this means stop after it // Given our settings, this means it will stop training after it has shrunk the
// shrinks it 3 times. // learning rate 3 times.
while(trainer.get_step_size() >= 1e-3) while(trainer.get_learning_rate() >= 1e-6)
{ {
mini_batch_samples.clear(); mini_batch_samples.clear();
mini_batch_labels.clear(); mini_batch_labels.clear();
......
...@@ -89,7 +89,9 @@ int main(int argc, char** argv) try ...@@ -89,7 +89,9 @@ int main(int argc, char** argv) try
net_type net; net_type net;
// And then train it using the MNIST data. The code below uses mini-batch stochastic // And then train it using the MNIST data. The code below uses mini-batch stochastic
// gradient descent with an initial learning rate of 0.01 to accomplish this. // gradient descent with an initial learning rate of 0.01 to accomplish this.
dnn_trainer<net_type> trainer(net,sgd(0.01)); dnn_trainer<net_type> trainer(net);
trainer.set_learning_rate(0.01);
trainer.set_min_learning_rate(0.00001);
trainer.set_mini_batch_size(128); trainer.set_mini_batch_size(128);
trainer.be_verbose(); trainer.be_verbose();
// Since DNN training can take a long time, we can ask the trainer to save its state to // Since DNN training can take a long time, we can ask the trainer to save its state to
...@@ -97,11 +99,11 @@ int main(int argc, char** argv) try ...@@ -97,11 +99,11 @@ int main(int argc, char** argv) try
// start it again it will begin where it left off rather than restarting the training // start it again it will begin where it left off rather than restarting the training
// from scratch. // from scratch.
trainer.set_synchronization_file("mnist_sync", std::chrono::seconds(20)); trainer.set_synchronization_file("mnist_sync", std::chrono::seconds(20));
// Finally, this line begins training. By default, it runs SGD with our specified step // Finally, this line begins training. By default, it runs SGD with our specified
// size until the loss stops decreasing. Then it reduces the step size by a factor of // learning rate until the loss stops decreasing. Then it reduces the learning rate by
// 10 and continues running until the loss stops decreasing again. It will reduce the // a factor of 10 and continues running until the loss stops decreasing again. It will
// step size 3 times and then terminate. For a longer discussion, see the documentation // keep doing this until the learning rate has dropped below the min learning rate
// of the dnn_trainer object. // defined above or the maximum number of epochs as been executed (defaulted to 10000).
trainer.train(training_images, training_labels); trainer.train(training_images, training_labels);
// At this point our net object should have learned how to classify MNIST images. But // At this point our net object should have learned how to classify MNIST images. But
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment