Commit 9f92b082 authored by Davis King

Now training will automatically reduce the learning rate when it is clear that
the loss isn't being reduced.  Also, there is a stopping condition now based on
how large the current learning rate is.  That is, training stops when the
learning rate gets small enough and it is clear that no progress is being made.
parent 6f63bc62
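
In outline, the new behavior works like the sketch below.  This is only an
illustration of the control flow; loss_has_plateaued() stands in for the
statistical test the trainer actually performs with dlib's running_gradient
object, and the numbers are the new defaults introduced by this commit.

    void run_one_mini_batch();    // placeholder: forward/backward pass + solver updates
    bool loss_has_plateaued();    // placeholder: the running_gradient test

    void training_loop_sketch()
    {
        double step_size = 1;              // multiplier on each solver's step vector
        const double min_step_size = 1e-4;
        const double shrink = 0.1;

        while (step_size >= min_step_size)
        {
            run_one_mini_batch();
            if (loss_has_plateaued())      // loss no longer clearly decreasing?
                step_size *= shrink;       // reduce the learning rate
        }
        // Training stops once step_size falls below min_step_size.
    }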
dlib/dnn/core.h
@@ -15,6 +15,8 @@
 #include <utility>
 #include <tuple>
 #include <cmath>
+#include "tensor_tools.h"
+
 namespace dlib
@@ -719,23 +721,27 @@ namespace dlib
         ) const { return subnetwork->get_final_data_gradient(); }

         template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
         {
-            update(x,private_get_gradient_input(),solvers);
+            update(x,private_get_gradient_input(),solvers,step_size);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
         {
             DLIB_CASSERT(solvers.size()>=num_layers,"");
             dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
             params_grad.copy_size(details.get_layer_params());
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
-                solvers.top()(details, static_cast<const tensor&>(params_grad));
-            subnetwork->update(x, solvers.pop());
+            {
+                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
+                tt::add(1,details.get_layer_params(), step_size, step);
+            }
+            subnetwork->update(x, solvers.pop(), step_size);
             gradient_input_is_stale = true;
         }
@@ -1016,13 +1022,13 @@ namespace dlib
         ) const { return grad_final; }

         template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
         {
-            update(x,private_get_gradient_input(),solvers);
+            return update(x,private_get_gradient_input(),solvers, step_size);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
         {
             DLIB_CASSERT(solvers.size()>=num_layers,"");
             // make sure grad_final is initialized to 0
@@ -1034,9 +1040,13 @@ namespace dlib
             params_grad.copy_size(details.get_layer_params());
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
-                solvers.top()(details, static_cast<const tensor&>(params_grad));
+            {
+                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
+                tt::add(1,details.get_layer_params(), step_size, step);
+            }
             gradient_input_is_stale = true;
         }
@@ -1225,15 +1235,15 @@ namespace dlib
         ) const { return subnetwork.get_final_data_gradient(); }

         template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
         {
-            subnetwork.update(x,solvers);
+            subnetwork.update(x,solvers, step_size);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
         {
-            subnetwork.update(x,gradient_input,solvers);
+            subnetwork.update(x,gradient_input,solvers, step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -1462,31 +1472,31 @@ namespace dlib
         }

         template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
         {
-            update(x,private_get_gradient_input(),solvers);
+            update(x,private_get_gradient_input(),solvers,step_size);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
         {
             const auto cnt = (LAYER<SUBNET>::num_layers-SUBNET::num_layers);
             if (details.size() > 1)
             {
-                details[0].update(details[1].get_output(), gradient_input, solvers);
+                details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
                 for (size_t i = 1; i < details.size(); ++i)
                 {
                     if (i+1 < details.size())
-                        details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
+                        details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
                     else
-                        details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
+                        details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
                 }
             }
             else
             {
-                details[0].update(subnetwork.get_output(), gradient_input, solvers);
+                details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
             }
-            subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()));
+            subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -1672,13 +1682,22 @@ namespace dlib
         }

         template <typename solver_type>
-        void update(const tensor& /*x*/, sstack<solver_type> /*solvers*/)
+        void update(
+            const tensor& /*x*/,
+            sstack<solver_type> /*solvers*/,
+            double /*step_size*/
+        )
         {
             // nothing to update
         }

         template <typename solver_type>
-        void update(const tensor& /*x*/, const tensor& gradient_input, sstack<solver_type> /*solvers*/)
+        void update(
+            const tensor& /*x*/,
+            const tensor& /*gradient_input*/,
+            sstack<solver_type> /*solvers*/,
+            double /*step_size*/
+        )
         {
             // nothing to update
         }
@@ -1948,13 +1967,14 @@ namespace dlib
         double update (
             const tensor& x,
             label_iterator lbegin,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, lbegin, wsub);
-            subnetwork.update(x, solvers);
+            subnetwork.update(x, solvers, step_size);
             return l;
         }
@@ -1963,23 +1983,25 @@ namespace dlib
             input_iterator ibegin,
             input_iterator iend,
             label_iterator lbegin,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         )
         {
             to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, lbegin, solvers);
+            return update(temp_tensor, lbegin, solvers, step_size);
         }

         template <typename solver_type>
         double update (
             const tensor& x,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, wsub);
-            subnetwork.update(x, solvers);
+            subnetwork.update(x, solvers, step_size);
             return l;
         }
@@ -1987,11 +2009,12 @@ namespace dlib
         double update (
             input_iterator ibegin,
             input_iterator iend,
-            std::vector<solver_type>& solvers
+            sstack<solver_type> solvers,
+            double step_size
         )
         {
             to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, solvers);
+            return update(temp_tensor, solvers, step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
...
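
With the new interface, callers pass the step size explicitly.  A hypothetical
call site might look like the following; net_type and label_type are
placeholders, and solvers holds one solver per layer:

    #include <dlib/dnn.h>
    #include <vector>

    template <typename net_type, typename label_type, typename solver_type>
    double one_update_step(
        net_type& net,
        const dlib::tensor& mini_batch,          // already built via to_tensor()
        const std::vector<label_type>& labels,
        std::vector<solver_type>& solvers,
        double step_size                         // scales each solver's step vector
    )
    {
        // Runs forward/backward, applies the solver steps scaled by step_size,
        // and returns the mini-batch loss.
        return net.update(mini_batch, labels.begin(),
                          dlib::make_sstack(solvers), step_size);
    }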
dlib/dnn/core_abstract.h
@@ -392,7 +392,8 @@ namespace dlib
         template <typename solver_type>
         void update(
             const tensor& x,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -405,9 +406,12 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - Back propagates the error gradient, get_gradient_input(), through this
                   network and uses the provided solvers to update the network parameters.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - All elements of #get_gradient_input() are set to 0.
                 - have_same_dimensions(#get_final_data_gradient(), x) == true
                 - #get_final_data_gradient() contains the gradient of the network with
@@ -418,7 +422,8 @@ namespace dlib
         void update(
             const tensor& x,
             const tensor& gradient_input,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -430,6 +435,7 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - This function is identical to the version of update() defined immediately
                   above except that it back-propagates gradient_input through the network
@@ -439,6 +445,8 @@ namespace dlib
                       update(x,solvers);
                   Except that calling update(x,gradient_input,solvers) avoids the copy
                   and is therefore slightly more efficient.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - All elements of #get_gradient_input() are set to 0.
                 - #get_final_data_gradient() contains the gradient of the network with
                   respect to x.
@@ -755,7 +763,8 @@ namespace dlib
         double update (
             const tensor& x,
             label_iterator lbegin,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -767,6 +776,7 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - runs x through the network, compares the output to the expected output
                   pointed to by lbegin, and updates the network parameters via
@@ -775,6 +785,8 @@ namespace dlib
                 - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
                 - The provided solvers are used to update the parameters in each layer of
                   the network.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - returns compute_loss(x,lbegin)
         !*/
@@ -783,7 +795,8 @@ namespace dlib
             input_iterator ibegin,
             input_iterator iend,
             label_iterator lbegin,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -795,6 +808,7 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - runs [ibegin,iend) through the network, compares the output to the
                   expected output pointed to by lbegin, and updates the network parameters
@@ -803,6 +817,8 @@ namespace dlib
                 - the expected label of *(ibegin+k) is *(lbegin+k).
                 - The provided solvers are used to update the parameters in each layer of
                   the network.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - returns compute_loss(ibegin,iend,lbegin)
         !*/
@@ -811,7 +827,8 @@ namespace dlib
         template <typename solver_type>
         double update (
             const tensor& x,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -822,11 +839,14 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - runs x through the network and updates the network parameters by
                   back-propagating the loss gradient through the network.
                 - The provided solvers are used to update the parameters in each layer of
                   the network.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - returns compute_loss(x)
         !*/
@@ -834,7 +854,8 @@ namespace dlib
         double update (
             input_iterator ibegin,
             input_iterator iend,
-            sstack<solver_type> solvers
+            sstack<solver_type> solvers,
+            double step_size
         );
         /*!
             requires
@@ -845,11 +866,14 @@ namespace dlib
                   is, if you want to call update() on some other neural network object then
                   you must NOT reuse the same solvers object.
                 - solvers.size() >= num_layers
+                - 0 < step_size <= 1
             ensures
                 - runs [ibegin,iend) through the network and updates the network parameters
                   by back-propagating the loss gradient through the network.
                 - The provided solvers are used to update the parameters in each layer of
                   the network.
+                - The parameter delta vector output by the solvers is multiplied by
+                  step_size before being added to the parameters.
                 - returns compute_loss(ibegin,iend)
         !*/
...
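
A sketch of the equivalence the spec describes: passing gradient_input in
explicitly versus staging it through get_gradient_input() first.  The types
here are placeholders:

    #include <dlib/dnn.h>
    #include <vector>

    template <typename net_type, typename solver_type>
    void update_with_explicit_gradient(
        net_type& net,
        const dlib::tensor& x,
        const dlib::tensor& grad,
        std::vector<solver_type>& solvers,
        double step_size
    )
    {
        // Pass the error gradient in directly; this avoids copying it into
        // net.get_gradient_input() first.
        net.update(x, grad, dlib::make_sstack(solvers), step_size);

        // Conceptually equivalent, but with an extra copy: first copy grad
        // into net.get_gradient_input(), then call:
        //   net.update(x, dlib::make_sstack(solvers), step_size);
    }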
dlib/dnn/solvers.h
@@ -33,24 +33,23 @@ namespace dlib
         float get_learning_rate (
         ) const { return learning_rate; }

-        template <typename LAYER_DETAILS>
-        void operator() (
-            LAYER_DETAILS& l,
+        const tensor& operator() (
+            const tensor& params,
             const tensor& params_grad
         )
         {
-            DLIB_CASSERT(l.get_layer_params().size() != 0,"");
+            DLIB_CASSERT(params.size() != 0,"");
             if (v.size() == 0)
             {
                 v.copy_size(params_grad);
                 v = 0;
             }
-            tt::affine_transform(v, v, l.get_layer_params(), params_grad,
+
+            //perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
+            tt::affine_transform(v, v, params, params_grad,
                 momentum, -weight_decay*learning_rate, -learning_rate, 0);
-            // perform l.get_layer_params() += v;
-            tt::affine_transform(l.get_layer_params(), l.get_layer_params(), v, 1, 1, 0);
+
+            return v;
         }

         friend void serialize(const sgd& item, std::ostream& out)
...
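
To make the arithmetic concrete, here is a toy scalar version of the update,
which now happens in two stages: the solver computes the step, and the network
applies it scaled by step_size.  The numbers are made up:

    #include <iostream>

    // Toy scalar version of the two-stage sgd update (the real code operates
    // on tensors via tt::affine_transform and tt::add).
    int main()
    {
        double momentum = 0.9, weight_decay = 0.0005, learning_rate = 0.001;
        double v = 0;            // the solver's per-layer momentum state
        double param = 1.0;      // a single network parameter
        double grad = 0.5;       // its current gradient
        double step_size = 1.0;  // the trainer-controlled multiplier

        // What sgd::operator() computes and returns:
        v = momentum*v - weight_decay*learning_rate*param - learning_rate*grad;
        // What the network then does with the returned step:
        param += step_size*v;

        std::cout << "param after one step = " << param << "\n";  // 0.9994995
    }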
dlib/dnn/solvers_abstract.h
@@ -18,11 +18,11 @@ namespace dlib
     /*!
         WHAT THIS OBJECT REPRESENTS
-            A solver defines the parameter update rule for a single layer in a deep
-            neural network.  It takes a parameter gradient vector and a layer and
-            updates the layer's parameters.  Importantly, each solver instance is used
-            with only one layer in a network.  This allows us to define solvers that
-            have per layer state, for example, a solver may keep a momentum term and
-            apply it to its update rule.
+            A solver defines the parameter update rule for a single layer in a deep
+            neural network.  It takes a parameter gradient vector and the layer's
+            parameters and tells you how the parameters should be updated.
+            Importantly, each solver instance is used with only one layer in a network.
+            This allows us to define solvers that have per layer state, for example, a
+            solver may keep a momentum term and apply it to its update rule.

             Note that there is no dlib::EXAMPLE_SOLVER type.  It is shown here purely
             to document the interface a solver object must implement.
@@ -33,22 +33,22 @@ namespace dlib
         EXAMPLE_SOLVER(
         );

-        template <typename LAYER_DETAILS>
-        void operator() (
-            LAYER_DETAILS& l,
+        const tensor& operator() (
+            const tensor& params,
             const tensor& params_grad
         );
         /*!
             requires
-                - LAYER_DETAILS implements the EXAMPLE_LAYER_ interface defined in
-                  layers_abstract.h.
-                - l.get_layer_params().size() != 0
-                - have_same_dimensions(l.get_layer_params(), params_grad) == true.
+                - params.size() != 0
+                - have_same_dimensions(params, params_grad) == true.
                 - When this function is invoked on a particular solver instance, it is
-                  always supplied with the same LAYER_DETAILS object.
+                  always supplied with parameters from the same layer instance.  That is,
+                  the solver is allowed to remember things from one invocation to another
+                  and to assume that it is being serially applied to optimize the same
+                  parameters.
             ensures
-                - Updates the parameters in l.  That is, l.get_layer_params() is modified
-                  based on the parameter gradient vector stored in params_grad.
+                - Returns a step vector V that is intended to be used to update the
+                  parameters by adding V to params.
         !*/
     };
...
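
For illustration, a minimal solver conforming to the new interface might look
like the following: a hypothetical momentum-free gradient descent, not part of
dlib.  The affine_transform call mirrors the sgd implementation above; a real
solver would also provide serialize()/deserialize() overloads.

    #include <dlib/dnn.h>

    // Hypothetical minimal solver: plain gradient descent with a fixed
    // learning rate.  It keeps a persistent tensor v so operator() can
    // return a reference, as the interface requires.
    class plain_gd
    {
    public:
        explicit plain_gd(float lr = 0.01f) : learning_rate(lr) {}

        const dlib::tensor& operator() (
            const dlib::tensor& params,
            const dlib::tensor& params_grad
        )
        {
            if (v.size() == 0)
            {
                v.copy_size(params_grad);
                v = 0;
            }
            // step vector: v = -learning_rate*params_grad (the old v and the
            // params terms get zero coefficients)
            dlib::tt::affine_transform(v, v, params, params_grad,
                0, 0, -learning_rate, 0);
            return v;
        }

    private:
        float learning_rate;
        dlib::resizable_tensor v;
    };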
dlib/dnn/trainer.h
@@ -7,13 +7,14 @@
 #include "core.h"
 #include "solvers.h"
 #include "../statistics.h"
-#include "../console_progress_indicator.h"
 #include <chrono>
 #include "../serialize.h"
 #include "../pipe.h"
 #include "../threads.h"
 #include "cuda_dlib.h"
+#include "../statistics/running_gradient.h"
+#include <atomic>

 namespace dlib
 {
@@ -95,15 +96,15 @@ namespace dlib
             mini_batch_size = batch_size;
         }

-        unsigned long get_num_epochs (
-        ) const { return num_epochs; }
+        unsigned long get_max_num_epochs (
+        ) const { return max_num_epochs; }

-        void set_num_epochs (
+        void set_max_num_epochs (
             unsigned long num
         )
         {
             DLIB_CASSERT(num > 0,"");
-            num_epochs = num;
+            max_num_epochs = num;
         }

         void be_verbose (
@@ -159,14 +160,14 @@ namespace dlib
         {
             DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");

-            console_progress_indicator pbar(num_epochs);
-            pbar.print_status(0);
-            for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
+            for (unsigned long epoch_iteration = 0;
+                 epoch_iteration < max_num_epochs && step_size >= min_step_size;
+                 ++epoch_iteration)
             {
                 using namespace std::chrono;
                 auto last_time = system_clock::now();
                 clear_average_loss();
-                for (size_t i = 0; i < data.size(); i += mini_batch_size)
+                for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
                 {
                     net.to_tensor(data.begin()+i,
                         data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -183,10 +184,10 @@ namespace dlib
                     {
                         last_time = now_time;
                         auto iter = epoch_iteration + i/(double)data.size();
-                        std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
-                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
-                        pbar.print_status(iter, true);
-                        std::cout << std::endl;
+                        std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
+                                  << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
+                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+                                  << std::endl;
                     }
                 }
             }
@@ -195,10 +196,10 @@ namespace dlib
             {
                 // Capitalize the E in Epoch so it's easy to grep out the lines that
                 // are for full epoch status statements.
-                std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
-                          << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
-                pbar.print_status(epoch_iteration+1, true);
-                std::cout << std::endl;
+                std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
+                          << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
+                          << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+                          << std::endl;
             }
         }
         return get_net();
@@ -214,14 +215,14 @@ namespace dlib
             static_assert(has_unsupervised_loss,
                 "You can only call this version of train() when using an unsupervised loss.");

-            console_progress_indicator pbar(num_epochs);
-            pbar.print_status(0);
-            for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
+            for (unsigned long epoch_iteration = 0;
+                 epoch_iteration < max_num_epochs && step_size >= min_step_size;
+                 ++epoch_iteration)
             {
                 using namespace std::chrono;
                 auto last_time = system_clock::now();
                 clear_average_loss();
-                for (size_t i = 0; i < data.size(); i += mini_batch_size)
+                for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
                 {
                     net.to_tensor(data.begin()+i,
                         data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -236,10 +237,10 @@ namespace dlib
                     {
                         last_time = now_time;
                         auto iter = epoch_iteration + i/(double)data.size();
-                        std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
-                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
-                        pbar.print_status(iter, true);
-                        std::cout << std::endl;
+                        std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
+                                  << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
+                                  << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+                                  << std::endl;
                     }
                 }
             }
@@ -248,10 +249,10 @@ namespace dlib
             {
                 // Capitalize the E in Epoch so it's easy to grep out the lines that
                 // are for full epoch status statements.
-                std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
-                          << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
-                pbar.print_status(epoch_iteration+1, true);
-                std::cout << std::endl;
+                std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
+                          << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
+                          << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+                          << std::endl;
             }
         }
         return get_net();
@@ -260,14 +261,20 @@ namespace dlib
         friend void serialize(const dnn_trainer& item, std::ostream& out)
         {
             item.wait_for_thread_to_pause();
-            int version = 2;
+            int version = 3;
             serialize(version, out);
             serialize(item.rs, out);
-            serialize(item.num_epochs, out);
+            serialize(item.rg, out);
+            serialize(item.max_num_epochs, out);
             serialize(item.mini_batch_size, out);
             serialize(item.verbose, out);
             serialize(item.net, out);
             serialize(item.solvers, out);
+            serialize(item.step_size.load(), out);
+            serialize(item.min_step_size, out);
+            serialize(item.iter_between_step_size_adjust.load(), out);
+            serialize(item.step_size_shrink.load(), out);
         }

         friend void deserialize(dnn_trainer& item, std::istream& in)
@@ -275,14 +282,21 @@ namespace dlib
             item.wait_for_thread_to_pause();
             int version = 0;
             deserialize(version, in);
-            if (version != 2)
+            if (version != 3)
                 throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
+
+            double temp;
             deserialize(item.rs, in);
-            deserialize(item.num_epochs, in);
+            deserialize(item.rg, in);
+            deserialize(item.max_num_epochs, in);
             deserialize(item.mini_batch_size, in);
             deserialize(item.verbose, in);
             deserialize(item.net, in);
             deserialize(item.solvers, in);
+            deserialize(temp, in); item.step_size = temp;
+            deserialize(item.min_step_size, in);
+            deserialize(temp, in); item.iter_between_step_size_adjust = temp;
+            deserialize(temp, in); item.step_size_shrink = temp;
         }

         double get_average_loss (
@@ -299,6 +313,62 @@ namespace dlib
             rs.clear();
         }

+        void set_step_size (
+            double ss
+        )
+        {
+            DLIB_CASSERT(ss > 0,"");
+            wait_for_thread_to_pause();
+            step_size = ss;
+        }
+
+        double get_step_size(
+        ) const
+        {
+            return step_size;
+        }
+
+        void set_min_step_size (
+            double ss
+        )
+        {
+            DLIB_CASSERT(ss > 0,"");
+            min_step_size = ss;
+        }
+
+        double get_min_step_size (
+        ) const
+        {
+            return min_step_size;
+        }
+
+        void set_iterations_between_step_size_adjust (
+            unsigned long min_iter
+        )
+        {
+            iter_between_step_size_adjust = min_iter;
+        }
+
+        unsigned long get_iterations_between_step_size_adjust (
+        ) const
+        {
+            return iter_between_step_size_adjust;
+        }
+
+        void set_step_size_shrink_amount (
+            double shrink
+        )
+        {
+            DLIB_CASSERT(0 < shrink && shrink <= 1,"");
+            step_size_shrink = shrink;
+        }
+
+        double get_step_size_shrink (
+        ) const
+        {
+            return step_size_shrink;
+        }
+
     private:

         struct job_t
         {
@@ -309,16 +379,20 @@ namespace dlib
         template <typename T>
         void run_update(job_t& next_job, const T&)
         {
-            rs.add(net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers)));
+            double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
+            rs.add(loss);
+            rg.add(loss);
         }

         void run_update(job_t& next_job, const no_label_type&)
         {
             no_label_type pick_wich_run_update;
-            rs.add(net.update(next_job.t, make_sstack(solvers)));
+            double loss = net.update(next_job.t, make_sstack(solvers), step_size);
+            rs.add(loss);
+            rg.add(loss);
         }

-        void thread()
+        void thread() try
         {
             // Make sure this thread uses the same cuda device as the thread that created
             // the dnn_trainer object.
@@ -330,8 +404,24 @@ namespace dlib
                 // call net.update() but pick the right version for unsupervised or
                 // supervised training based on the type of label_type.
                 run_update(next_job, pick_wich_run_update);
+
+                // If we have been running for a while then check if the loss is still
+                // dropping.  If it isn't then we will reduce the step size.
+                if (rg.current_n() > iter_between_step_size_adjust)
+                {
+                    if (rg.probability_gradient_greater_than(0) > 0.45)
+                    {
+                        step_size = step_size_shrink*step_size;
+                    }
+                    rg.clear();
+                }
             }
         }
+        catch(std::exception& e)
+        {
+            std::cerr << e.what() << std::endl;
+            throw;
+        }

         void wait_for_thread_to_pause() const
         {
@@ -339,29 +429,40 @@ namespace dlib
         }

         const static long string_pad = 10;
+        const static long epoch_string_pad = 4;
+        const static long ss_string_pad = 4;

         void init()
         {
-            num_epochs = 300;
-            mini_batch_size = 32;
+            max_num_epochs = 10000;
+            mini_batch_size = 128;
             verbose = false;
             cuda_device_id = dlib::cuda::get_device();
+            step_size = 1;
+            min_step_size = 1e-4;
+            iter_between_step_size_adjust = 2000;
+            step_size_shrink = 0.1;
             start();
         }

-        // The job object is not logically part of the state of this object.  It is here
-        // only to avoid reallocating it over and over.
-        job_t job;
-
         dlib::pipe<job_t> job_pipe;
         running_stats<double> rs;
-        unsigned long num_epochs;
+        running_gradient rg;
+        unsigned long max_num_epochs;
         size_t mini_batch_size;
         bool verbose;
         int cuda_device_id;
         net_type net;
         std::vector<solver_type> solvers;
+        std::atomic<double> step_size;
+        double min_step_size;
+        std::atomic<long> iter_between_step_size_adjust;
+        std::atomic<double> step_size_shrink;
+
+        // The job object is not logically part of the state of this object.  It is here
+        // only to avoid reallocating it over and over.
+        job_t job;
     };

// ----------------------------------------------------------------------------------------
...
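
The shrink decision above relies on dlib's running_gradient object, which fits
a line to the recent loss values and reports the probability that the slope is
positive.  A standalone sketch of the same test, with made-up loss values:

    #include <dlib/statistics/running_gradient.h>
    #include <iostream>

    int main()
    {
        dlib::running_gradient rg;
        // Feed in a synthetic, essentially flat loss sequence: noise around
        // 0.5 with no downward trend.
        for (int i = 0; i < 3000; ++i)
            rg.add(0.5 + 0.001*((i%7)-3));

        // The trainer's test: once enough mini-batches have been seen, if the
        // probability that the loss trend is increasing (slope > 0) exceeds
        // 0.45, it concludes training has plateaued and shrinks the step size.
        if (rg.current_n() > 2000 && rg.probability_gradient_greater_than(0) > 0.45)
            std::cout << "plateau detected: shrink the step size\n";
    }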
dlib/dnn/trainer_abstract.h
@@ -46,6 +46,12 @@ namespace dlib
             ensures
                 - #get_net() == a default initialized net_type object.
                 - #get_solvers() == a set of default initialized solvers.
+                - #get_max_num_epochs() == 10000
+                - #get_mini_batch_size() == 128
+                - #get_step_size() == 1
+                - #get_min_step_size() == 1e-4
+                - #get_iterations_between_step_size_adjust() == 2000
+                - #get_step_size_shrink() == 0.1
         !*/

         explicit dnn_trainer(
@@ -55,6 +61,12 @@ namespace dlib
             ensures
                 - #get_net() == net
                 - #get_solvers() == a set of default initialized solvers.
+                - #get_max_num_epochs() == 10000
+                - #get_mini_batch_size() == 128
+                - #get_step_size() == 1
+                - #get_min_step_size() == 1e-4
+                - #get_iterations_between_step_size_adjust() == 2000
+                - #get_step_size_shrink() == 0.1
         !*/

         dnn_trainer(
@@ -66,6 +78,12 @@ namespace dlib
                 - #get_net() == net
                 - #get_solvers() == a set of solvers that are all initialized with the
                   provided solver instance.
+                - #get_max_num_epochs() == 10000
+                - #get_mini_batch_size() == 128
+                - #get_step_size() == 1
+                - #get_min_step_size() == 1e-4
+                - #get_iterations_between_step_size_adjust() == 2000
+                - #get_step_size_shrink() == 0.1
         !*/

         const net_type& get_net (
@@ -139,22 +157,107 @@ namespace dlib
                 - #get_mini_batch_size() == batch_size
         !*/

-        unsigned long get_num_epochs (
+        unsigned long get_max_num_epochs (
         ) const;
         /*!
             ensures
-                - Returns the number of passes over the training data we will execute when
-                  train() is called.
+                - train() will execute at most get_max_num_epochs() iterations over the
+                  training data before returning.
         !*/

-        void set_num_epochs (
+        void set_max_num_epochs (
             unsigned long num
         );
         /*!
             requires
                 - num > 0
             ensures
-                - @get_num_epochs() == num
+                - #get_max_num_epochs() == num
         !*/

+        void set_step_size (
+            double ss
+        );
+        /*!
+            requires
+                - ss > 0
+            ensures
+                - #get_step_size() == ss
+        !*/
+
+        double get_step_size(
+        ) const;
+        /*!
+            ensures
+                - During each training step, a solver tells us how to modify the
+                  parameters of each layer in the network.  It does this by outputting
+                  a step vector that, when added to the parameters, will hopefully
+                  result in improved network performance.  During each step, we
+                  multiply the step vector from the solver by get_step_size() before
+                  adding it to the parameters.  Therefore, get_step_size() controls
+                  the "learning rate" used during training.
+        !*/
+
+        void set_min_step_size (
+            double ss
+        );
+        /*!
+            requires
+                - ss > 0
+            ensures
+                - #get_min_step_size() == ss
+        !*/
+
+        double get_min_step_size (
+        ) const;
+        /*!
+            ensures
+                - During training, this object tests whether progress is still being
+                  made and, if it isn't, reduces get_step_size() by setting it to
+                  get_step_size()*get_step_size_shrink().  However, it will not reduce
+                  the step size below get_min_step_size().  Once this minimum step
+                  size is crossed, training terminates.
+        !*/
+
+        void set_iterations_between_step_size_adjust (
+            unsigned long min_iter
+        );
+        /*!
+            ensures
+                - #get_iterations_between_step_size_adjust() == min_iter
+        !*/
+
+        unsigned long get_iterations_between_step_size_adjust (
+        ) const;
+        /*!
+            ensures
+                - This object monitors the progress of training and estimates whether
+                  the training error is being reduced.  It does this by looking at the
+                  last get_iterations_between_step_size_adjust() mini-batch results
+                  and applying the statistical test defined by the running_gradient
+                  object to see if the training error is getting smaller.  Therefore,
+                  get_iterations_between_step_size_adjust() should always be set to
+                  something sensibly large so that this test can be done with
+                  reasonably high confidence.
+        !*/
+
+        void set_step_size_shrink_amount (
+            double shrink
+        );
+        /*!
+            requires
+                - 0 < shrink && shrink <= 1
+            ensures
+                - #get_step_size_shrink() == shrink
+        !*/
+
+        double get_step_size_shrink (
+        ) const;
+        /*!
+            ensures
+                - Whenever the training routine thinks it isn't making progress
+                  anymore it will reduce get_step_size() by multiplying it by
+                  get_step_size_shrink().
+        !*/

         void be_verbose (
@@ -185,8 +288,10 @@ namespace dlib
                 - Trains a supervised neural network based on the given training data.
                   The goal of training is to find the network parameters that minimize
                   get_net().compute_loss(data.begin(), data.end(), labels.begin()).
-                - The optimizer will run for get_num_epochs() epochs and each layer in the
-                  network will be optimized by its corresponding solver in get_solvers().
+                - The optimizer will run until get_step_size() < get_min_step_size() or
+                  get_max_num_epochs() training epochs have been executed.
+                - Each layer in the network will be optimized by its corresponding
+                  solver in get_solvers().
                 - returns #get_net()
                   (i.e. the trained network can also be accessed by calling get_net() after
                   train() finishes executing)
@@ -213,8 +318,10 @@ namespace dlib
                 - Trains an unsupervised neural network based on the given training data.
                   The goal of training is to find the network parameters that minimize
                   get_net().compute_loss(data.begin(), data.end()).
-                - The optimizer will run for get_num_epochs() epochs and each layer in the
-                  network will be optimized by its corresponding solver in get_solvers().
+                - The optimizer will run until get_step_size() < get_min_step_size() or
+                  get_max_num_epochs() training epochs have been executed.
+                - Each layer in the network will be optimized by its corresponding
+                  solver in get_solvers().
                 - returns #get_net()
                   (i.e. the trained network can also be accessed by calling get_net() after
                   train() finishes executing)
...
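
Tying the new knobs together, a hypothetical training setup might look like
the following.  net_type, sample_type, and label_type are placeholders for
whatever network and data types you actually use; the values shown simply
restate the new defaults.

    #include <dlib/dnn.h>
    #include <vector>

    template <typename net_type, typename sample_type, typename label_type>
    void configure_and_train(
        net_type& net,
        const std::vector<sample_type>& data,
        const std::vector<label_type>& labels
    )
    {
        dlib::dnn_trainer<net_type> trainer(net);
        trainer.set_mini_batch_size(128);
        trainer.set_max_num_epochs(10000);   // upper bound, rarely reached
        trainer.set_step_size(1);            // initial learning rate multiplier
        trainer.set_min_step_size(1e-4);     // training stops below this
        trainer.set_iterations_between_step_size_adjust(2000);
        trainer.set_step_size_shrink_amount(0.1);
        trainer.be_verbose();

        // train() now returns when the step size drops below the minimum or
        // when the epoch limit is hit, whichever comes first.
        trainer.train(data, labels);
    }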