Commit 9f92b082 authored by Davis King

Training now automatically reduces the learning rate when it is clear that
the loss is no longer decreasing.  There is also a new stopping condition
based on the size of the current learning rate: training stops once the
learning rate gets small enough and it is clear that no progress is being made.
parent 6f63bc62
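Taken together, the changes below add step-size control to dnn_trainer. As a
minimal sketch of how a training run might be configured against the new API
(the net_type, data, and labels are assumed to already exist, and the values
shown simply repeat the new defaults):

// Sketch only: assumes a supervised net_type and matching data/labels vectors.
dnn_trainer<net_type, sgd> trainer(net);
trainer.set_max_num_epochs(10000);                      // hard cap on passes over the data
trainer.set_step_size(1);                               // initial multiplier on solver steps
trainer.set_min_step_size(1e-4);                        // stop once the step size falls below this
trainer.set_step_size_shrink_amount(0.1);               // shrink factor applied when progress stalls
trainer.set_iterations_between_step_size_adjust(2000);  // mini-batches per progress test
trainer.train(data, labels);                            // returns when the step size bottoms out
                                                        // or the epoch limit is reached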
@@ -15,6 +15,8 @@
#include <utility>
#include <tuple>
#include <cmath>
#include "tensor_tools.h"
namespace dlib
@@ -719,23 +721,27 @@ namespace dlib
) const { return subnetwork->get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
update(x,private_get_gradient_input(),solvers,step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_layers,"");
dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
subnetwork->update(x, solvers.pop());
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
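// i.e. params = 1*params + step_size*step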
tt::add(1,details.get_layer_params(), step_size, step);
}
subnetwork->update(x, solvers.pop(), step_size);
gradient_input_is_stale = true;
}
@@ -1016,13 +1022,13 @@ namespace dlib
) const { return grad_final; }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
return update(x,private_get_gradient_input(),solvers, step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_layers,"");
// make sure grad_final is initialized to 0
@@ -1034,9 +1040,13 @@ namespace dlib
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
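// As above: params = 1*params + step_size*step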
tt::add(1,details.get_layer_params(), step_size, step);
}
gradient_input_is_stale = true;
}
@@ -1225,15 +1235,15 @@ namespace dlib
) const { return subnetwork.get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,solvers);
subnetwork.update(x,solvers, step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,gradient_input,solvers);
subnetwork.update(x,gradient_input,solvers, step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -1462,31 +1472,31 @@ namespace dlib
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
update(x,private_get_gradient_input(),solvers,step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
const auto cnt = (LAYER<SUBNET>::num_layers-SUBNET::num_layers);
if (details.size() > 1)
{
details[0].update(details[1].get_output(), gradient_input, solvers);
details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
for (size_t i = 1; i < details.size(); ++i)
{
if (i+1 < details.size())
details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
else
details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
}
}
else
{
details[0].update(subnetwork.get_output(), gradient_input, solvers);
details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
}
subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()));
subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -1672,13 +1682,22 @@ namespace dlib
}
template <typename solver_type>
void update(const tensor& /*x*/, sstack<solver_type> /*solvers*/)
void update(
const tensor& /*x*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
{
// nothing to update
}
template <typename solver_type>
void update(const tensor& /*x*/, const tensor& gradient_input, sstack<solver_type> /*solvers*/)
void update(
const tensor& /*x*/,
const tensor& /*gradient_input*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
{
// nothing to update
}
@@ -1948,13 +1967,14 @@ namespace dlib
double update (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, lbegin, wsub);
subnetwork.update(x, solvers);
subnetwork.update(x, solvers, step_size);
return l;
}
@@ -1963,23 +1983,25 @@ namespace dlib
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, lbegin, solvers);
return update(temp_tensor, lbegin, solvers, step_size);
}
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, wsub);
subnetwork.update(x, solvers);
subnetwork.update(x, solvers, step_size);
return l;
}
@@ -1987,11 +2009,12 @@ namespace dlib
double update (
input_iterator ibegin,
input_iterator iend,
std::vector<solver_type>& solvers
sstack<solver_type> solvers,
double step_size
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, solvers);
return update(temp_tensor, solvers, step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -392,7 +392,8 @@ namespace dlib
template <typename solver_type>
void update(
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -405,9 +406,12 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- Back propagates the error gradient, get_gradient_input(), through this
network and uses the provided solvers to update the network parameters.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- All elements of #get_gradient_input() are set to 0.
- have_same_dimensions(#get_final_data_gradient(), x) == true
- #get_final_data_gradient() contains the gradient of the network with
@@ -418,7 +422,8 @@
void update(
const tensor& x,
const tensor& gradient_input,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -430,6 +435,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- This function is identical to the version of update() defined immediately
above except that it back-propagates gradient_input through the network
@@ -439,6 +445,8 @@
update(x,solvers);
Except that calling update(x,gradient_input,solvers) avoids the copy
and is therefore slightly more efficient.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- All elements of #get_gradient_input() are set to 0.
- #get_final_data_gradient() contains the gradient of the network with
respect to x.
@@ -755,7 +763,8 @@ namespace dlib
double update (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -767,6 +776,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs x through the network, compares the output to the expected output
pointed to by lbegin, and updates the network parameters via
@@ -775,6 +785,8 @@
- the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(x,lbegin)
!*/
@@ -783,7 +795,8 @@
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -795,6 +808,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network, compares the output to the
expected output pointed to by lbegin, and updates the network parameters
@@ -803,6 +817,8 @@
- the expected label of *(ibegin+k) is *(lbegin+k).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend,lbegin)
!*/
@@ -811,7 +827,8 @@
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -822,11 +839,14 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs x through the network and updates the network parameters by
back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(x)
!*/
@@ -834,7 +854,8 @@
double update (
input_iterator ibegin,
input_iterator iend,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -845,11 +866,14 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network and updates the network parameters
by back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend)
!*/
@@ -33,24 +33,23 @@ namespace dlib
float get_learning_rate (
) const { return learning_rate; }
template <typename LAYER_DETAILS>
void operator() (
LAYER_DETAILS& l,
const tensor& operator() (
const tensor& params,
const tensor& params_grad
)
{
DLIB_CASSERT(l.get_layer_params().size() != 0,"");
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
v.copy_size(params_grad);
v = 0;
}
tt::affine_transform(v, v, l.get_layer_params(), params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
// perform l.get_layer_params() += v;
tt::affine_transform(l.get_layer_params(), l.get_layer_params(), v, 1, 1, 0);
//perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
tt::affine_transform(v, v, params, params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
return v;
}
friend void serialize(const sgd& item, std::ostream& out)
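Written out, the step vector v returned by sgd and the update later applied by
the network (the tt::add calls in core.h above) amount to the following, with
momentum mu, weight decay lambda, learning rate alpha, step size s, parameters
p, and gradient g:

\[
v \leftarrow \mu v - \lambda \alpha\, p - \alpha\, g,
\qquad
p \leftarrow p + s\, v
\]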
@@ -18,11 +18,11 @@ namespace dlib
/*!
WHAT THIS OBJECT REPRESENTS
A solver defines the parameter update rule for a single layer in a deep
neural network. It takes a parameter gradient vector and a layer and
updates the layer's parameters. Importantly, each solver instance is used
with only one layer in a network. This allows us to define solvers that
have per layer state, for example, a solver may keep a momentum term and
apply it to its update rule.
neural network. It takes a parameter gradient vector and the layer's
parameters and tells you how the parameters should be updated.
Importantly, each solver instance is used with only one layer in a network.
This allows us to define solvers that have per layer state, for example, a
solver may keep a momentum term and apply it to its update rule.
Note that there is no dlib::EXAMPLE_SOLVER type. It is shown here purely
to document the interface a solver object must implement.
@@ -33,22 +33,22 @@ namespace dlib
EXAMPLE_SOLVER(
);
template <typename LAYER_DETAILS>
void operator() (
LAYER_DETAILS& l,
const tensor& operator() (
const tensor& params,
const tensor& params_grad
);
);
/*!
requires
- LAYER_DETAILS implements the EXAMPLE_LAYER_ interface defined in
layers_abstract.h.
- l.get_layer_params().size() != 0
- have_same_dimensions(l.get_layer_params(), params_grad) == true.
- params.size() != 0
- have_same_dimensions(params, params_grad) == true.
- When this function is invoked on a particular solver instance, it is
always supplied with the same LAYER_DETAILS object.
always supplied with parameters from the same layer instance. That is,
the solver is allowed to remember things from one invocation to another
and to assume that it is being serially applied to optimize the same
parameters.
ensures
- Updates the parameters in l. That is, l.get_layer_params() is modified
based on the parameter gradient vector stored in params_grad.
- Returns a step vector V that is intended to be used to update the
parameters by adding V to params.
!*/
};
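For illustration, a minimal stateless solver against this new interface could
look like the sketch below. It is not part of dlib: the class name plain_gd
and its fixed learning rate are made up, and it reuses the affine_transform
overload shown in solvers.h above to compute step = -learning_rate*params_grad.

// Hypothetical solver sketch, not part of dlib.
class plain_gd
{
public:
    explicit plain_gd(float learning_rate_ = 0.01) : learning_rate(learning_rate_) {}

    const tensor& operator() (
        const tensor& params,
        const tensor& params_grad
    )
    {
        DLIB_CASSERT(params.size() != 0,"");
        v.copy_size(params_grad);
        // v = 0*v + 0*params - learning_rate*params_grad + 0
        tt::affine_transform(v, v, params, params_grad, 0, 0, -learning_rate, 0);
        return v;
    }

private:
    resizable_tensor v;
    float learning_rate;
};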
@@ -7,13 +7,14 @@
#include "core.h"
#include "solvers.h"
#include "../statistics.h"
#include "../console_progress_indicator.h"
#include <chrono>
#include "../serialize.h"
#include "../pipe.h"
#include "../threads.h"
#include "cuda_dlib.h"
#include "../statistics/running_gradient.h"
#include <atomic>
namespace dlib
{
@@ -95,15 +96,15 @@ namespace dlib
mini_batch_size = batch_size;
}
unsigned long get_num_epochs (
) const { return num_epochs; }
unsigned long get_max_num_epochs (
) const { return max_num_epochs; }
void set_num_epochs (
void set_max_num_epochs (
unsigned long num
)
{
DLIB_CASSERT(num > 0,"");
num_epochs = num;
max_num_epochs = num;
}
void be_verbose (
@@ -159,14 +160,14 @@ namespace dlib
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
for (unsigned long epoch_iteration = 0;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -183,10 +184,10 @@ namespace dlib
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
}
@@ -195,10 +196,10 @@ namespace dlib
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
return get_net();
@@ -214,14 +215,14 @@ namespace dlib
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
for (unsigned long epoch_iteration = 0;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -236,10 +237,10 @@ namespace dlib
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
}
@@ -248,10 +249,10 @@ namespace dlib
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
return get_net();
@@ -260,14 +261,20 @@ namespace dlib
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 2;
int version = 3;
serialize(version, out);
serialize(item.rs, out);
serialize(item.num_epochs, out);
serialize(item.rg, out);
serialize(item.max_num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.iter_between_step_size_adjust.load(), out);
serialize(item.step_size_shrink.load(), out);
}
friend void deserialize(dnn_trainer& item, std::istream& in)
@@ -275,14 +282,21 @@ namespace dlib
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 2)
if (version != 3)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
double temp;
deserialize(item.rs, in);
deserialize(item.num_epochs, in);
deserialize(item.rg, in);
deserialize(item.max_num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.solvers, in);
deserialize(temp, in); item.step_size = temp;
deserialize(item.min_step_size, in);
deserialize(temp, in); item.iter_between_step_size_adjust = temp;
deserialize(temp, in); item.step_size_shrink = temp;
}
double get_average_loss (
@@ -299,6 +313,62 @@ namespace dlib
rs.clear();
}
void set_step_size (
double ss
)
{
DLIB_CASSERT(ss > 0,"");
wait_for_thread_to_pause();
step_size = ss;
}
double get_step_size(
) const
{
return step_size;
}
void set_min_step_size (
double ss
)
{
DLIB_CASSERT(ss > 0,"");
min_step_size = ss;
}
double get_min_step_size (
) const
{
return min_step_size;
}
void set_iterations_between_step_size_adjust (
unsigned long min_iter
)
{
iter_between_step_size_adjust = min_iter;
}
unsigned long get_iterations_between_step_size_adjust (
) const
{
return iter_between_step_size_adjust;
}
void set_step_size_shrink_amount (
double shrink
)
{
DLIB_CASSERT(0 < shrink && shrink <= 1,"");
step_size_shrink = shrink;
}
double get_step_size_shrink (
) const
{
return step_size_shrink;
}
private:
struct job_t
{
@@ -309,16 +379,20 @@ namespace dlib
template <typename T>
void run_update(job_t& next_job, const T&)
{
rs.add(net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers)));
double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
rs.add(loss);
rg.add(loss);
}
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_wich_run_update;
rs.add(net.update(next_job.t, make_sstack(solvers)));
double loss = net.update(next_job.t, make_sstack(solvers), step_size);
rs.add(loss);
rg.add(loss);
}
void thread()
void thread() try
{
// Make sure this thread uses the same cuda device as the thread that created
// the dnn_trainer object.
@@ -330,7 +404,23 @@ namespace dlib
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
run_update(next_job, pick_wich_run_update);
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the step size.
if (rg.current_n() > iter_between_step_size_adjust)
{
if (rg.probability_gradient_greater_than(0) > 0.45)
{
step_size = step_size_shrink*step_size;
}
rg.clear();
}
}
}
catch(std::exception& e)
{
std::cerr << e.what() << std::endl;
throw;
}
void wait_for_thread_to_pause() const
@@ -339,29 +429,40 @@ namespace dlib
}
const static long string_pad = 10;
const static long epoch_string_pad = 4;
const static long ss_string_pad = 4;
void init()
{
num_epochs = 300;
mini_batch_size = 32;
max_num_epochs = 10000;
mini_batch_size = 128;
verbose = false;
cuda_device_id = dlib::cuda::get_device();
step_size = 1;
min_step_size = 1e-4;
iter_between_step_size_adjust = 2000;
step_size_shrink = 0.1;
start();
}
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
dlib::pipe<job_t> job_pipe;
running_stats<double> rs;
unsigned long num_epochs;
running_gradient rg;
unsigned long max_num_epochs;
size_t mini_batch_size;
bool verbose;
int cuda_device_id;
net_type net;
std::vector<solver_type> solvers;
std::atomic<double> step_size;
double min_step_size;
std::atomic<long> iter_between_step_size_adjust;
std::atomic<double> step_size_shrink;
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
};
// ----------------------------------------------------------------------------------------
@@ -46,6 +46,12 @@ namespace dlib
ensures
- #get_net() == a default initialized net_type object.
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
explicit dnn_trainer(
@@ -55,6 +61,12 @@ namespace dlib
ensures
- #get_net() == net
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
dnn_trainer(
@@ -66,6 +78,12 @@ namespace dlib
- #get_net() == net
- #get_solvers() == a set of solvers that are all initialized with the
provided solver instance.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
const net_type& get_net (
@@ -139,22 +157,107 @@ namespace dlib
- #get_mini_batch_size() == batch_size
!*/
unsigned long get_num_epochs (
unsigned long get_max_num_epochs (
) const;
/*!
ensures
- Returns the number of passes over the training data we will execute when
train() is called.
- train() will execute at most get_max_num_epochs() iterations over the
training data before returning.
!*/
void set_num_epochs (
void set_max_num_epochs (
unsigned long num
);
/*!
requires
- num > 0
ensures
- @get_num_epochs() == num
- #get_max_num_epochs() == num
!*/
void set_step_size (
double ss
);
/*!
requires
- ss > 0
ensures
- #get_step_size() == ss
!*/
double get_step_size(
) const;
/*!
ensures
- During each training step, a solver tells us how to modify the parameters
of each layer in the network. It does this by outputting a step vector
that, when added to the parameters, will hopefully result in improved
network performance. In our case, during each step we multiply the
step vector from the solver by get_step_size() before adding it to the
parameters. Therefore, get_step_size() controls the "learning rate" used
during training.
!*/
void set_min_step_size (
double ss
);
/*!
requires
- ss > 0
ensures
- #get_min_step_size() == ss
!*/
double get_min_step_size (
) const;
/*!
ensures
- During training, this object will test if progress is still being made
and if it isn't then it will reduce get_step_size() by setting it to
get_step_size()*get_step_size_shrink(). However, it will not reduce it
below get_min_step_size(). Once this minimum step size is crossed the
training will terminate.
!*/
void set_iterations_between_step_size_adjust (
unsigned long min_iter
);
/*!
ensures
- #get_iterations_between_step_size_adjust() == min_iter
!*/
unsigned long get_iterations_between_step_size_adjust (
) const;
/*!
ensures
- This object monitors the progress of training and estimates if the
training error is being reduced. It does this by looking at the last
get_iterations_between_step_size_adjust() mini-batch results and applying
the statistical test defined by the running_gradient object to see if the
training error is getting smaller.
Therefore, get_iterations_between_step_size_adjust() should always be set
to something sensibly large so that this test can be done with reasonably
high confidence.
!*/
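The statistical test can be sketched in isolation. The toy program below (an
illustration only, not trainer code; the header path is assumed from the
include added in trainer.h) feeds a noisy but decaying loss into a
running_gradient using the trainer's default window of 2000 samples, then
prints the probability that the loss is still trending upward. The trainer
shrinks the step size whenever that probability exceeds 0.45.

#include <dlib/statistics/running_gradient.h>
#include <cstdlib>
#include <iostream>

int main()
{
    dlib::running_gradient rg;
    double loss = 1;
    for (int i = 0; i < 10000; ++i)
    {
        loss *= 0.999;  // the underlying loss keeps decreasing
        rg.add(loss + 0.01*std::rand()/RAND_MAX);  // observed loss is noisy
        if (rg.current_n() > 2000)  // same window as iter_between_step_size_adjust
        {
            // Small values mean the loss is clearly still dropping.
            std::cout << rg.probability_gradient_greater_than(0) << std::endl;
            rg.clear();
        }
    }
}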
void set_step_size_shrink_amount (
double shrink
);
/*!
requires
- 0 < shrink && shrink <= 1
ensures
- #get_step_size_shrink() == shrink
!*/
double get_step_size_shrink (
) const;
/*!
ensures
- Whenever the training routine thinks it isn't making progress anymore it
will reduce get_step_size() by multiplying it by get_step_size_shrink().
!*/
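With the defaults documented above (step size 1, shrink amount 0.1, minimum
step size 1e-4), the step size can therefore take on the values 1, 0.1, 0.01,
1e-3, and 1e-4; the shrink that would take it to 1e-5 pushes it below
get_min_step_size() and ends training.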
void be_verbose (
@@ -185,8 +288,10 @@ namespace dlib
- Trains a supervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end(), labels.begin()).
- The optimizer will run for get_num_epochs() epochs and each layer in the
network will be optimized by its corresponding solver in get_solvers().
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)
@@ -213,8 +318,10 @@ namespace dlib
- Trains an unsupervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end()).
- The optimizer will run for get_num_epochs() epochs and each layer in the
network will be optimized by its corresponding solver in get_solvers().
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)