Commit d85de930 authored by Davis King

Split the update() methods into two parts: one that computes gradients with
respect to parameters and one that updates the parameters with those gradients.
parent 8c64a656
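In practical terms, training code that previously made a single update() call now makes a gradient-computation call followed by a parameter-update call. A minimal sketch of a supervised training step under the new API; the helper name, network type, and solver container below are illustrative placeholders, not part of this commit:

    #include <dlib/dnn.h>

    // Hypothetical helper: "net_type" is any dlib loss network, "solvers" holds one
    // solver per computational layer, and "labels" iterates over the batch's labels.
    template <typename net_type, typename label_iterator>
    double train_one_batch(
        net_type& net,
        std::vector<dlib::sgd>& solvers,
        const dlib::tensor& x,
        label_iterator labels,
        double step_size
    )
    {
        // Before this commit, one call did everything:
        //   double loss = net.update(x, labels, dlib::make_sstack(solvers), step_size);

        // After this commit the two phases are explicit.  First run the forward pass
        // and backpropagation; each layer stores its gradient in the tensor returned
        // by its get_parameter_gradient().
        double loss = net.compute_parameter_gradients(x, labels);

        // Then let the solvers turn the stored gradients into parameter updates,
        // scaled by step_size.
        net.update_parameters(dlib::make_sstack(solvers), step_size);
        return loss;
    }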
@@ -804,31 +804,42 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return subnetwork->get_final_data_gradient(); }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            update(x,private_get_gradient_input(),solvers,step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
             params_grad.copy_size(details.get_layer_params());
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));
+
+            subnetwork->back_propagate_error(x);
+
+            // zero out get_gradient_input()
+            gradient_input_is_stale = true;
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
             {
                 const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                 tt::add(1,details.get_layer_params(), step_size, step);
             }
-            subnetwork->update(x, solvers.pop(), step_size);
-            gradient_input_is_stale = true;
+            subnetwork->update_parameters(solvers.pop(), step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return *subnetwork; }
         subnet_type& subnet() { return *subnetwork; }
@@ -847,7 +858,7 @@ namespace dlib
         friend void serialize(const add_layer& item, std::ostream& out)
         {
-            int version = 1;
+            int version = 2;
             serialize(version, out);
             serialize(*item.subnetwork, out);
             serialize(item.details, out);
@@ -856,13 +867,14 @@ namespace dlib
             serialize(item.get_output_and_gradient_input_disabled, out);
             serialize(item.x_grad, out);
             serialize(item.cached_output, out);
+            serialize(item.params_grad, out);
         }

         friend void deserialize(add_layer& item, std::istream& in)
         {
             int version = 0;
             deserialize(version, in);
-            if (version != 1)
+            if (!(1 <= version && version <= 2))
                 throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
             deserialize(*item.subnetwork, in);
             deserialize(item.details, in);
@@ -871,6 +883,8 @@ namespace dlib
             deserialize(item.get_output_and_gradient_input_disabled, in);
             deserialize(item.x_grad, in);
             deserialize(item.cached_output, in);
+            if (version == 2)
+                deserialize(item.params_grad, in);
         }

         friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
@@ -910,6 +924,7 @@ namespace dlib
             std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
             std::swap(x_grad, item.x_grad);
             std::swap(cached_output, item.cached_output);
+            std::swap(params_grad, item.params_grad);
         }
@@ -924,10 +939,10 @@ namespace dlib
         resizable_tensor x_grad;
         resizable_tensor cached_output;

-        // The following 2 objects don't logically contribute to the state of this class.
-        // They are only here to prevent them from being reallocated over and over in
-        // member functions.
         resizable_tensor params_grad;
+
+        // temp_tensor doesn't logically contribute to the state of this object.
+        // It is here only to prevent it from being reallocated over and over.
         resizable_tensor temp_tensor;

     };
@@ -1118,16 +1133,12 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return grad_final; }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            return update(x,private_get_gradient_input(),solvers, step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // make sure grad_final is initialized to 0
             if (!have_same_dimensions(x, grad_final))
                 grad_final.copy_size(x);
@@ -1138,15 +1149,27 @@ namespace dlib
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));

+            // zero out get_gradient_input()
+            gradient_input_is_stale = true;
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
             {
                 const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                 tt::add(1,details.get_layer_params(), step_size, step);
             }
-            gradient_input_is_stale = true;
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return input_layer; }
         subnet_type& subnet() { return input_layer; }
@@ -1347,18 +1370,27 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return subnetwork.get_final_data_gradient(); }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            subnetwork.update(x,solvers, step_size);
+            subnetwork.back_propagate_error(x);
+        }
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
+        {
+            subnetwork.back_propagate_error(x,gradient_input);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double step_size)
         {
-            subnetwork.update(x,gradient_input,solvers, step_size);
+            subnetwork.update_parameters(solvers, step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return subnetwork; }
         subnet_type& subnet() { return subnetwork; }
@@ -1430,6 +1462,11 @@ namespace dlib
         { return subnetwork.private_get_gradient_input(); }

         subnet_type subnetwork;
+
+        // This member doesn't logically contribute to the state of the object since it is
+        // always empty. It's just here so we can have the get_parameter_gradient() methods
+        // which have to return something. So they return this empty tensor.
+        resizable_tensor params_grad;
     };

 // ----------------------------------------------------------------------------------------
@@ -1598,32 +1635,42 @@ namespace dlib
             return details[0].get_gradient_input();
         }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        const tensor& get_parameter_gradient(
+        ) const { return details[0].get_parameter_gradient(); }
+
+        tensor& get_parameter_gradient (
+        ) { return details[0].get_parameter_gradient(); }
+
+        void back_propagate_error(const tensor& x)
         {
-            update(x,private_get_gradient_input(),solvers,step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            const auto cnt = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers);
             if (details.size() > 1)
             {
-                details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
+                details[0].back_propagate_error(details[1].get_output(), gradient_input);
                 for (size_t i = 1; i < details.size(); ++i)
                 {
                     if (i+1 < details.size())
-                        details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
+                        details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient());
                     else
-                        details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
+                        details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient());
                 }
             }
             else
             {
-                details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
+                details[0].back_propagate_error(subnetwork.get_output(), gradient_input);
            }
-            subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
+            subnetwork.back_propagate_error(x, details.back().get_final_data_gradient());
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            for (size_t i = 0; i < details.size(); ++i)
+                details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size);
+            subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -1827,25 +1874,19 @@ namespace dlib
             return grad_final;
         }

-        template <typename solver_type>
-        void update(
-            const tensor& /*x*/,
-            sstack<solver_type> /*solvers*/,
-            double /*step_size*/
-        )
+        void back_propagate_error(const tensor& /*x*/)
+        {
+            // nothing to do
+        }
+        void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/)
         {
-            // nothing to update
+            // nothing to do
         }

         template <typename solver_type>
-        void update(
-            const tensor& /*x*/,
-            const tensor& /*gradient_input*/,
-            sstack<solver_type> /*solvers*/,
-            double /*step_size*/
-        )
+        void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
         {
-            // nothing to update
+            // nothing to do
         }

         const subnet_type& subnet() const { return input_layer; }
@@ -2141,58 +2182,55 @@ namespace dlib
             return compute_loss(temp_tensor);
         }

-        template <typename label_iterator, typename solver_type>
-        double update (
+        template <typename label_iterator>
+        double compute_parameter_gradients (
             const tensor& x,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, lbegin, wsub);
-            subnetwork.update(x, solvers, step_size);
+            subnetwork.back_propagate_error(x);
             return l;
         }

-        template <typename input_iterator, typename label_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator, typename label_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
             input_iterator iend,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         )
         {
             to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, lbegin, solvers, step_size);
+            return compute_parameter_gradients(temp_tensor, lbegin);
         }

-        template <typename solver_type>
-        double update (
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        double compute_parameter_gradients (
+            const tensor& x
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, wsub);
-            subnetwork.update(x, solvers, step_size);
+            subnetwork.back_propagate_error(x);
             return l;
         }

-        template <typename input_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
-            input_iterator iend,
+            input_iterator iend
+        )
+        {
+            to_tensor(ibegin,iend,temp_tensor);
+            return compute_parameter_gradients(temp_tensor);
+        }
+
+        template <typename solver_type>
+        void update_parameters (
             sstack<solver_type> solvers,
             double step_size
         )
         {
-            to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, solvers, step_size);
+            subnetwork.update_parameters(solvers, step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -2477,18 +2515,24 @@ namespace dlib
             return subnetwork.get_final_data_gradient();
         }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void back_propagate_error(const tensor& x)
         {
-            subnetwork.update(x,solvers);
+            subnetwork.back_propagate_error(x);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
-            subnetwork.update(x,gradient_input,solvers);
+            subnetwork.update_parameters(solvers, step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const
         {
             return subnetwork;
@@ -2558,6 +2602,11 @@ namespace dlib
         { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); }

         subnet_type subnetwork;
+
+        // This member doesn't logically contribute to the state of the object since it is
+        // always empty. It's just here so we can have the get_parameter_gradient() methods
+        // which have to return something. So they return this empty tensor.
+        resizable_tensor params_grad;
     };

     template <template<typename> class T, typename U>
     struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};

...
@@ -410,33 +410,53 @@ namespace dlib
         /*!
             ensures
                 - returns the error gradient for this network.  That is, this is the error
-                  gradient that this network will use to update itself when update() is
-                  called.  Therefore, when performing back propagation, layers that sit on
-                  top of this network layer write their back propagated error gradients
-                  into get_gradient_input().  Or to put it another way, during back
-                  propagation, layers take the contents of their get_gradient_input() and
-                  back propagate it through themselves and store the results into their
-                  subnetwork's get_gradient_input().
+                  gradient that this network will use to compute parameter gradients when
+                  back_propagate_error() is called.  Therefore, when performing back
+                  propagation, layers that sit on top of this network layer write their
+                  back-propagated error gradients into get_gradient_input().  Or to put it
+                  another way, during back-propagation, layers take the contents of their
+                  get_gradient_input() and back-propagate it through themselves and store
+                  the result into their subnetwork's get_gradient_input().

                   This means you should consider get_gradient_input() as an input to the
-                  update() method.
+                  back_propagate_error() method.
         !*/

         const tensor& get_final_data_gradient(
         ) const;
         /*!
             ensures
-                - if update() has been called to back-propagate a gradient through this
-                  network then you can call get_final_data_gradient() to obtain the last
-                  gradient computed.  That is, this function returns the gradient of the
-                  network with respect to its inputs.
+                - if back_propagate_error() has been called to back-propagate a gradient
+                  through this network then you can call get_final_data_gradient() to
+                  obtain the last data gradient computed.  That is, this function returns
+                  the gradient of the network with respect to its inputs.
+                - Note that there is only one "final data gradient" for an entire network,
+                  not one per layer, since there is only one input to the entire network.
         !*/

-        template <typename solver_type>
-        void update(
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        const tensor& get_parameter_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called then you can call
+                  get_parameter_gradient() to find the gradient of this layer's parameters.
+                  When we update the parameters by calling update_parameters(), it will use
+                  the gradient in get_parameter_gradient() to perform the update.
+                  Therefore, you should consider get_parameter_gradient() as an input to
+                  update_parameters().
+        !*/
+
+        tensor& get_parameter_gradient (
+        );
+        /*!
+            ensures
+                - returns a non-const reference to the tensor returned by the above
+                  get_parameter_gradient() method.  You could use this method to modify the
+                  parameter gradient in some way before invoking update_parameters().
+        !*/
+
+        void back_propagate_error(
+            const tensor& x
         );
         /*!
             requires
@@ -445,28 +465,21 @@ namespace dlib
                   subsequently modified in any way.
                 - get_gradient_input() has been set equal to the gradient of this network's
                   output with respect to some loss function.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - Back propagates the error gradient, get_gradient_input(), through this
-                  network and uses the provided solvers to update the network parameters.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                  network and computes parameter and data gradients, via backpropagation.
+                  Specifically, this function populates get_final_data_gradient() and also,
+                  for each layer, the tensor returned by get_parameter_gradient().
                 - All elements of #get_gradient_input() are set to 0.
-                - have_same_dimensions(#get_final_data_gradient(), x) == true
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
                 - #get_final_data_gradient() contains the gradient of the network with
                   respect to x.
         !*/

-        template <typename solver_type>
-        void update(
+        void back_propagate_error(
             const tensor& x,
-            const tensor& gradient_input,
-            sstack<solver_type> solvers,
-            double step_size
+            const tensor& gradient_input
         );
         /*!
             requires
@@ -474,27 +487,45 @@ namespace dlib
                   Moreover, this was the most recent call to forward() and x has not been
                   subsequently modified in any way.
                 - have_same_dimensions(gradient_input, get_output()) == true
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
-                - This function is identical to the version of update() defined immediately
-                  above except that it back-propagates gradient_input through the network
-                  instead of get_gradient_input().  Therefore, this version of update is
-                  equivalent to performing:
+                - This function is identical to the version of back_propagate_error()
+                  defined immediately above except that it back-propagates gradient_input
+                  through the network instead of get_gradient_input().  Therefore, this
+                  version of back_propagate_error() is equivalent to performing:
                     get_gradient_input() = gradient_input;
-                    update(x,solvers);
-                  Except that calling update(x,gradient_input,solvers) avoids the copy
-                  and is therefore slightly more efficient.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                    back_propagate_error(x);
+                  Except that calling back_propagate_error(x,gradient_input) avoids the
+                  copy and is therefore slightly more efficient.
                 - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
                 - #get_final_data_gradient() contains the gradient of the network with
                   respect to x.
         !*/

+        template <typename solver_type>
+        void update_parameters(
+            sstack<solver_type> solvers,
+            double step_size
+        );
+        /*!
+            requires
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - back_propagate_error() has been called.
+                - The given solvers have only ever been used with this network.  That is,
+                  if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
+                - solvers.size() >= num_computational_layers
+                - 0 < step_size <= 1
+            ensures
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector and we add
+                  step_size times that vector to the layer's parameters.
+        !*/
+
         void clean(
         );
         /*!
@@ -831,12 +862,10 @@ namespace dlib
         // -------------

-        template <typename label_iterator, typename solver_type>
-        double update (
+        template <typename label_iterator>
+        double compute_parameter_gradients (
             const tensor& x,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         );
         /*!
             requires
@@ -844,31 +873,22 @@ namespace dlib
                 - x.num_samples() > 0
                 - lbegin == iterator pointing to the start of a range of
                   x.num_samples()/sample_expansion_factor label_type elements.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - runs x through the network, compares the output to the expected output
-                  pointed to by lbegin, and updates the network parameters via
-                  backpropagation.
+                  pointed to by lbegin, and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
                 - for all valid k:
                     - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
                 - returns compute_loss(x,lbegin)
         !*/

-        template <typename input_iterator, typename label_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator, typename label_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
             input_iterator iend,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         );
         /*!
             requires
@@ -876,77 +896,72 @@ namespace dlib
                 - std::distance(ibegin,iend) > 0
                 - lbegin == iterator pointing to the start of a range of
                   std::distance(ibegin,iend) label_type elements.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - runs [ibegin,iend) through the network, compares the output to the
-                  expected output pointed to by lbegin, and updates the network parameters
-                  via backpropagation.
+                  expected output pointed to by lbegin, and computes parameter and data
+                  gradients with respect to the loss, via backpropagation.  Specifically,
+                  this function updates get_final_data_gradient() and also, for each layer,
+                  the tensor returned by get_parameter_gradient().
                 - for all valid k:
                     - the expected label of *(ibegin+k) is *(lbegin+k).
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
                 - returns compute_loss(ibegin,iend,lbegin)
         !*/

-        // -------------
-
-        template <typename solver_type>
-        double update (
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        double compute_parameter_gradients (
+            const tensor& x
         );
         /*!
             requires
                 - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
                 - x.num_samples()%sample_expansion_factor == 0
                 - x.num_samples() > 0
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
-                - runs x through the network and updates the network parameters by
-                  back-propagating the loss gradient through the network.
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                - runs x through the network and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
                 - returns compute_loss(x)
         !*/

-        template <typename input_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
-            input_iterator iend,
+            input_iterator iend
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+            ensures
+                - runs [ibegin,iend) through the network and computes parameter and data
+                  gradients with respect to the loss, via backpropagation.  Specifically,
+                  this function updates get_final_data_gradient() and also, for each layer,
+                  the tensor returned by get_parameter_gradient().
+                - returns compute_loss(ibegin,iend)
+        !*/
+
+        template <typename solver_type>
+        void update_parameters (
             sstack<solver_type> solvers,
             double step_size
         );
         /*!
             requires
-                - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
-                - [ibegin, iend) is an iterator range over input_type objects.
-                - std::distance(ibegin,iend) > 0
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - compute_parameter_gradients() has been called.
                 - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
+                  is, if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
                 - solvers.size() >= num_computational_layers
                 - 0 < step_size <= 1
             ensures
-                - runs [ibegin,iend) through the network and updates the network parameters
-                  by back-propagating the loss gradient through the network.
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
-                - returns compute_loss(ibegin,iend)
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector and we add
+                  step_size times that vector to the layer's parameters.
         !*/

         // -------------

...
@@ -418,14 +418,16 @@ namespace dlib
         template <typename T>
         void run_update(job_t& next_job, const T&)
         {
-            double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
+            double loss = net.compute_parameter_gradients(next_job.t, next_job.labels.begin());
+            net.update_parameters(make_sstack(solvers),step_size);
             record_loss(loss);
         }

         void run_update(job_t& next_job, const no_label_type&)
         {
             no_label_type pick_which_run_update;
-            double loss = net.update(next_job.t, make_sstack(solvers), step_size);
+            double loss = net.compute_parameter_gradients(next_job.t);
+            net.update_parameters(make_sstack(solvers), step_size);
             record_loss(loss);
         }
@@ -438,8 +440,9 @@ namespace dlib
             job_t next_job;
             while(job_pipe.dequeue(next_job))
             {
-                // call net.update() but pick the right version for unsupervised or
-                // supervised training based on the type of label_type.
+                // call net.compute_parameter_gradients() and net.update_parameters() but
+                // pick the right version for unsupervised or supervised training based on
+                // the type of label_type.
                 run_update(next_job, pick_which_run_update);

                 // If we have been running for a while then check if the loss is still

...
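Because gradient computation and the parameter update are now separate calls, calling code can examine (or, per the new get_parameter_gradient() documentation, modify) a layer's gradient before the solvers are applied. A rough sketch of that pattern, assuming dlib's layer<index>() accessor and that the layer at index 1 is a computational layer; none of the code below is part of the commit itself:

    #include <dlib/dnn.h>
    #include <iostream>

    template <typename net_type, typename label_iterator>
    void step_and_inspect(
        net_type& net,
        std::vector<dlib::sgd>& solvers,
        const dlib::tensor& x,
        label_iterator labels,
        double step_size
    )
    {
        // Phase 1: forward pass + backpropagation; each layer stores its gradient.
        net.compute_parameter_gradients(x, labels);

        // Between the phases the per-layer gradients are available for inspection.
        dlib::tensor& g = dlib::layer<1>(net).get_parameter_gradient();
        std::cout << "layer 1 parameter gradient has " << g.size() << " elements\n";

        // Phase 2: the solvers consume the stored gradients and update the parameters.
        net.update_parameters(dlib::make_sstack(solvers), step_size);
    }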