Commit c1433b3d authored by Davis King

Upgrade the layer interface so that you can implement layers that operate in-place.

parent 69490292
......@@ -61,7 +61,171 @@ namespace dlib
return std::make_tuple(std::get<indices>(item)...);
}
}
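// Note: alwaysbool<T>::type is always bool; it exists only so the decltype expressions
// below can be mapped onto a bool return type for SFINAE-based overload selection.
// rt() is only a declaration and is only ever named inside unevaluated decltype()
// contexts, so it never needs to be called (or even defined).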
template <typename T> struct alwaysbool { typedef bool type; };
resizable_tensor& rt();
// The significance of a layer's backward method requiring forward's outputs is
// that such a layer can't have an in-place layer stacked on top of it because
// in-place layers overwrite the output of the layer they sit on top of.
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto is_inplace_layer(
layer_type& layer,
const SUBNET& sub
) -> typename alwaysbool<decltype(layer.forward(sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto is_inplace_layer(
layer_type& layer,
const SUBNET& sub
) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
{
return true;
}
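// The overloads above rely on expression SFINAE: each trailing return type names an
// expression (e.g. layer.forward_inplace(...)) that is only well-formed for layers
// providing that particular method signature, so for any given layer type exactly one
// overload survives overload resolution and reports true or false. A minimal,
// self-contained sketch of the same idiom in plain C++ (independent of dlib, with
// made-up types) would look like:
//
//   template <typename T> struct alwaysbool { typedef bool type; };
//
//   struct in_place_like     { void forward_inplace() {} };
//   struct out_of_place_like { void forward()         {} };
//
//   // viable only if t.forward_inplace() is a valid expression
//   template <typename T>
//   constexpr auto runs_in_place(T& t) -> typename alwaysbool<decltype(t.forward_inplace())>::type
//   { return true; }
//
//   // viable only if t.forward() is a valid expression
//   template <typename T>
//   constexpr auto runs_in_place(T& t) -> typename alwaysbool<decltype(t.forward())>::type
//   { return false; }
//
//   // runs_in_place(x) then yields true for in_place_like, false for out_of_place_like,
//   // and is ambiguous (a compile error) for a type that defines both methods.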
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& computed_output,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
{
layer.backward(computed_output,gradient_input,sub,params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& ,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward(gradient_input,sub,params_grad))
{
layer.backward(gradient_input,sub,params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& computed_output,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
{
layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
tensor& data_output
) -> decltype(layer.forward(sub,rt()))
{
// This overload of call_layer_forward() exists because the template naturally
// gets instantiated, but only on code paths that are never executed. So rather
// than writing a bunch of hard-to-read template magic around the call sites, we
// just provide this overload that does nothing (and an assert to make sure it
// is never actually called).
DLIB_CASSERT(false, "This should never happen");
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
resizable_tensor& data_output
) -> decltype(layer.forward(sub,data_output))
{
layer.forward(sub,data_output);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
tensor& data_output
) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
{
layer.forward_inplace(sub.get_output(),data_output);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
resizable_tensor& data_output
) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
{
if (!have_same_dimensions(data_output, sub.get_output()))
data_output.copy_size(sub.get_output());
layer.forward_inplace(sub.get_output(),data_output);
}
} // end namespace impl
template <typename Head, typename... Tail>
std::tuple<Tail...> tuple_tail(
......@@ -162,7 +326,7 @@ namespace dlib
namespace dimpl
{
template <typename T, typename enabled=void>
template <typename T, bool is_first = true, typename enabled=void>
class subnet_wrapper
{
/*!
......@@ -173,6 +337,13 @@ namespace dlib
objects to the layer callbacks those callbacks won't be able to
interact with the subnetworks in a way other than specified
by the SUBNET interface spec.
We also allow the top layer of a subnet_wrapper stack to call the
private_get_output() and private_get_gradient_input() functions. This
way, layers that have had their output/gradient overwritten by in-place
layers can only be accessed from the in-place layers that sit directly
on top of them since those in-place layers are the only layers that
know how to interact with them properly.
!*/
public:
......@@ -185,7 +356,31 @@ namespace dlib
};
template <typename T>
class subnet_wrapper<T,typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
{
public:
subnet_wrapper(const subnet_wrapper&) = delete;
subnet_wrapper& operator=(const subnet_wrapper&) = delete;
typedef T wrapped_type;
const static size_t num_layers = T::num_layers;
subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}
const tensor& get_output() const { return l.private_get_output(); }
tensor& get_gradient_input() { return l.private_get_gradient_input(); }
const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }
private:
T& l;
subnet_wrapper<typename T::subnet_type,false> subnetwork;
};
template <typename T>
class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
{
public:
......@@ -231,8 +426,11 @@ namespace dlib
add_layer(
):
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
add_layer(const add_layer&) = default;
......@@ -242,6 +440,8 @@ namespace dlib
template <typename T, typename U, typename E>
friend class add_layer;
template <typename T, bool is_first, typename E>
friend class dimpl::subnet_wrapper;
// Allow copying networks from one to another as long as their corresponding
// layers can be constructed from each other.
......@@ -253,9 +453,12 @@ namespace dlib
details(item.layer_details()),
this_layer_setup_called(item.this_layer_setup_called),
gradient_input_is_stale(item.gradient_input_is_stale),
get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
x_grad(item.x_grad),
cached_output(item.cached_output)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T>
......@@ -266,8 +469,11 @@ namespace dlib
details(layer_det),
subnetwork(std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T>
......@@ -278,8 +484,11 @@ namespace dlib
details(std::move(layer_det)),
subnetwork(std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T, typename ...U>
......@@ -290,8 +499,11 @@ namespace dlib
details(std::get<0>(layer_det)),
subnetwork(tuple_tail(layer_det),std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T, typename ...U>
......@@ -343,21 +555,54 @@ namespace dlib
details.setup(wsub);
this_layer_setup_called = true;
}
details.forward(wsub, cached_output);
if (this_layer_operates_inplace())
impl::call_layer_forward(details, wsub, private_get_output());
else
impl::call_layer_forward(details, wsub, cached_output);
gradient_input_is_stale = true;
return get_output();
return private_get_output();
}
const tensor& get_output() const { return cached_output; }
tensor& get_gradient_input()
{
if (gradient_input_is_stale)
{
gradient_input_is_stale = false;
x_grad.copy_size(get_output());
x_grad = 0;
}
return x_grad;
}
private:
tensor& private_get_output() const
{
if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
return subnetwork.private_get_output();
else
return const_cast<resizable_tensor&>(cached_output);
}
tensor& private_get_gradient_input()
{
if (this_layer_operates_inplace())
{
return subnetwork.private_get_gradient_input();
}
else
{
if (gradient_input_is_stale)
{
gradient_input_is_stale = false;
x_grad.copy_size(private_get_output());
x_grad = 0;
}
return x_grad;
}
}
void disable_output_and_gradient_getters (
) { get_output_and_gradient_input_disabled = true; }
public:
const tensor& get_output() const
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
return private_get_output();
}
tensor& get_gradient_input()
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
return private_get_gradient_input();
}
template <typename solver_type>
......@@ -365,11 +610,13 @@ namespace dlib
{
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
params_grad.copy_size(details.get_layer_params());
details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
impl::call_layer_backward(details, private_get_output(),
private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
subnetwork.update(x, solvers.pop());
gradient_input_is_stale = true;
}
const subnet_type& subnet() const { return subnetwork; }
......@@ -396,6 +643,7 @@ namespace dlib
serialize(item.details, out);
serialize(item.this_layer_setup_called, out);
serialize(item.gradient_input_is_stale, out);
serialize(item.get_output_and_gradient_input_disabled, out);
serialize(item.x_grad, out);
serialize(item.cached_output, out);
}
......@@ -410,18 +658,34 @@ namespace dlib
deserialize(item.details, in);
deserialize(item.this_layer_setup_called, in);
deserialize(item.gradient_input_is_stale, in);
deserialize(item.get_output_and_gradient_input_disabled, in);
deserialize(item.x_grad, in);
deserialize(item.cached_output, in);
}
private:
bool this_layer_operates_inplace(
)
{
// This layer can run in-place if it's an in-place capable layer and also if
// the layer it's on top of doesn't need its own output tensor (since in-place
// layers overwrite that tensor)
return impl::is_inplace_layer(details, subnetwork) && !subnetwork.this_layer_requires_forward_output();
}
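// For example, the relu layer changed later in this diff provides forward_inplace() and
// backward_inplace(), and the fully-connected style layer changed later in this diff has a
// backward() that omits the computed_output parameter. Stacking the former directly on the
// latter therefore runs in-place and overwrites the lower layer's output tensor. If the
// lower layer's backward() did take computed_output, the upper layer would fall back to
// out-of-place operation.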
bool this_layer_requires_forward_output(
)
{
return impl::backward_requires_forward_output(details, subnetwork);
}
void swap(add_layer& item)
{
std::swap(subnetwork,item.subnetwork);
std::swap(details, item.details);
std::swap(this_layer_setup_called, item.this_layer_setup_called);
std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
std::swap(x_grad, item.x_grad);
std::swap(cached_output, item.cached_output);
}
......@@ -431,6 +695,10 @@ namespace dlib
LAYER_DETAILS details;
bool this_layer_setup_called;
bool gradient_input_is_stale;
bool get_output_and_gradient_input_disabled;
// Note that if this_layer_operates_inplace()==true then x_grad and cached_output
// are not used at all. Instead, this layer uses the corresponding tensors owned
// by the layer it sits on top of.
resizable_tensor x_grad;
resizable_tensor cached_output;
......@@ -461,7 +729,8 @@ namespace dlib
add_layer(
):
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(const add_layer&) = default;
......@@ -471,6 +740,8 @@ namespace dlib
template <typename T, typename U, typename E>
friend class add_layer;
template <typename T, bool is_first, typename E>
friend class dimpl::subnet_wrapper;
// Allow copying networks from one to another as long as their corresponding
// layers can be constructed from each other.
......@@ -482,6 +753,7 @@ namespace dlib
details(item.layer_details()),
this_layer_setup_called(item.this_layer_setup_called),
gradient_input_is_stale(item.gradient_input_is_stale),
get_output_and_gradient_input_disabled(false),
x_grad(item.x_grad),
cached_output(item.cached_output)
{
......@@ -492,7 +764,8 @@ namespace dlib
) :
details(layer_det),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
......@@ -500,7 +773,8 @@ namespace dlib
) :
details(std::move(layer_det)),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
......@@ -510,7 +784,8 @@ namespace dlib
details(std::move(layer_det)),
input_layer(std::move(il)),
this_layer_setup_called(false),
gradient_input_is_stale(true)
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
......@@ -577,33 +852,50 @@ namespace dlib
details.setup(wsub);
this_layer_setup_called = true;
}
details.forward(wsub, cached_output);
impl::call_layer_forward(details, wsub, cached_output);
gradient_input_is_stale = true;
return get_output();
return private_get_output();
}
const tensor& get_output() const { return cached_output; }
tensor& get_gradient_input()
private:
tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
tensor& private_get_gradient_input()
{
if (gradient_input_is_stale)
{
gradient_input_is_stale = false;
x_grad.copy_size(get_output());
x_grad.copy_size(private_get_output());
x_grad = 0;
}
return x_grad;
}
void disable_output_and_gradient_getters (
) { get_output_and_gradient_input_disabled = true; }
public:
const tensor& get_output() const
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
return private_get_output();
}
tensor& get_gradient_input()
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
return private_get_gradient_input();
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
{
subnet_wrapper wsub(x, grad_final_ignored);
params_grad.copy_size(details.get_layer_params());
details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
impl::call_layer_backward(details, private_get_output(),
private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
gradient_input_is_stale = true;
}
const subnet_type& subnet() const { return input_layer; }
......@@ -630,6 +922,7 @@ namespace dlib
serialize(item.details, out);
serialize(item.this_layer_setup_called, out);
serialize(item.gradient_input_is_stale, out);
serialize(item.get_output_and_gradient_input_disabled, out);
serialize(item.x_grad, out);
serialize(item.cached_output, out);
}
......@@ -644,12 +937,20 @@ namespace dlib
deserialize(item.details, in);
deserialize(item.this_layer_setup_called, in);
deserialize(item.gradient_input_is_stale, in);
deserialize(item.get_output_and_gradient_input_disabled, in);
deserialize(item.x_grad, in);
deserialize(item.cached_output, in);
}
private:
bool this_layer_requires_forward_output(
)
{
subnet_wrapper wsub(grad_final_ignored, grad_final_ignored);
return impl::backward_requires_forward_output(details, wsub);
}
class subnet_wrapper
{
public:
......@@ -685,6 +986,7 @@ namespace dlib
std::swap(details, item.details);
std::swap(this_layer_setup_called, item.this_layer_setup_called);
std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
std::swap(x_grad, item.x_grad);
std::swap(cached_output, item.cached_output);
}
......@@ -693,6 +995,7 @@ namespace dlib
LAYER_DETAILS details;
bool this_layer_setup_called;
bool gradient_input_is_stale;
bool get_output_and_gradient_input_disabled;
resizable_tensor x_grad;
resizable_tensor cached_output;
......@@ -1493,10 +1796,13 @@ namespace dlib
}
tensor& get_mutable_output() { return output; }
const tensor& get_output() const { return output; }
const tensor& private_get_output() const { return get_output(); }
const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }
tensor& get_gradient_input() { return gradient_input; }
tensor& private_get_gradient_input() { return get_gradient_input(); }
test_layer_subnet& subnet() { init_sub(); return *subnetwork; }
......@@ -1578,7 +1884,7 @@ namespace dlib
// (since we do a lazy layer creation thing based on calls to subnet() inside
// test_layer_subnet).
l.setup(subnetwork);
l.forward(subnetwork, output);
impl::call_layer_forward(l, subnetwork, output);
resizable_tensor input_grad;
input_grad.copy_size(output);
......@@ -1605,11 +1911,71 @@ namespace dlib
// comparing them to a central differences approximation.
resizable_tensor params_grad;
params_grad.copy_size(l.get_layer_params());
// Set the params grad to something crazy so that it's very obvious if it doesn't
// get fully assigned.
// But first, set the params grad to something crazy so that it's very obvious if
// it doesn't get fully assigned.
params_grad = std::numeric_limits<float>::infinity();
l.backward(output, input_grad, subnetwork, params_grad);
impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);
static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
"Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. ");
// Make sure the outputs of forward() and backward() are the same whether they are
// invoked in-place or out-of-place.
if (impl::is_inplace_layer(l, subnetwork))
{
test_layer_subnet subnetwork2(rnd);
layer_details_type ll(l);
ll.setup(subnetwork2);
resizable_tensor ip_out;
impl::call_layer_forward(ll, subnetwork2, ip_out);
impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
if (forward_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
return layer_test_results(sout.str());
}
resizable_tensor params_grad;
params_grad.copy_size(ll.get_layer_params());
params_grad = std::numeric_limits<float>::infinity();
resizable_tensor input_grad;
input_grad.copy_size(ip_out);
fill_with_gassuan_random_numbers(input_grad, rnd);
resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
params_grad1 = params_grad;
params_grad2 = params_grad;
// Now call backward() and make sure it works as well.
subnetwork2.get_gradient_input() = 9999;
impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
data_grad1 = subnetwork2.get_gradient_input();
subnetwork2.get_gradient_input() = mat(input_grad);
impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
data_grad2 = subnetwork2.get_gradient_input();
if (params_grad.size() != 0)
{
const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
if (backward_param_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
return layer_test_results(sout.str());
}
}
const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
if (backward_data_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
return layer_test_results(sout.str());
}
}
// ==================================================================
// first validate the way the parameter gradients are computed
......@@ -1622,9 +1988,10 @@ namespace dlib
eps = base_eps;
const float oldval = l1.get_layer_params().host()[i];
l1.get_layer_params().host()[i] = oldval+eps;
l1.forward(subnetwork, out2);
impl::call_layer_forward(l1, subnetwork, out2);
l1.get_layer_params().host()[i] = oldval-eps;
l1.forward(subnetwork, out3);
impl::call_layer_forward(l1, subnetwork, out3);
l1.get_layer_params().host()[i] = oldval;
// Compute a reference derivative via a central differences approximation and
// compare it to the one output by the layer and make sure they match.
......@@ -1635,8 +2002,8 @@ namespace dlib
{
using namespace std;
sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
return layer_test_results(sout.str());
}
......@@ -1651,21 +2018,24 @@ namespace dlib
if (eps == 0)
eps = base_eps;
subnetwork.get_output_element(i) = oldval+eps;
l.forward(subnetwork, out2);
impl::call_layer_forward(l, subnetwork, out2);
subnetwork.get_output_element(i) = oldval-eps;
l.forward(subnetwork, out3);
impl::call_layer_forward(l, subnetwork, out3);
subnetwork.get_output_element(i) = oldval;
// Compute a reference derivative via a central differences approximation and
// compare it to the one output by the layer and make sure they match.
double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
double output_derivative = subnetwork.get_gradient_input_element(i)-initial_gradient_input[i];
double output_derivative = subnetwork.get_gradient_input_element(i);
if (!impl::is_inplace_layer(l,subnetwork))
output_derivative -= initial_gradient_input[i];
double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
if (std::abs(relative_error) > 0.01)
{
using namespace std;
sout << "Gradient error in data variable #" << i <<". Relative error: "<< relative_error << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
return layer_test_results(sout.str());
}
}
......
......@@ -389,6 +389,7 @@ namespace dlib
ensures
- Back propagates the error gradient, get_gradient_input(), through this
network and uses the provided solvers to update the network parameters.
- All elements of #get_gradient_input() are set to 0.
!*/
void clean(
......
......@@ -36,7 +36,7 @@ namespace dlib
}
template <typename SUBNET>
void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// TODO
}
......@@ -89,7 +89,7 @@ namespace dlib
}
template <typename SUBNET>
void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// compute the gradient of the parameters.
params_grad = trans(mat(sub.get_output()))*mat(gradient_input);
......@@ -145,20 +145,22 @@ namespace dlib
{
}
template <typename SUBNET>
void forward(const SUBNET& sub, resizable_tensor& output)
void forward_inplace(const tensor& input, tensor& output)
{
output.copy_size(sub.get_output());
output = lowerbound(mat(sub.get_output()), 0);
output = lowerbound(mat(input), 0);
}
template <typename SUBNET>
void backward(const tensor&, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor& params_grad
)
{
const float* grad = gradient_input.host();
const float* in = sub.get_output().host();
float* out = sub.get_gradient_input().host();
for (unsigned long i = 0; i < sub.get_output().size(); ++i)
const float* in = computed_output.host();
float* out = data_grad.host();
for (unsigned long i = 0; i < computed_output.size(); ++i)
{
if (in[i] > 0)
out[i] = grad[i];
......
......@@ -91,12 +91,28 @@ namespace dlib
produces an output tensor. You create an entire deep network by composing
these functions. Importantly, you are able to use a wide range of
different functions to accommodate the task you are trying to accomplish.
Dlib includes a number of common layer types but if you want to define your
own then you simply implement a class with the same interface as
EXAMPLE_LAYER_.
Therefore, dlib includes a number of common layer types but if you want to
define your own then you simply implement a class with the same interface
as EXAMPLE_LAYER_.
Note that there is no dlib::EXAMPLE_LAYER_ type. It is shown here purely
to document the interface that a layer object must implement.
The central work of defining a layer is implementing the forward and backward
methods. When you do this you have three options:
- Implement the forward() and backward() methods according to the
specification shown below. Do not implement forward_inplace() and
backward_inplace().
- Implement the forward() and backward() methods according to the
specification shown below, except exclude the computed_output
parameter from backward(). Doing this will allow dlib to make some
layers execute in-place and therefore run a little faster and use
less memory. Do not implement forward_inplace() and
backward_inplace(). A short sketch of this option follows this comment block.
- Implement the forward_inplace() and backward_inplace() methods
according to the specification shown below. Do not implement
forward() and backward(). These in-place methods allow some types of
layers to be implemented more efficiently.
!*/
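// A brief sketch of the second option (illustrative only, not part of dlib): a hypothetical
// add_bias_ layer that learns one bias per element of an input sample. Its backward() omits
// the computed_output parameter, so dlib is free to let an in-place layer stacked on top of
// it overwrite this layer's output tensor. The layer name, the choice of one bias per
// element, and the CPU-only host() loops are assumptions made purely for illustration.
class add_bias_
{
public:
template <typename SUBNET>
void setup (const SUBNET& sub)
{
// one learnable bias per element of a single input sample, initialized to zero
params.set_size(sub.get_output().size()/sub.get_output().num_samples());
params = 0;
}

template <typename SUBNET>
void forward(const SUBNET& sub, resizable_tensor& data_output)
{
const tensor& in = sub.get_output();
data_output.copy_size(in);
const size_t stride = in.size()/in.num_samples();  // elements per sample
const float* x = in.host();
const float* b = params.host();
float* y = data_output.host();
for (size_t i = 0; i < in.size(); ++i)
y[i] = x[i] + b[i%stride];
}

template <typename SUBNET>
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
const size_t stride = gradient_input.size()/gradient_input.num_samples();
const float* g = gradient_input.host();
float* pg = params_grad.host();
float* dg = sub.get_gradient_input().host();
// PARAMETER_GRADIENT: accumulate the incoming gradient over the sample dimension.
for (size_t i = 0; i < params_grad.size(); ++i)
pg[i] = 0;
for (size_t i = 0; i < gradient_input.size(); ++i)
pg[i%stride] += g[i];
// DATA_GRADIENT: adding a bias is the identity w.r.t. the input, so the gradient
// passes through unchanged and is added to the sublayer's gradient input.
for (size_t i = 0; i < gradient_input.size(); ++i)
dg[i] += g[i];
}

const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }

private:
resizable_tensor params;
};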
public:
......@@ -152,7 +168,7 @@ namespace dlib
template <typename SUBNET>
void forward(
const SUBNET& sub,
resizable_tensor& output
resizable_tensor& data_output
);
/*!
requires
......@@ -160,14 +176,14 @@ namespace dlib
- setup() has been called.
ensures
- Runs the output of the subnetwork through this layer and stores the
output into #output. In particular, forward() can use any of the outputs
in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.) to
compute whatever it wants.
results into #data_output. In particular, forward() can use any of the
outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.)
to compute whatever it wants.
!*/
template <typename SUBNET>
void backward(
const tensor& computed_output,
const tensor& computed_output, // this parameter is optional
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
......@@ -189,7 +205,7 @@ namespace dlib
These gradients are stored into #sub and #params_grad, respectively. To be
precise, the gradients are taken of a function f(sub,get_layer_params())
which is defined thusly:
- Recalling that computed_output is a function of sub and get_layer_params()
- Recalling that computed_output is a function of both sub and get_layer_params(),
since it is the result of calling forward(sub,computed_output):
let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
Then we define the following gradient vectors:
......@@ -207,6 +223,59 @@ namespace dlib
- layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
!*/
void forward_inplace(
const tensor& data_input,
tensor& data_output
);
/*!
requires
- have_same_dimensions(data_input,data_output) == true
- setup() has been called.
ensures
- Runs the data_input tensor through this layer and stores the output into
#data_output.
- This function supports in-place operation, i.e. having
is_same_object(data_input, data_output)==true
!*/
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor& params_grad
);
/*!
requires
- setup() has been called.
- computed_output is the tensor resulting from the most recent call to
forward_inplace(). This means that backward_inplace() is allowed to
cache intermediate results computed during forward_inplace() and use them
for the backward computation.
- have_same_dimensions(gradient_input, data_grad) == true
- have_same_dimensions(gradient_input, computed_output) == true
- have_same_dimensions(params_grad, get_layer_params()) == true
ensures
- This function supports in-place operation, i.e. having
is_same_object(gradient_input, data_grad)==true
- This function outputs the gradients of this layer with respect to the
input data from a sublayer and also with respect to this layer's parameters.
These gradients are stored into #data_grad and #params_grad, respectively. To be
precise, the gradients are taken of a function f(data_input,get_layer_params())
which is defined thusly:
- Recalling that computed_output is a function of both the input to
forward_inplace() and get_layer_params(), since it is the result of
calling forward_inplace(data_input,computed_output):
let f(data_input,get_layer_params()) == dot(computed_output, gradient_input)
Then we define the following gradient vectors:
- PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with
respect to get_layer_params().
- DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect
to data_input.
Finally, backward_inplace() outputs these gradients by performing:
- params_grad = PARAMETER_GRADIENT
- data_grad = DATA_GRADIENT
!*/
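// As a tiny worked instance of the definitions above (illustrative, not an actual dlib
// layer): for a parameterless element-wise negation layer, forward_inplace() computes
// computed_output = -data_input, so f(data_input,get_layer_params()) = dot(-data_input, gradient_input).
// Hence PARAMETER_GRADIENT is empty and DATA_GRADIENT = -gradient_input, and
// backward_inplace() simply performs data_grad = -gradient_input, which remains correct
// even when is_same_object(gradient_input, data_grad)==true because the operation is
// element-wise.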
const tensor& get_layer_params(
) const;
/*!
......@@ -277,7 +346,7 @@ namespace dlib
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......@@ -313,8 +382,8 @@ namespace dlib
);
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......