Commit 93e786db authored by Fm

Merge branch 'master' of https://github.com/davisking/dlib into dnn_group_layer

parents 59892409 91163863
@@ -488,6 +488,13 @@ namespace dlib
// ----------------------------------------------------------------------------------------
struct general_ {};
struct special_ : general_ {};
template<typename> struct int_ { typedef int type; };
// ----------------------------------------------------------------------------------------
    /*!A is_same_object
        This is a templated function which checks if both of its arguments are actually
......
@@ -24,6 +24,38 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace impl
{
template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
double get_learning_rate_multiplier (
const T& obj,
special_
) { return obj.get_learning_rate_multiplier(); }
template <typename T>
double get_learning_rate_multiplier ( const T& obj, general_) { return 1; }
}
template <typename T>
double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); }
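    // Editorial note: the two overloads above implement the classic tag-dispatch
    // detection idiom.  The special_ overload is only viable when
    // decltype(&T::get_learning_rate_multiplier) is well formed, and it wins
    // overload resolution because special_ is more derived than general_; for
    // every other T, SFINAE discards it and the fallback returns 1.  A minimal
    // self-contained sketch of the same idiom (the two toy layer types are
    // hypothetical, for illustration only):

    #include <iostream>

    struct general_ {};
    struct special_ : general_ {};
    template<typename> struct int_ { typedef int type; };

    namespace impl
    {
        // Viable only when T declares get_learning_rate_multiplier(); preferred
        // during overload resolution because special_ is more derived.
        template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
        double get_learning_rate_multiplier (const T& obj, special_) { return obj.get_learning_rate_multiplier(); }

        // Fallback selected for every other T.
        template <typename T>
        double get_learning_rate_multiplier (const T&, general_) { return 1; }
    }
    template <typename T>
    double get_learning_rate_multiplier (const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); }

    struct my_layer    { double get_learning_rate_multiplier() const { return 0.5; } };  // hypothetical
    struct plain_layer {};                                                               // hypothetical

    int main()
    {
        std::cout << get_learning_rate_multiplier(my_layer())    << "\n";  // prints 0.5
        std::cout << get_learning_rate_multiplier(plain_layer()) << "\n";  // prints 1
    }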
// ----------------------------------------------------------------------------------------
namespace impl
{
template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0>
double get_weight_decay_multiplier (
const T& obj,
special_
) { return obj.get_weight_decay_multiplier(); }
template <typename T>
double get_weight_decay_multiplier ( const T& obj, general_) { return 1; }
}
template <typename T>
double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); }
// ----------------------------------------------------------------------------------------
    namespace impl
@@ -458,7 +490,7 @@ namespace dlib
        sstack pop(size_t num=1)
        {
-            DLIB_CASSERT(num < size(), "You can't pop more things from the stack than it has in it.");
+            DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it.");
            return sstack(data+num, mysize-num);
        }
@@ -849,8 +881,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
-            if (params_grad.size() != 0)
+            // Don't try to adjust the parameters if this layer doesn't have any or the
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1200,8 +1233,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
-            if (params_grad.size() != 0)
+            // Don't try to adjust the parameters if this layer doesn't have any or the
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1817,9 +1851,7 @@ namespace dlib
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
-        // This layer counts as a computational layer because it copies and stores the
-        // inputs.
-        const static size_t num_computational_layers = 1;
+        const static size_t num_computational_layers = 0;
        const static size_t num_layers = 2;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
......
@@ -67,6 +67,32 @@ namespace dlib
            (except computes it using a numerically accurate method)
    !*/
// ----------------------------------------------------------------------------------------
template <typename T>
double get_learning_rate_multiplier(
const T& obj
);
/*!
ensures
- if (obj has a get_learning_rate_multiplier() member function) then
- returns obj.get_learning_rate_multiplier()
- else
- returns 1
!*/
template <typename T>
double get_weight_decay_multiplier(
const T& obj
);
/*!
ensures
- if (obj has a get_weight_decay_multiplier() member function) then
- returns obj.get_weight_decay_multiplier()
- else
- returns 1
!*/
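    // Editorial note: a sketch of how trainer-side code might consume these
    // hooks.  The helper name and the base rate are illustrative, not part of
    // dlib's API; layers lacking the member contribute a factor of 1.

    template <typename LAYER_DETAILS>
    double effective_learning_rate (const LAYER_DETAILS& l, double base_rate)
    {
        return base_rate * dlib::get_learning_rate_multiplier(l);
    }
    // e.g. effective_learning_rate(some_layer_details, 0.01) == 0.01 * multiplier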
// ----------------------------------------------------------------------------------------
    bool dnn_prefer_fastest_algorithms(
@@ -152,7 +178,7 @@ namespace dlib
        );
        /*!
            requires
-                - num < size()
+                - num <= size()
            ensures
                - returns a reference to the sub-stack S such that:
                    - S.size() == size()-num.
......
@@ -385,6 +385,30 @@ namespace dlib
                d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
        }
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
DLIB_CASSERT(dest.size()==src1.size(),"");
DLIB_CASSERT(dest.size()==src2.size(),"");
DLIB_CASSERT(dest.size()==src3.size(),"");
DLIB_CASSERT(begin <= end && end <= dest.size(),"");
const auto d = dest.host();
const auto s1 = src1.host();
const auto s2 = src2.host();
const auto s3 = src3.host();
for (size_t i = begin; i < end; ++i)
d[i] = A*s1[i] + B*s2[i] + C*s3[i];
}
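        // Editorial note: a short usage sketch of the new CPU routine; the
        // sizes and coefficients below are made up for illustration.

        // Blend three same-sized tensors over elements [2, 5) only; elements
        // outside the range are left untouched.
        dlib::resizable_tensor dest, s1, s2, s3;
        dest.set_size(1,1,1,8);
        s1.copy_size(dest);  s2.copy_size(dest);  s3.copy_size(dest);
        s1 = 1;  s2 = 2;  s3 = 3;  dest = 0;
        dlib::cpu::affine_transform_range(2, 5, dest, s1, s2, s3, 0.5f, 0.25f, 0.125f);
        // now dest.host()[i] == 0.5*1 + 0.25*2 + 0.125*3 == 1.375 for i in [2,5), else 0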
// -----------------------------------------------------------------------------------
        void affine_transform(
@@ -464,6 +488,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
        void compute_adam_update (
            size_t begin,
            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -480,6 +506,7 @@
                        s.size() == v.size() &&
                        s.size() == params.size() &&
                        s.size() == params_grad.size(),"");
DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float eps = 1e-8;
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
@@ -492,7 +519,7 @@
            auto ps = s.host_write_only();
            auto pparams = params.host();
            auto ppgrad = params_grad.host();
-            for (size_t i = 0; i < params.size(); ++i)
+            for (size_t i = begin; i < end; ++i)
            {
                float g = weight_decay*pparams[i] + ppgrad[i];
                pm[i] = momentum1*pm[i] + (1-momentum1)*g;
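        // Editorial note: the loop implements the standard bias-corrected Adam
        // step.  A scalar sketch of one element's update, mirroring the math in
        // compute_adam_update (s[i] ends up holding -alpha*m/(sqrt(v)+eps)):

        #include <cmath>

        float adam_step (float& m, float& v, float param, float grad,
                         float learning_rate, float weight_decay,
                         float momentum1, float momentum2, float t)
        {
            const float eps = 1e-8;
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))
                                             /(1-std::pow(momentum1,t));
            const float g = weight_decay*param + grad;   // L2 penalty folded into the gradient
            m = momentum1*m + (1-momentum1)*g;           // first moment estimate
            v = momentum2*v + (1-momentum2)*g*g;         // second moment estimate
            return -alpha*m/(std::sqrt(v) + eps);        // the step added to the parameter
        }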
@@ -504,6 +531,7 @@
// -----------------------------------------------------------------------------------
        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -519,7 +547,8 @@
                gamma.k() == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances),
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() << "\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() << "\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() << "\ngamma.nr(): " << gamma.nr() <<
...@@ -538,7 +567,8 @@ namespace dlib ...@@ -538,7 +567,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() << "\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() << "\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() << "\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() "\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
); );
dest.copy_size(src); dest.copy_size(src);
@@ -554,7 +584,7 @@
            {
                for (long k = 0; k < num; ++k)
                {
-                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
+                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
                    ++d;
                    ++s;
                }
@@ -562,6 +592,7 @@
        }

        void batch_normalize (
const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -582,7 +613,8 @@
                beta.num_samples() == 1 &&
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k() == beta.k() && beta.k() == src.k(),
+                gamma.k() == beta.k() && beta.k() == src.k() &&
+                eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() << "\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() << "\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() << "\ngamma.nr(): " << gamma.nr() <<
...@@ -593,7 +625,8 @@ namespace dlib ...@@ -593,7 +625,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() << "\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() << "\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() << "\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() "\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
); );
dest.copy_size(src); dest.copy_size(src);
@@ -635,7 +668,7 @@
                else
                    rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
-                p_invstds[i] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -662,6 +695,7 @@
        }

        void batch_normalize_gradient (
const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -682,6 +716,7 @@
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            DLIB_CASSERT(eps > 0,"");
            beta_grad = 0;
            gamma_grad = 0;
@@ -757,6 +792,7 @@
// ----------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -772,7 +808,8 @@
                gamma.k() == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances),
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() << "\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() << "\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() << "\ngamma.nr(): " << gamma.nr() <<
...@@ -791,7 +828,8 @@ namespace dlib ...@@ -791,7 +828,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() << "\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() << "\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() << "\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() "\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
); );
dest.copy_size(src); dest.copy_size(src);
@@ -807,7 +845,7 @@
            {
                for (long k = 0; k < src.k(); ++k)
                {
-                    const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
+                    const float invstd = 1.0f/std::sqrt(v[k] + eps);
                    for (long j = 0; j < num; ++j)
                    {
                        *d = g[k]*(*s - m[k])*invstd + b[k];
@@ -819,6 +857,7 @@
        }

        void batch_normalize_conv (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -841,7 +880,8 @@
                beta.nr() == 1 &&
                gamma.nc() == 1 &&
                beta.nc() == 1 &&
-                gamma.k() == beta.k() && beta.k() == src.k(),
+                gamma.k() == beta.k() && beta.k() == src.k() &&
+                eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() << "\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() << "\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() << "\ngamma.nr(): " << gamma.nr() <<
...@@ -852,7 +892,8 @@ namespace dlib ...@@ -852,7 +892,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() << "\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() << "\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() << "\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() "\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
); );
dest.copy_size(src); dest.copy_size(src);
@@ -900,7 +941,7 @@
                else
                    rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
-                p_invstds[k] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -928,6 +969,7 @@
        }

        void batch_normalize_conv_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -948,6 +990,7 @@
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            DLIB_CASSERT(eps > 0,"");
            beta_grad = 0;
            gamma_grad = 0;
......
@@ -81,6 +81,18 @@ namespace dlib
            const float D
        );
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// -----------------------------------------------------------------------------------
        void affine_transform(
@@ -102,6 +114,8 @@
// -----------------------------------------------------------------------------------
        void compute_adam_update (
            size_t begin,
            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -117,6 +131,7 @@
// -----------------------------------------------------------------------------------
        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -126,6 +141,7 @@
        );

        void batch_normalize (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -138,6 +154,7 @@
        );

        void batch_normalize_gradient (
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -149,6 +166,7 @@
        );

        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -158,6 +176,7 @@
        );

        void batch_normalize_conv (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -170,6 +189,7 @@
        );

        void batch_normalize_conv_gradient (
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
......
@@ -504,6 +504,40 @@ namespace dlib
                                    src2.device(), src3.device(), dest.size(), A, B, C, D);
        }
// ----------------------------------------------------------------------------------------
__global__ void _cuda_affine_transform_range(
float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
)
{
for (auto i : grid_stride_range(begin, end))
{
d[i] = A*s1[i] + B*s2[i] + C*s3[i];
}
}
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
DLIB_CASSERT(dest.size()==src1.size(),"");
DLIB_CASSERT(dest.size()==src2.size(),"");
DLIB_CASSERT(dest.size()==src3.size(),"");
DLIB_CASSERT(begin <= end && end <= dest.size(),"");
launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),
dest.device(), src1.device(),
src2.device(), src3.device(), begin, end, A, B, C);
}
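        // Editorial note: grid_stride_range is dlib's wrapper around the
        // canonical CUDA grid-stride loop, which lets any launch configuration
        // cover the whole [begin, end) interval.  A raw-CUDA equivalent of the
        // kernel above, for illustration only (the kernel name is hypothetical):

        __global__ void affine_transform_range_raw (
            float* d, const float* s1, const float* s2, const float* s3,
            size_t begin, size_t end, float A, float B, float C
        )
        {
            // Each thread starts at its global index and advances by the total
            // number of launched threads, so the grid size need not match end-begin.
            for (size_t i = begin + blockIdx.x*blockDim.x + threadIdx.x;
                 i < end;
                 i += (size_t)gridDim.x*blockDim.x)
            {
                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
            }
        }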
// -----------------------------------------------------------------------------------
        __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
@@ -549,7 +583,8 @@
// ----------------------------------------------------------------------------------------
        __global__ void _cuda_compute_adam_update(
-            size_t n,
+            size_t begin,
+            size_t end,
            float* s,
            float* m,
            float* v,
@@ -566,7 +601,7 @@
            // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
            // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
            // s = -alpha*m/(sqrt(v) + eps);
-            for (auto i : grid_stride_range(0, n))
+            for (auto i : grid_stride_range(begin, end))
            {
                float g = (weight_decay*params[i] + params_grad[i]);
                m[i] = momentum1*m[i] + (1-momentum1)*g;
@@ -576,6 +611,8 @@
        }

        void compute_adam_update (
            size_t begin,
            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -592,10 +629,11 @@
                        s.size() == v.size() &&
                        s.size() == params.size() &&
                        s.size() == params_grad.size(),"");
            DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
-            launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
-                    s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
+            launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
+                    begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
                    momentum1, momentum2, params.device(), params_grad.device());
        }
......
@@ -164,6 +164,18 @@ namespace dlib
            const float D
        );

        void affine_transform_range(
            size_t begin,
            size_t end,
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const tensor& src3,
            const float A,
            const float B,
            const float C
        );

        // Note that this function isn't in the tt:: namespace because add_scaled() is
        // called by cuda::add() so we don't need a tt:: version of add_scaled().
        void add_scaled(
@@ -193,6 +205,8 @@
// ----------------------------------------------------------------------------------------
        void compute_adam_update (
            size_t begin,
            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
......
@@ -338,6 +338,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -353,7 +354,8 @@
                gamma.k() == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances),
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
@@ -372,7 +374,8 @@
                "\nrunning_variances.nc(): " << running_variances.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
-                "\nsrc.nc(): " << src.nc()
+                "\nsrc.nc(): " << src.nc() <<
+                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -393,10 +396,11 @@
                                    beta.device(),
                                    running_means.device(),
                                    running_variances.device(),
-                                    dlib::tt::BATCH_NORM_EPS));
+                                    eps));
        }

        void batch_normalize (
const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -417,7 +421,8 @@
                beta.num_samples() == 1 &&
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k() == beta.k() && beta.k() == src.k(),
+                gamma.k() == beta.k() && beta.k() == src.k() &&
+                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
@@ -428,7 +433,8 @@
                "\nbeta.nc(): " << beta.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
-                "\nsrc.nc(): " << src.nc()
+                "\nsrc.nc(): " << src.nc() <<
+                "\neps: " << eps
            );
            const float in_scale = 1;
@@ -455,12 +461,13 @@
                                    averaging_factor,
                                    running_means.device(),
                                    running_variances.device(),
-                                    dlib::tt::BATCH_NORM_EPS,
+                                    eps,
                                    means.device(),
                                    invstds.device()));
        }
        void batch_normalize_gradient(
const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -480,6 +487,7 @@
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            DLIB_CASSERT(eps > 0,"");
            const float in_scale = 1;
            const float out_scale = 1;
@@ -503,7 +511,7 @@
                                    gamma.device(),
                                    gamma_grad.device(),
                                    beta_grad.device(),
-                                    dlib::tt::BATCH_NORM_EPS,
+                                    eps,
                                    means.device(),
                                    invstds.device()));
        }
@@ -511,6 +519,7 @@
// ------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -526,7 +535,8 @@
                gamma.k() == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances),
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
@@ -545,7 +555,8 @@
                "\nrunning_variances.nc(): " << running_variances.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
-                "\nsrc.nc(): " << src.nc()
+                "\nsrc.nc(): " << src.nc() <<
+                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -566,10 +577,11 @@
                                    beta.device(),
                                    running_means.device(),
                                    running_variances.device(),
-                                    dlib::tt::BATCH_NORM_EPS));
+                                    eps));
        }

        void batch_normalize_conv (
const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -592,7 +604,8 @@
                beta.nr() == 1 &&
                gamma.nc() == 1 &&
                beta.nc() == 1 &&
-                gamma.k() == beta.k() && beta.k() == src.k(),
+                gamma.k() == beta.k() && beta.k() == src.k() &&
+                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
@@ -603,7 +616,8 @@
                "\nbeta.nc(): " << beta.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
-                "\nsrc.nc(): " << src.nc()
+                "\nsrc.nc(): " << src.nc() <<
+                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -629,12 +643,13 @@
                                    averaging_factor,
                                    running_means.device(),
                                    running_variances.device(),
-                                    dlib::tt::BATCH_NORM_EPS,
+                                    eps,
                                    means.device(),
                                    invstds.device()));
        }
        void batch_normalize_conv_gradient(
const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -653,6 +668,7 @@
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            DLIB_CASSERT(eps > 0,"");
            const float in_scale = 1;
            const float out_scale = 1;
@@ -676,7 +692,7 @@
                                    gamma.device(),
                                    gamma_grad.device(),
                                    beta_grad.device(),
-                                    dlib::tt::BATCH_NORM_EPS,
+                                    eps,
                                    means.device(),
                                    invstds.device()));
        }
......
@@ -135,6 +135,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -144,6 +145,7 @@
        );

        void batch_normalize (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -156,6 +158,7 @@
        );

        void batch_normalize_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -169,6 +172,7 @@
// ------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
@@ -178,6 +182,7 @@
        );

        void batch_normalize_conv (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -190,6 +195,7 @@
        );

        void batch_normalize_conv_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
......
@@ -42,6 +42,10 @@ namespace dlib
        con_(
        ) :
            learning_rate_multiplier(1),
            weight_decay_multiplier(1),
            bias_learning_rate_multiplier(1),
            bias_weight_decay_multiplier(0),
            padding_y_(_padding_y),
            padding_x_(_padding_x)
        {}
@@ -54,12 +58,27 @@
        long padding_y() const { return padding_y_; }
        long padding_x() const { return padding_x_; }
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
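        // Editorial note: a hedged usage sketch of the new setters; the toy
        // network and the layer index below are illustrative.

        // Hypothetical network: a single conv layer followed by a classifier.
        using net_type = loss_multiclass_log<fc<10, relu<con<16,5,5,1,1,
                             input<matrix<unsigned char>>>>>>;
        net_type net;
        // layer<2>(net) is the con_ layer in this particular network.
        layer<2>(net).layer_details().set_learning_rate_multiplier(0);       // freeze the filters
        layer<2>(net).layer_details().set_bias_learning_rate_multiplier(2);  // biases at 2x the base rate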
        con_ (
            const con_& item
        ) :
            params(item.params),
            filters(item.filters),
            biases(item.biases),
            learning_rate_multiplier(item.learning_rate_multiplier),
            weight_decay_multiplier(item.weight_decay_multiplier),
            bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
            bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
            padding_y_(item.padding_y_),
            padding_x_(item.padding_x_)
        {
@@ -81,6 +100,10 @@
            biases = item.biases;
            padding_y_ = item.padding_y_;
            padding_x_ = item.padding_x_;
            learning_rate_multiplier = item.learning_rate_multiplier;
            weight_decay_multiplier = item.weight_decay_multiplier;
            bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
            bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
            return *this;
        }
@@ -121,18 +144,22 @@
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
        {
            conv.get_gradient_for_data (gradient_input, filters(params,0), sub.get_gradient_input());
            // no point computing the parameter gradients if they won't be used.
            if (learning_rate_multiplier != 0)
            {
                auto filt = filters(params_grad,0);
                conv.get_gradient_for_filters (gradient_input, sub.get_output(), filt);
                auto b = biases(params_grad, filters.size());
                tt::assign_conv_bias_gradient(b, gradient_input);
            }
        }

        const tensor& get_layer_params() const { return params; }
        tensor& get_layer_params() { return params; }

        friend void serialize(const con_& item, std::ostream& out)
        {
-            serialize("con_2", out);
+            serialize("con_3", out);
            serialize(item.params, out);
            serialize(_num_filters, out);
            serialize(_nr, out);
@@ -143,6 +170,10 @@
            serialize(item.padding_x_, out);
            serialize(item.filters, out);
            serialize(item.biases, out);
            serialize(item.learning_rate_multiplier, out);
            serialize(item.weight_decay_multiplier, out);
            serialize(item.bias_learning_rate_multiplier, out);
            serialize(item.bias_weight_decay_multiplier, out);
        }

        friend void deserialize(con_& item, std::istream& in)
@@ -167,7 +198,7 @@
                item.padding_y_ = nr/2;
                item.padding_x_ = nc/2;
            }
-            else if (version == "con_2")
+            else if (version == "con_2" || version == "con_3")
            {
                deserialize(item.params, in);
                deserialize(num_filters, in);
@@ -180,6 +211,23 @@
                deserialize(item.filters, in);
                deserialize(item.biases, in);
if (version == "con_3")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.bias_learning_rate_multiplier = 1;
item.bias_weight_decay_multiplier = 1;
}
            if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_");
            if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_");
        }
@@ -207,6 +255,10 @@
               << ", padding_y="<<item.padding_y_
               << ", padding_x="<<item.padding_x_
               << ")";
            out << " learning_rate_mult="<<item.learning_rate_multiplier;
            out << " weight_decay_mult="<<item.weight_decay_multiplier;
            out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
            out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
            return out;
        }
@@ -217,6 +269,10 @@
        alias_tensor filters, biases;
        tt::tensor_conv conv;

        double learning_rate_multiplier;
        double weight_decay_multiplier;
        double bias_learning_rate_multiplier;
        double bias_weight_decay_multiplier;

        // These are here only because older versions of con (which you might encounter
        // serialized to disk) used different padding settings.
@@ -594,20 +650,43 @@
        FC_MODE = 1
    };

    const double DEFAULT_BATCH_NORM_EPS = 0.00001;

    template <
        layer_mode mode
        >
    class bn_
    {
    public:
-        bn_() : num_updates(0), running_stats_window_size(1000)
-        {}
-        explicit bn_(unsigned long window_size) : num_updates(0), running_stats_window_size(window_size)
-        {}
+        explicit bn_(
+            unsigned long window_size,
+            double eps_ = DEFAULT_BATCH_NORM_EPS
+        ) :
+            num_updates(0),
+            running_stats_window_size(window_size),
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(0),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(1),
+            eps(eps_)
+        {}
+        bn_() : bn_(1000) {}
        layer_mode get_mode() const { return mode; }
        unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
double get_eps() const { return eps; }
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
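        // Editorial note: a small construction sketch; the window size and
        // epsilon below are illustrative.

        // Hypothetical: conv-mode batch norm with a 500-sample stats window and
        // a larger epsilon than DEFAULT_BATCH_NORM_EPS.
        bn_<CONV_MODE> b(500, 1e-4);
        b.set_weight_decay_multiplier(0);   // e.g. exclude gamma from weight decay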
        template <typename SUBNET>
        void setup (const SUBNET& sub)
@@ -648,16 +727,16 @@
                if (num_updates <running_stats_window_size)
                    ++num_updates;
                if (mode == FC_MODE)
-                    tt::batch_normalize(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+                    tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
                else
-                    tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+                    tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
            }
            else // we are running in testing mode so we just linearly scale the input tensor.
            {
                if (mode == FC_MODE)
-                    tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_variances);
+                    tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
                else
-                    tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_variances);
+                    tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
            }
        }
@@ -668,9 +747,9 @@
            auto g_grad = gamma(params_grad, 0);
            auto b_grad = beta(params_grad, gamma.size());
            if (mode == FC_MODE)
-                tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+                tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
            else
-                tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+                tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
        }

        const tensor& get_layer_params() const { return params; }
@@ -679,9 +758,9 @@
        friend void serialize(const bn_& item, std::ostream& out)
        {
            if (mode == CONV_MODE)
-                serialize("bn_con", out);
+                serialize("bn_con2", out);
            else // if FC_MODE
-                serialize("bn_fc", out);
+                serialize("bn_fc2", out);
            serialize(item.params, out);
            serialize(item.gamma, out);
            serialize(item.beta, out);
@@ -691,6 +770,11 @@
            serialize(item.running_variances, out);
            serialize(item.num_updates, out);
            serialize(item.running_stats_window_size, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.bias_learning_rate_multiplier, out);
serialize(item.bias_weight_decay_multiplier, out);
serialize(item.eps, out);
        }

        friend void deserialize(bn_& item, std::istream& in)
@@ -701,12 +785,12 @@
            {
                if (mode == CONV_MODE)
                {
-                    if (version != "bn_con")
+                    if (version != "bn_con" && version != "bn_con2")
                        throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
                }
                else // must be in FC_MODE
                {
-                    if (version != "bn_fc")
+                    if (version != "bn_fc" && version != "bn_fc2")
                        throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
                }
            }
@@ -731,16 +815,38 @@
                // We also need to flip the running_variances around since the previous
                // format saved the inverse standard deviations instead of variances.
-                item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
+                item.running_variances = 1.0f/squared(mat(item.running_variances)) - DEFAULT_BATCH_NORM_EPS;
}
else if (version == "bn_con2" || version == "bn_fc2")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
deserialize(item.eps, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.eps = DEFAULT_BATCH_NORM_EPS;
} }
        }

        friend std::ostream& operator<<(std::ostream& out, const bn_& item)
        {
            if (mode == CONV_MODE)
-                out << "bn_con";
+                out << "bn_con ";
            else
-                out << "bn_fc";
+                out << "bn_fc ";
out << " eps="<<item.eps;
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
            return out;
        }
@@ -754,6 +860,11 @@
        resizable_tensor invstds, running_variances;
        unsigned long num_updates;
        unsigned long running_stats_window_size;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
double bias_weight_decay_multiplier;
double eps;
    };

    template <typename SUBNET>
@@ -784,11 +895,24 @@
        static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0");
    public:
-        fc_() : num_outputs(num_outputs_), num_inputs(0)
-        {
-        }
-        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0) {}
+        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0),
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(1),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(0)
+        {}
+        fc_() : fc_(num_fc_outputs(num_outputs_)) {}
+        double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
        unsigned long get_num_outputs (
        ) const { return num_outputs; }
@@ -834,6 +958,9 @@
        template <typename SUBNET>
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// no point computing the parameter gradients if they won't be used.
if (learning_rate_multiplier != 0)
            {
                // compute the gradient of the weight parameters.
                auto pw = weights(params_grad, 0);
@@ -845,6 +972,7 @@
                auto pb = biases(params_grad, weights.size());
                tt::assign_bias_gradient(pb, gradient_input);
            }
}
            // compute the gradient for the data
            auto w = weights(params, 0);
@@ -856,20 +984,24 @@
        friend void serialize(const fc_& item, std::ostream& out)
        {
serialize("fc_", out); serialize("fc_2", out);
serialize(item.num_outputs, out); serialize(item.num_outputs, out);
serialize(item.num_inputs, out); serialize(item.num_inputs, out);
serialize(item.params, out); serialize(item.params, out);
serialize(item.weights, out); serialize(item.weights, out);
serialize(item.biases, out); serialize(item.biases, out);
serialize((int)bias_mode, out); serialize((int)bias_mode, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.bias_learning_rate_multiplier, out);
serialize(item.bias_weight_decay_multiplier, out);
        }

        friend void deserialize(fc_& item, std::istream& in)
        {
            std::string version;
            deserialize(version, in);
-            if (version != "fc_")
+            if (version != "fc_" && version != "fc_2")
                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_.");
            deserialize(item.num_outputs, in);
@@ -880,6 +1012,22 @@
            int bmode = 0;
            deserialize(bmode, in);
            if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_");
if (version == "fc_2")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.bias_learning_rate_multiplier = 1;
item.bias_weight_decay_multiplier = 1;
}
        }

        friend std::ostream& operator<<(std::ostream& out, const fc_& item)
@@ -889,12 +1037,18 @@
                out << "fc\t ("
                    << "num_outputs="<<item.num_outputs
                    << ")";
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
}
else
{
out << "fc_no_bias ("
<< "num_outputs="<<item.num_outputs
<< ")";
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
}
return out;
}
...@@ -905,6 +1059,10 @@ namespace dlib
unsigned long num_inputs;
resizable_tensor params;
alias_tensor weights, biases;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
double bias_weight_decay_multiplier;
};
template <
...@@ -1143,7 +1301,7 @@ namespace dlib
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps()));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
...@@ -1223,7 +1381,7 @@ namespace dlib
{
std::string version;
deserialize(version, in);
if (version == "bn_con" || version == "bn_con2")
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
...@@ -1233,7 +1391,7 @@ namespace dlib
item = temp;
return;
}
else if (version == "bn_fc" || version == "bn_fc2")
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
...@@ -1289,8 +1447,13 @@ namespace dlib
template <typename SUBNET>
void forward(const SUBNET& sub, resizable_tensor& output)
{
auto&& t1 = sub.get_output();
auto&& t2 = layer<tag>(sub).get_output();
output.set_size(std::max(t1.num_samples(),t2.num_samples()),
std::max(t1.k(),t2.k()),
std::max(t1.nr(),t2.nr()),
std::max(t1.nc(),t2.nc()));
tt::add(output, t1, t2);
}
template <typename SUBNET>
...
...@@ -123,6 +123,16 @@ namespace dlib
allow dlib to make some layers execute in-place and therefore run a
little faster and use less memory. Do not implement forward() and
backward().
It should also be noted that layers may define additional layer specific
fields and the solvers can use these fields as they see fit. For example,
some layers define get_learning_rate_multiplier() and
get_weight_decay_multiplier() methods. The solvers that come with dlib
look at these methods, if they exist, and adjust the learning rate or
weight decay for that layer according to the multiplier. Therefore, you
can add these methods to your layer types if you want, or even define new
fields and new solvers that use those fields in some way.
!*/
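To make the note above concrete, here is a minimal sketch of a layer fragment that defines these fields. This is hypothetical illustration code, not part of this patch; the solvers discover the methods only if they exist, and layers that omit them behave as if both multipliers were 1.

    class example_layer_
    {
    public:
        example_layer_() : lr_mult(1), wd_mult(1) {}

        // The solvers that come with dlib look for these methods and scale
        // the nominal learning rate and weight decay by them.
        double get_learning_rate_multiplier() const { return lr_mult; }
        double get_weight_decay_multiplier () const { return wd_mult; }
        void set_learning_rate_multiplier(double val) { lr_mult = val; }
        void set_weight_decay_multiplier (double val) { wd_mult = val; }

        // ... the usual setup()/forward()/backward()/get_layer_params() ...

    private:
        double lr_mult;
        double wd_mult;
    };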
public:
...@@ -367,6 +377,10 @@ namespace dlib
ensures
- #get_num_outputs() == num_outputs
- #get_bias_mode() == bias_mode
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
unsigned long get_num_outputs (
...@@ -389,6 +403,82 @@ namespace dlib
is added to each of the outputs of this layer.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
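As a usage sketch (the layer size and multiplier values are invented for illustration), the setters above compose like this:

    fc_<10,FC_HAS_BIAS> l;
    l.set_learning_rate_multiplier(0.1);     // weights train 10x slower
    l.set_bias_learning_rate_multiplier(2);  // biases use 0.1*2 = 0.2x
    l.set_bias_weight_decay_multiplier(0);   // no weight decay on biases
    // Per the contracts above, a solver with nominal rate lr steps the
    // weights with lr*0.1 and the biases with lr*0.1*2.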
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
...@@ -458,6 +548,10 @@ namespace dlib
- #stride_x() == _stride_x
- #padding_y() == _padding_y
- #padding_x() == _padding_x
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
long num_filters(
...@@ -517,6 +611,82 @@ namespace dlib
sides of the image.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
...@@ -648,6 +818,8 @@ namespace dlib
FC_MODE = 1 // fully connected mode
};
const double DEFAULT_BATCH_NORM_EPS = 0.00001;
template <
layer_mode mode
>
...@@ -684,16 +856,29 @@ namespace dlib
/*!
ensures
- #get_mode() == mode
- #get_running_stats_window_size() == 1000
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == tt::DEFAULT_BATCH_NORM_EPS
!*/
explicit bn_(
unsigned long window_size,
double eps = tt::DEFAULT_BATCH_NORM_EPS
);
/*!
requires
- eps > 0
ensures
- #get_mode() == mode
- #get_running_stats_window_size() == window_size
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == eps
!*/
layer_mode get_mode(
...@@ -712,6 +897,15 @@ namespace dlib
normalization after a convolutional layer you should use CONV_MODE.
!*/
double get_eps(
) const;
/*!
ensures
- When doing batch normalization, we divide by the standard deviation.
The epsilon value returned by this function is added to the variance
to prevent that division from being a division by zero.
!*/
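For example, a brief sketch (window size and epsilon invented for illustration):

    bn_<CONV_MODE> b(2000, 1e-4);  // 2000 sample window, custom epsilon
    double e = b.get_eps();                               // == 1e-4
    unsigned long ws = b.get_running_stats_window_size(); // == 2000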
unsigned long get_running_stats_window_size (
) const;
/*!
...@@ -725,6 +919,82 @@ namespace dlib
the running average.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
...@@ -1330,7 +1600,13 @@ namespace dlib
what layer to add to the output of the previous layer. The result of this
addition is output by add_prev_. Finally, the addition happens pointwise
according to 4D tensor arithmetic. If the dimensions don't match then
missing elements are presumed to be equal to 0. Moreover, each dimension
of the output tensor is equal to the maximum dimension of either of the
inputs. That is, if the tensors A and B are being added to produce C then:
- C.num_samples() == max(A.num_samples(), B.num_samples())
- C.k() == max(A.k(), B.k())
- C.nr() == max(A.nr(), B.nr())
- C.nc() == max(A.nc(), B.nc())
!*/
public:
...
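A concrete sketch of the sizing rule above, with shapes invented for illustration: if the two inputs are 2x3x4x5 and 2x3x4x1, the output is sized to the elementwise max of the dimensions, and the elements the smaller tensor lacks are treated as 0 during the add, mirroring the forward() shown earlier:

    resizable_tensor t1(2,3,4,5), t2(2,3,4,1), output;
    output.set_size(std::max(t1.num_samples(),t2.num_samples()),
                    std::max(t1.k(),t2.k()),
                    std::max(t1.nr(),t2.nr()),
                    std::max(t1.nc(),t2.nc()));   // output is 2x3x4x5
    tt::add(output, t1, t2);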
...@@ -6,6 +6,7 @@
#include "solvers_abstract.h"
#include "tensor.h"
#include <iostream>
#include "layers.h"
namespace dlib
{
...@@ -49,10 +50,53 @@ namespace dlib
v = 0;
}
const double lr = learning_rate*get_learning_rate_multiplier(l);
const double wd = weight_decay*get_weight_decay_multiplier(l);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
return v;
}
template <unsigned long N>
const tensor& operator() (
const float learning_rate,
const fc_<N,FC_HAS_BIAS>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
return v;
}
template <
long _num_filters,
long _nr,
long _nc,
int _stride_y,
int _stride_x,
int _padding_y,
int _padding_x
>
const tensor& operator() (
const float learning_rate,
const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
return v;
}
template < layer_mode mode >
const tensor& operator() (
const float learning_rate,
const bn_<mode>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
return v;
}
...@@ -76,9 +120,49 @@ namespace dlib
}
private:
template <typename layer_type>
void update_considering_bias(
const float learning_rate,
const layer_type& l,
const tensor& params_grad,
unsigned long bias_offset
)
{
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
v.copy_size(params_grad);
v = 0;
}
double lr = learning_rate*get_learning_rate_multiplier(l);
double wd = weight_decay*get_weight_decay_multiplier(l);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
{
tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
}
else
{
tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
// now update the biases but apply their multipliers
lr *= l.get_bias_learning_rate_multiplier();
wd *= l.get_bias_weight_decay_multiplier();
tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
}
}
resizable_tensor v;
float weight_decay;
float momentum;
};
// ----------------------------------------------------------------------------------------
...@@ -132,11 +216,57 @@ namespace dlib
++t;
tt::compute_adam_update(0, params.size(), s, m, v, t,
learning_rate*get_learning_rate_multiplier(l),
weight_decay*get_weight_decay_multiplier(l),
momentum1, momentum2, params, params_grad);
return s;
}
template <unsigned long N>
const tensor& operator() (
const float learning_rate,
const fc_<N,FC_HAS_BIAS>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
return s;
}
template <
long _num_filters,
long _nr,
long _nc,
int _stride_y,
int _stride_x,
int _padding_y,
int _padding_x
>
const tensor& operator() (
const float learning_rate,
const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
return s;
}
template < layer_mode mode >
const tensor& operator() (
const float learning_rate,
const bn_<mode>& l,
const tensor& params_grad
)
{
update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
return s;
}
friend void serialize(const adam& item, std::ostream& out)
{
serialize("adam2", out);
...@@ -165,6 +295,49 @@ namespace dlib
}
private:
template <typename layer_type>
void update_considering_bias(
const float learning_rate,
const layer_type& l,
const tensor& params_grad,
unsigned long bias_offset
)
{
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
m.copy_size(params_grad);
m = 0;
v.copy_size(params_grad);
v = 0;
s.copy_size(params_grad);
}
++t;
if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
{
tt::compute_adam_update(0, params.size(), s, m, v, t,
learning_rate*get_learning_rate_multiplier(l),
weight_decay*get_weight_decay_multiplier(l),
momentum1, momentum2, params, params_grad);
}
else
{
tt::compute_adam_update(0, bias_offset, s, m, v, t,
learning_rate*get_learning_rate_multiplier(l),
weight_decay*get_weight_decay_multiplier(l),
momentum1, momentum2, params, params_grad);
tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(),
momentum1, momentum2, params, params_grad);
}
}
resizable_tensor m;
resizable_tensor v;
resizable_tensor s;
...
...@@ -78,6 +78,15 @@ namespace dlib
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate multipliers to be applied
to fc_ and con_ bias parameters.
!*/
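A numeric walk-through of that rule (all numbers invented): with a nominal learning rate of 0.1 and weight decay of 0.0005, a layer l where get_learning_rate_multiplier(l)==0.1 and get_weight_decay_multiplier(l)==1 is stepped as:

    const double lr = 0.1*0.1;    // effective learning rate, == 0.01
    const double wd = 0.0005*1;   // effective weight decay,  == 0.0005
    // V = momentum*V - wd*lr*l.get_layer_params() - lr*params_grad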
public:
...@@ -123,6 +132,15 @@ namespace dlib
paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate multipliers to be applied
to fc_ and con_ bias parameters.
!*/
public:
...
...@@ -240,6 +240,42 @@ namespace dlib { namespace tt
#endif
}
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
#ifdef DLIB_USE_CUDA
cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#else
cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#endif
}
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
#ifdef DLIB_USE_CUDA
cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
#else
cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
#endif
}
// ----------------------------------------------------------------------------------------
void affine_transform(
...@@ -275,6 +311,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
...@@ -288,10 +326,10 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#else
cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#endif
}
...@@ -299,6 +337,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
...@@ -308,13 +347,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
...@@ -327,13 +367,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
...@@ -346,15 +387,16 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
...@@ -364,13 +406,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
...@@ -383,13 +426,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
...@@ -402,9 +446,9 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
...
...@@ -229,13 +229,58 @@ namespace dlib { namespace tt
const float D
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3 + D
!*/
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3
!*/
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- begin <= end <= dest.size()
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the half open range [begin,end) rather than processing the entire tensor.
Specifically, it does this:
- for i in the range [begin, end):
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
!*/
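For exposition, a plain host-side loop that satisfies the contract above would look like the following sketch (illustrative only; the real implementations are the cpu::/cuda:: routines this function dispatches to):

    void affine_transform_range_reference(
        size_t begin, size_t end, tensor& dest,
        const tensor& src1, const tensor& src2, const tensor& src3,
        float A, float B, float C)
    {
        float* d        = dest.host();
        const float* s1 = src1.host();
        const float* s2 = src2.host();
        const float* s3 = src3.host();
        // elements outside [begin,end) are left untouched
        for (size_t i = begin; i < end; ++i)
            d[i] = A*s1[i] + B*s2[i] + C*s3[i];
    }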
// ----------------------------------------------------------------------------------------
void affine_transform(
...@@ -290,6 +335,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
...@@ -309,19 +356,22 @@ namespace dlib { namespace tt
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
- begin <= end <= params.size()
ensures
- This function implements the ADAM parameter update method described in the paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
Specifically, it implements the method shown as Algorithm 1.
- #s is the update vector that should be added to the parameters.
- The function only operates in the half open range [begin,end) of the memory
blocks of each tensor. E.g. to make this function run on the entire tensor
set begin to 0 and end to params.size().
!*/
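To make the ranged contract concrete, here is an elementwise sketch of Algorithm 1 as used here. The eps stabilizer and the folding of weight decay into the gradient are assumptions modeled on common Adam implementations, not details stated above:

    const float eps = 1e-8;  // assumed stabilizer, not specified above
    const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))
                                     /(1-std::pow(momentum1,t));
    for (size_t i = begin; i < end; ++i)
    {
        // weight decay folds into the gradient
        const float g = weight_decay*params.host()[i] + params_grad.host()[i];
        m.host()[i] = momentum1*m.host()[i] + (1-momentum1)*g;
        v.host()[i] = momentum2*v.host()[i] + (1-momentum2)*g*g;
        // s is the step that gets added to the parameters
        s.host()[i] = -alpha*m.host()[i]/(std::sqrt(v.host()[i]) + eps);
    }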
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
...@@ -331,6 +381,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == src.nr()
- gamma.nc() == src.nc()
...@@ -342,11 +393,12 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize() would if src had means
and variances as given by running_means and running_variances. That is, this
function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does it in a pointwise fashion over the samples in src.
!*/
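Equivalently, a host-side sketch of that formula (the per-sample layout is simplified to one gamma/beta entry per element; illustrative only):

    const long num = src.size()/src.num_samples();
    for (long n = 0; n < src.num_samples(); ++n)
    {
        for (long i = 0; i < num; ++i)
        {
            dest.host()[n*num+i] =
                gamma.host()[i]*(src.host()[n*num+i] - running_means.host()[i])
                  / std::sqrt(running_variances.host()[i] + eps)
                + beta.host()[i];
        }
    }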
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
...@@ -359,6 +411,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples() == 1
- beta.num_samples() == 1
...@@ -384,6 +437,7 @@ namespace dlib { namespace tt
!*/
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
...@@ -395,8 +449,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
...@@ -410,7 +465,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
...@@ -419,6 +474,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
...@@ -428,6 +484,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == 1
- gamma.nc() == 1
...@@ -439,12 +496,13 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize_conv() would if src had
means and variances as given by running_means and running_variances. That
is, this function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does this in a pointwise fashion over the samples, rows, and
columns in src.
!*/
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
...@@ -457,6 +515,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples()==gamma.nr()==gamma.nc() == 1
- beta.num_samples()==beta.nr()==beta.nc() == 1
...@@ -478,6 +537,7 @@ namespace dlib { namespace tt
!*/
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
...@@ -489,8 +549,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
...@@ -502,7 +563,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
...
...@@ -526,8 +526,7 @@ namespace dlib
label_type pick_which_run_update;
job_t next_job;
std::vector<dlib::future<double>> losses(devices.size());
std::vector<tt::multi_device_tensor_averager> averagers;
// An array of all the parameter tensors in the first network. We will
...@@ -536,6 +535,16 @@ namespace dlib
std::vector<tensor*> reference_params;
visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
// We make separate thread pools with just one thread in them because we want
// to make sure each device is always executed on the same thread. We care
// about this because there are thread_local context variables for some cuda
// components and they get regenerated when the current cuda device changes.
// Recreating them over and over is somewhat expensive so we want to avoid
// that.
std::vector<std::shared_ptr<thread_pool>> tp;
for (size_t i = 0; i < devices.size(); ++i)
tp.push_back(std::make_shared<thread_pool>(1));
size_t iteration = 0;
while(job_pipe.dequeue(next_job))
...@@ -545,7 +554,7 @@ namespace dlib
// right version for unsupervised or supervised training based on the type
// of label_type.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
// aggregate loss values from all the network computations.
double theloss = 0;
for (auto&& loss : losses)
...@@ -596,10 +605,10 @@ namespace dlib
// Now apply all the updates to each device.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
// and wait for the updates to all happen.
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->wait_for_all_tasks();
// Every now and then force all the parameters to be the same just to make
...
...@@ -482,7 +482,7 @@ namespace dlib
<< "\n\t x_upper.size(): " << x_upper.size()
);
DLIB_ASSERT (
min(x_upper-x_lower) >= 0,
"\tdouble find_min_box_constrained()"
<< "\n\t You have to supply proper box constraints to this function."
<< "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
...@@ -610,7 +610,7 @@ namespace dlib
<< "\n\t x_upper.size(): " << x_upper.size()
);
DLIB_ASSERT (
min(x_upper-x_lower) >= 0,
"\tdouble find_max_box_constrained()"
<< "\n\t You have to supply proper box constraints to this function."
<< "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
...
...@@ -297,7 +297,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) >= 0
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
- Performs a box constrained minimization of the function f() using the given
...@@ -391,7 +391,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) >= 0
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
- Performs a box constrained maximization of the function f() using the given
...
...@@ -165,13 +165,13 @@ namespace
resizable_tensor running_means;
resizable_tensor running_variances;
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples())/(src.num_samples()-1.0);
// Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
running_variances = mat(running_variances)/scale;
batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST_MSG(max(abs(mat(dest2)-mat(dest))) < 1e-5, max(abs(mat(dest2)-mat(dest))));
cpu::batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
DLIB_TEST_MSG(max(abs(mat(dest3)-mat(dest))) < 1e-5, max(abs(mat(dest3)-mat(dest))));
...@@ -179,7 +179,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
...@@ -191,7 +191,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
...@@ -203,7 +203,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
...@@ -220,7 +220,7 @@ namespace
gamma_grad = 8;
beta_grad = 8;
batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
...@@ -250,14 +250,14 @@ namespace
resizable_tensor running_means;
resizable_tensor running_variances;
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples()*src.nr()*src.nc())/(src.num_samples()*src.nr()*src.nc()-1.0);
// Turn back into biased variance estimate because that's how
// batch_normalize_conv() works, so if we want to match it this is necessary.
running_variances = mat(running_variances)/scale;
batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST(max(abs(mat(dest2)-mat(dest))) < 1e-5);
cpu::batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
DLIB_TEST(max(abs(mat(dest3)-mat(dest))) < 1e-5);
...@@ -265,7 +265,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
...@@ -277,7 +277,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
...@@ -289,7 +289,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
...@@ -307,7 +307,7 @@ namespace
gamma_grad = 9;
beta_grad = 9;
batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
auto grad_error = compare_gradients(src_grad, grad_src);
...@@ -662,11 +662,11 @@ namespace
rnd.fill_uniform(params_grad);
resizable_tensor mm(m), vv(v);
cpu::compute_adam_update(0,params.size(),s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
matrix<float> s1 = mat(s);
rnd.fill_uniform(s);
cuda::compute_adam_update(0,params.size(),s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
matrix<float> s2 = mat(s);
DLIB_TEST_MSG(max(abs(s1-s2)) < 1e-6, max(abs(s1-s2)));
...@@ -775,6 +775,27 @@ namespace
cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 5);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform(dest, src, srcb, srcc, 2, 3, 4, 0);
cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 0);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform_range(0, dest.size(), dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(0, dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
if (3 < dest.size())
{
dest = 999;
dest2 = 999;
cuda::affine_transform_range(3, dest.size()-1, dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(3, dest2.size()-1, dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform_range(dest.size(), dest.size(), dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(dest2.size(), dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
}
rnd.fill_uniform(dest);
rnd.fill_uniform(src);
@@ -863,8 +884,8 @@ namespace
rnd.fill_uniform(src);
cpu::batch_normalize(DEFAULT_BATCH_NORM_EPS, dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
cuda::batch_normalize(DEFAULT_BATCH_NORM_EPS, dest2, means2, invstds2, 1, running_means2, running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
@@ -890,8 +911,8 @@ namespace
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS, gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
@@ -917,8 +938,8 @@ namespace
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cpu::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS, dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
cuda::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS, dest2, means2, invstds2, 1, running_means2, running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
@@ -942,8 +963,8 @@ namespace
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS, gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS, gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
@@ -1318,6 +1339,72 @@ namespace
DLIB_TEST(net2.subnet().subnet().subnet().layer_details().get_num_outputs() == 4);
}
// ----------------------------------------------------------------------------------------
template <
int N,
template <typename> class BN,
int stride,
typename SUBNET
>
using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
template <
template <int,template<typename>class,int,typename> class block,
int N,
template<typename>class BN,
typename SUBNET
>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
template <
template <int,template<typename>class,int,typename> class block,
int N,
template<typename>class BN,
typename SUBNET
>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
template <typename SUBNET> using res = relu<residual<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares = relu<residual<block,8,affine,SUBNET>>;
template <typename SUBNET> using res_down = relu<residual_down<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;
template <typename SUBNET>
using pres = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
void test_visit_functions()
{
using net_type2 = loss_multiclass_log<fc<10,
avg_pool_everything<
pres<res<res<res_down< // 2 prelu layers here
tag4<repeat<9,pres, // 9 groups, each containing 2 prelu layers
res_down<
res<
input<matrix<unsigned char>>
>>>>>>>>>>>;
net_type2 pnet;
DLIB_CASSERT(pnet.num_layers == 131, pnet.num_layers);
DLIB_CASSERT(pnet.num_computational_layers == 109, pnet.num_computational_layers);
std::vector<bool> hit(pnet.num_computational_layers, false);
size_t count = 0;
visit_layer_parameter_gradients(pnet, [&](size_t i, tensor& ){hit[i] = true; ++count; });
for (auto x : hit)
DLIB_TEST(x);
DLIB_TEST(count == pnet.num_computational_layers);
count = 0;
std::vector<bool> hit2(pnet.num_computational_layers, false);
visit_layer_parameters(pnet, [&](size_t i, tensor& ){hit2[i] = true; ++count; });
for (auto x : hit2)
DLIB_TEST(x);
DLIB_TEST(count == pnet.num_computational_layers);
}
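
// As an aside, the visit functions exercised above are handy outside of
// tests too. Here is a minimal sketch (a hypothetical helper, not part of
// this patch) that uses visit_layer_parameters() to count the total number
// of trainable parameters in any network:
template <typename net_type>
size_t count_parameters(net_type& net)
{
    size_t total = 0;
    // The visitor is called once per computational layer with that layer's
    // parameter tensor.
    visit_layer_parameters(net, [&](size_t, tensor& t){ total += t.size(); });
    return total;
}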
// ----------------------------------------------------------------------------------------
class dnn_tester : public tester
@@ -1378,6 +1465,7 @@ namespace
test_batch_normalize_conv();
test_basic_tensor_ops();
test_layers();
test_visit_functions();
}
} a;
@@ -20,29 +20,76 @@ using namespace dlib;
// ----------------------------------------------------------------------------------------
// Let's start by showing how you can conveniently define large and complex
// networks. The most important tools for doing this are C++'s alias
// templates. These let us define new layer types that are combinations of a
// bunch of other layers. These will form the building blocks for more
// complex networks.

// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun). We are going to decompose the residual block into a few alias
// statements. First, we define the core block.

// Here we have parameterized the "block" layer on a BN layer (nominally some
// kind of batch normalization), the number of filter outputs N, and the
// stride the block operates at.
template <
int N,
template <typename> class BN,
int stride,
typename SUBNET
>
using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

// Next, we need to define the skip layer mechanism used in the residual
// network paper. They create their blocks by adding the input tensor to the
// output of each block. So we define an alias statement that takes a block
// and wraps it with this skip/add structure.
// Note the tag layer. This layer doesn't do any computation. It exists solely
// so other layers can refer to it. In this case, the add_prev1 layer looks for
// the tag1 layer and will take the tag1 output and add it to the input of the
// add_prev1 layer. This combination allows us to implement skip and residual
// style networks. We have also set the block stride to 1 in this statement.
// The significance of that is explained next.
template <
template <int,template<typename>class,int,typename> class block,
int N,
template<typename>class BN,
typename SUBNET
>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
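
// To make the composition concrete, here is what an instantiation of the
// residual alias expands to, by plain substitution of the block definition
// above (shown for illustration only):
//
//    residual<block,8,bn_con,SUBNET>
//    == add_prev1<block<8,bn_con,1,tag1<SUBNET>>>
//    == add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>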
// Some residual blocks do downsampling. They do this by using a stride of 2
// instead of 1. However, when downsampling we need to also take care to
// downsample the part of the network that adds the original input to the output
// or the sizes won't make sense (the network will still run, but the results
// aren't as good). So here we define a downsampling version of residual. In
// it, we make use of the skip1 layer. This layer simply outputs whatever is
// output by the tag1 layer. Therefore, the skip1 layer (there are also skip2,
// skip3, etc. in dlib) allows you to create branching network structures.
// residual_down creates a network structure like this:
/*
input from SUBNET
/ \
/ \
block downsample(using avg_pool)
\ /
\ /
add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
|
output
*/
template <
template <int,template<typename>class,int,typename> class block,
int N,
template<typename>class BN,
typename SUBNET
>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
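
// Likewise for the downsampling version: instantiating residual_down
// expands, again by plain substitution, into the branching structure drawn
// above (shown for illustration only):
//
//    residual_down<block,8,bn_con,SUBNET>
//    == add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<8,bn_con,2,tag1<SUBNET>>>>>>
//    == add_prev2<avg_pool<2,2,2,2,skip1<tag2<
//           bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,2,2,tag1<SUBNET>>>>>>>>>>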
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
@@ -50,10 +97,10 @@ using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// networks.
template <typename SUBNET> using res = relu<residual<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares = relu<residual<block,8,affine,SUBNET>>;
template <typename SUBNET> using res_down = relu<residual_down<block,8,bn_con,SUBNET>>;
template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;
@@ -145,39 +192,41 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's
// long, but you get the idea):
/*
The pnet has 131 layers in it.
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<5> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<10> tag1
...
layer<34> relu
layer<35> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<36> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<37> tag1
layer<38> tag4
layer<39> prelu (initial_param_value=0.3)
layer<40> add_prev
layer<41> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
...
layer<118> relu
layer<119> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<120> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<121> tag1
layer<122> relu
layer<123> add_prev
layer<124> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<125> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<126> relu
layer<127> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<128> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<129> tag1
layer<130> input<matrix>
*/
// Now that we know the index numbers for each layer, we can access them
@@ -195,7 +244,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say:
layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet).
// Or to access the layer 2 layers after tag4:
layer<tag4,2>(pnet); layer<tag4,2>(pnet);
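
// You can also inspect a layer's details object once you have it. For
// example, a small sketch (illustrative only, reusing the fc layer at
// index 1 from the listing above):
auto& fc_details = layer<1>(pnet).layer_details();
cout << "fc num_outputs: " << fc_details.get_num_outputs() << endl;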
@@ -42,6 +42,12 @@ int main()
try
{
cv::VideoCapture cap(0);
if (!cap.isOpened())
{
cerr << "Unable to connect to camera" << endl;
return 1;
}
image_window win;
// Load face detection and pose estimation models.
Hi Davis,
thanks for your work on dlib!
I have created a natvis file to have nicer debugger visualization of dlib matrices in Visual Studio (2012 - …) and I just wanted to share it with you.
To test it, copy the file into your folder %USERPROFILE%\My Documents\Visual Studio 2015\Visualizers or %VSINSTALLDIR%\Common7\Packages\Debugger\Visualizers as described here: https://msdn.microsoft.com/en-us/library/jj620914.aspx
It’s certainly extensible, especially to integrate it into Image Watch, but even as it is it may help users debug much faster.
Feel free to share it.
Best,
Johannes Huber
<?xml version="1.0" encoding="utf-8"?>
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
<!-- dlib matrix debugger visualization in Visual Studio-->
<!-- Johannes Huber, SAFEmine Part of Hexagon -->
<!-- no warranty -->
<!-- general dlib::matrix fixed size-->
<Type Name="dlib::matrix&lt;*,*,*,*&gt;">
<DisplayString>{{ size= &lt;{$T2}&gt; x &lt;{$T3}&gt; }}</DisplayString>
<Expand>
<ArrayItems>
<Size>$T2 * $T3</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix with dynamic rows, fixed cols-->
<Type Name="dlib::matrix&lt;*,0,*,*&gt;">
<DisplayString>{{ size={data.nr_} x &lt;{$T2}&gt; }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>data.nr_ * $T2</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix with fixed rows, dynamic cols-->
<Type Name="dlib::matrix&lt;*,*,0,*&gt;">
<DisplayString>{{ size= &lt;{$T2}&gt; x {data.nc_} }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>$T2 * data.nc_</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix dynamic size-->
<Type Name="dlib::matrix&lt;*,0,0,*&gt;">
<DisplayString>{{ size= {data.nr_} x {data.nc_} }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>data.nr_*data.nc_</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
</AutoVisualizer>
\ No newline at end of file
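
For anyone trying the file out, here is a small illustrative C++ snippet (not part of the patch) with one matrix declaration per visualizer entry above; inspecting these in the Visual Studio debugger exercises each Type rule:

#include <dlib/matrix.h>

int main()
{
    dlib::matrix<double,3,2> a;   // fixed size: matches matrix<*,*,*,*>
    dlib::matrix<double,0,2> b;   // dynamic rows, fixed cols: matches matrix<*,0,*,*>
    dlib::matrix<double,3,0> c;   // fixed rows, dynamic cols: matches matrix<*,*,0,*>
    dlib::matrix<double> d(4,5);  // fully dynamic: matches matrix<*,0,0,*>
    return 0;
}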