Commit 93e786db authored by Fm's avatar Fm

Merge branch 'master' of https://github.com/davisking/dlib into dnn_group_layer

parents 59892409 91163863
......@@ -488,6 +488,13 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// Tag types for ranking overloads.  special_ derives from general_, so passing
// a special_ object prefers an overload taking special_ and falls back to one
// taking general_ when the preferred overload is not viable.
struct general_
{
};
struct special_ : general_
{
};
// Maps any type argument to int.  Handy for turning an arbitrary type
// expression into something usable as a defaulted template parameter.
template <typename T>
struct int_
{
    typedef int type;
};
// ----------------------------------------------------------------------------------------
/*!A is_same_object
This is a templated function which checks if both of its arguments are actually
......
......@@ -24,6 +24,38 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace impl
{
    // Fallback overload: used when T has no get_learning_rate_multiplier
    // member.  Such objects are treated as having a multiplier of 1.
    template <typename T>
    double get_learning_rate_multiplier (const T& /*obj*/, general_)
    {
        return 1;
    }

    // Preferred overload: only participates in overload resolution when
    // &T::get_learning_rate_multiplier is a valid expression, in which case
    // the call is forwarded to the object's own member function.
    template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
    double get_learning_rate_multiplier (const T& obj, special_)
    {
        return obj.get_learning_rate_multiplier();
    }
}
// Returns obj.get_learning_rate_multiplier() when T provides that member
// function and 1 otherwise.
template <typename T>
double get_learning_rate_multiplier(const T& obj)
{
    // Passing a special_ tag makes the member-forwarding overload the best
    // match whenever it is viable; otherwise the general_ fallback is used.
    return impl::get_learning_rate_multiplier(obj, special_());
}
// ----------------------------------------------------------------------------------------
namespace impl
{
    // Fallback overload: used when T has no get_weight_decay_multiplier
    // member.  Such objects are treated as having a multiplier of 1.
    template <typename T>
    double get_weight_decay_multiplier (const T& /*obj*/, general_)
    {
        return 1;
    }

    // Preferred overload: only participates in overload resolution when
    // &T::get_weight_decay_multiplier is a valid expression, in which case
    // the call is forwarded to the object's own member function.
    template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0>
    double get_weight_decay_multiplier (const T& obj, special_)
    {
        return obj.get_weight_decay_multiplier();
    }
}
// Returns obj.get_weight_decay_multiplier() when T provides that member
// function and 1 otherwise.
template <typename T>
double get_weight_decay_multiplier(const T& obj)
{
    // Passing a special_ tag makes the member-forwarding overload the best
    // match whenever it is viable; otherwise the general_ fallback is used.
    return impl::get_weight_decay_multiplier(obj, special_());
}
// ----------------------------------------------------------------------------------------
namespace impl
......@@ -458,7 +490,7 @@ namespace dlib
// Returns the sub-stack that starts num elements further in, i.e. a stack of
// size()-num elements.  num may be equal to size(), in which case the returned
// stack is empty; asking for more than size() elements is a contract violation.
sstack pop(size_t num=1)
{
    // Only the <= check is kept.  The stricter num < size() check that used to
    // precede it rejected popping every remaining element, which is a valid
    // request, and made this assertion unreachable.
    DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it.");
    return sstack(data+num, mysize-num);
}
......@@ -849,8 +881,9 @@ namespace dlib
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
// Don't try to adjust the parameters if this layer doesn't have any or the
// learning rate is disabled for this layer.
if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
{
const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(details.get_layer_params(), details.get_layer_params(), step);
......@@ -1200,8 +1233,9 @@ namespace dlib
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
// Don't try to adjust the parameters if this layer doesn't have any or the
// learning rate is disabled for this layer.
if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
{
const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(details.get_layer_params(), details.get_layer_params(), step);
......@@ -1817,9 +1851,7 @@ namespace dlib
public:
typedef INPUT_LAYER subnet_type;
typedef typename subnet_type::input_type input_type;
// This layer counts as a computational layer because it copies and stores the
// inputs.
const static size_t num_computational_layers = 1;
const static size_t num_computational_layers = 0;
const static size_t num_layers = 2;
const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
static_assert(sample_expansion_factor >= 1,
......
......@@ -67,6 +67,32 @@ namespace dlib
(except computes it using a numerically accurate method)
!*/
// ----------------------------------------------------------------------------------------
template <typename T>
double get_learning_rate_multiplier(
const T& obj
);
/*!
ensures
- if (obj has a get_learning_rate_multiplier() member function) then
- returns obj.get_learning_rate_multiplier()
- else
- returns 1
- Whether obj has the member function is determined at compile time from
the type T, so no runtime check is performed.
!*/
template <typename T>
double get_weight_decay_multiplier(
const T& obj
);
/*!
ensures
- if (obj has a get_weight_decay_multiplier() member function) then
- returns obj.get_weight_decay_multiplier()
- else
- returns 1
- Whether obj has the member function is determined at compile time from
the type T, so no runtime check is performed.
!*/
// ----------------------------------------------------------------------------------------
bool dnn_prefer_fastest_algorithms(
......@@ -152,7 +178,7 @@ namespace dlib
);
/*!
requires
- num < size()
- num <= size()
ensures
- returns a reference to the sub-stack S such that:
- S.size() == size()-num.
......
......@@ -385,6 +385,30 @@ namespace dlib
d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
}
// Host implementation of the ranged affine transform.  For every index i in
// [begin, end) this stores A*src1[i] + B*src2[i] + C*src3[i] into dest[i].
// Elements of dest outside of that range are not written by this function.
void affine_transform_range(
    size_t begin,
    size_t end,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    // All four tensors must hold the same number of elements.
    DLIB_CASSERT(dest.size()==src1.size(),"");
    DLIB_CASSERT(dest.size()==src2.size(),"");
    DLIB_CASSERT(dest.size()==src3.size(),"");
    // [begin, end) must be a valid (possibly empty) sub-range of dest.
    DLIB_CASSERT(begin <= end && end <= dest.size(),"");

    const auto out = dest.host();
    const auto in1 = src1.host();
    const auto in2 = src2.host();
    const auto in3 = src3.host();

    for (size_t idx = begin; idx < end; ++idx)
    {
        out[idx] = A*in1[idx] + B*in2[idx] + C*in3[idx];
    }
}
// -----------------------------------------------------------------------------------
void affine_transform(
......@@ -464,6 +488,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -480,6 +506,7 @@ namespace dlib
s.size() == v.size() &&
s.size() == params.size() &&
s.size() == params_grad.size(),"");
DLIB_CASSERT(begin <= end && end <= params.size(),"");
const float eps = 1e-8;
const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
......@@ -492,7 +519,7 @@ namespace dlib
auto ps = s.host_write_only();
auto pparams = params.host();
auto ppgrad = params_grad.host();
for (size_t i = 0; i < params.size(); ++i)
for (size_t i = begin; i < end; ++i)
{
float g = weight_decay*pparams[i] + ppgrad[i];
pm[i] = momentum1*pm[i] + (1-momentum1)*g;
......@@ -504,6 +531,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -519,7 +547,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -538,7 +567,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -554,7 +584,7 @@ namespace dlib
{
for (long k = 0; k < num; ++k)
{
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
++d;
++s;
}
......@@ -562,6 +592,7 @@ namespace dlib
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -582,7 +613,8 @@ namespace dlib
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -593,7 +625,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -635,7 +668,7 @@ namespace dlib
else
rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
p_invstds[i] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
......@@ -662,6 +695,7 @@ namespace dlib
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -682,6 +716,7 @@ namespace dlib
DLIB_CASSERT(num == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
beta_grad = 0;
gamma_grad = 0;
......@@ -757,6 +792,7 @@ namespace dlib
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -772,7 +808,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -791,7 +828,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -807,7 +845,7 @@ namespace dlib
{
for (long k = 0; k < src.k(); ++k)
{
const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
const float invstd = 1.0f/std::sqrt(v[k] + eps);
for (long j = 0; j < num; ++j)
{
*d = g[k]*(*s - m[k])*invstd + b[k];
......@@ -819,6 +857,7 @@ namespace dlib
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -841,7 +880,8 @@ namespace dlib
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -852,7 +892,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -900,7 +941,7 @@ namespace dlib
else
rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
p_invstds[k] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
......@@ -928,6 +969,7 @@ namespace dlib
}
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -948,6 +990,7 @@ namespace dlib
DLIB_CASSERT(src.k() == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
beta_grad = 0;
gamma_grad = 0;
......
......@@ -81,6 +81,18 @@ namespace dlib
const float D
);
// Computes dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for each i in the
// half-open range [begin, end).  Elements of dest outside of that range are
// not written.  The implementation asserts that dest, src1, src2, and src3 all
// have the same size() and that begin <= end <= dest.size().
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// -----------------------------------------------------------------------------------
void affine_transform(
......@@ -102,6 +114,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -117,6 +131,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -126,6 +141,7 @@ namespace dlib
);
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -138,6 +154,7 @@ namespace dlib
);
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -149,6 +166,7 @@ namespace dlib
);
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -158,6 +176,7 @@ namespace dlib
);
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -170,6 +189,7 @@ namespace dlib
);
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......
......@@ -504,6 +504,40 @@ namespace dlib
src2.device(), src3.device(), dest.size(), A, B, C, D);
}
// ----------------------------------------------------------------------------------------
// Device kernel for the ranged affine transform.  For each index handed out by
// grid_stride_range(begin, end) it writes A*s1[i] + B*s2[i] + C*s3[i] to d[i].
__global__ void _cuda_affine_transform_range(
    float* d,
    const float* s1,
    const float* s2,
    const float* s3,
    size_t begin,
    size_t end,
    float A,
    float B,
    float C
)
{
    for (auto idx : grid_stride_range(begin, end))
        d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
}
// CUDA entry point for the ranged affine transform.  Validates the arguments
// and then launches _cuda_affine_transform_range over [begin, end) so that
// dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for each i in that range.
void affine_transform_range(
    size_t begin,
    size_t end,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    // All four tensors must hold the same number of elements.
    DLIB_CASSERT(dest.size()==src1.size(),"");
    DLIB_CASSERT(dest.size()==src2.size(),"");
    DLIB_CASSERT(dest.size()==src3.size(),"");
    // [begin, end) must be a valid (possibly empty) sub-range of dest.
    DLIB_CASSERT(begin <= end && end <= dest.size(),"");

    // The job count is sized by the number of elements in the range rather
    // than by the full tensor size.
    launch_kernel(
        _cuda_affine_transform_range,
        max_jobs(end-begin),
        dest.device(), src1.device(),
        src2.device(), src3.device(),
        begin, end, A, B, C);
}
// -----------------------------------------------------------------------------------
__global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
......@@ -549,7 +583,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
__global__ void _cuda_compute_adam_update(
size_t n,
size_t begin,
size_t end,
float* s,
float* m,
float* v,
......@@ -566,7 +601,7 @@ namespace dlib
// m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
// v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
// s = -alpha*m/(sqrt(v) + eps);
for (auto i : grid_stride_range(0, n))
for (auto i : grid_stride_range(begin, end))
{
float g = (weight_decay*params[i] + params_grad[i]);
m[i] = momentum1*m[i] + (1-momentum1)*g;
......@@ -576,6 +611,8 @@ namespace dlib
}
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -592,10 +629,11 @@ namespace dlib
s.size() == v.size() &&
s.size() == params.size() &&
s.size() == params_grad.size(),"");
DLIB_CASSERT(begin <= end && end <= params.size(),"");
const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
momentum1, momentum2, params.device(), params_grad.device());
}
......
......@@ -164,6 +164,18 @@ namespace dlib
const float D
);
// Computes dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for each i in the
// half-open range [begin, end).  The implementation asserts that dest, src1,
// src2, and src3 all have the same size() and that begin <= end <= dest.size().
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
void add_scaled(
......@@ -193,6 +205,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......
......@@ -338,6 +338,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -353,7 +354,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -372,7 +374,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -393,10 +396,11 @@ namespace dlib
beta.device(),
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
eps));
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -417,7 +421,8 @@ namespace dlib
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -428,7 +433,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
......@@ -455,12 +461,13 @@ namespace dlib
averaging_factor,
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
void batch_normalize_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -480,6 +487,7 @@ namespace dlib
DLIB_CASSERT(num == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
const float in_scale = 1;
const float out_scale = 1;
......@@ -503,7 +511,7 @@ namespace dlib
gamma.device(),
gamma_grad.device(),
beta_grad.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
......@@ -511,6 +519,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -526,7 +535,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -545,7 +555,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -566,10 +577,11 @@ namespace dlib
beta.device(),
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
eps));
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -592,7 +604,8 @@ namespace dlib
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -603,7 +616,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -629,12 +643,13 @@ namespace dlib
averaging_factor,
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -653,6 +668,7 @@ namespace dlib
DLIB_CASSERT(src.k() == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
const float in_scale = 1;
const float out_scale = 1;
......@@ -676,7 +692,7 @@ namespace dlib
gamma.device(),
gamma_grad.device(),
beta_grad.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
......
......@@ -135,6 +135,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -144,6 +145,7 @@ namespace dlib
);
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -156,6 +158,7 @@ namespace dlib
);
void batch_normalize_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -169,6 +172,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -178,6 +182,7 @@ namespace dlib
);
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -190,6 +195,7 @@ namespace dlib
);
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......
......@@ -42,6 +42,10 @@ namespace dlib
// Default constructor.  The learning rate, weight decay, and bias learning
// rate multipliers all start at 1, while the bias weight decay multiplier
// starts at 0.  The paddings are initialized from _padding_y and _padding_x.
con_(
) :
learning_rate_multiplier(1),
weight_decay_multiplier(1),
bias_learning_rate_multiplier(1),
bias_weight_decay_multiplier(0),
padding_y_(_padding_y),
padding_x_(_padding_x)
{}
......@@ -54,12 +58,27 @@ namespace dlib
long padding_y() const { return padding_y_; }
long padding_x() const { return padding_x_; }
// Getters and setters for the four multiplier members of this layer.  The
// setters store val as given, without any validation.  Note that backward()
// skips computing the parameter gradients when the learning rate multiplier
// is 0.
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
// NOTE(review): presumably the bias_* multipliers play the same role for the
// bias terms only; how they are consumed is not visible here -- confirm
// against the solvers.
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
con_ (
const con_& item
) :
params(item.params),
filters(item.filters),
biases(item.biases),
learning_rate_multiplier(item.learning_rate_multiplier),
weight_decay_multiplier(item.weight_decay_multiplier),
bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
padding_y_(item.padding_y_),
padding_x_(item.padding_x_)
{
......@@ -81,6 +100,10 @@ namespace dlib
biases = item.biases;
padding_y_ = item.padding_y_;
padding_x_ = item.padding_x_;
learning_rate_multiplier = item.learning_rate_multiplier;
weight_decay_multiplier = item.weight_decay_multiplier;
bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
return *this;
}
......@@ -121,18 +144,22 @@ namespace dlib
// Backward pass of the convolution layer.  The gradient with respect to the
// layer's input is always accumulated into sub.get_gradient_input().  The
// gradients with respect to the filters and biases are written into
// params_grad only when the learning rate multiplier is non-zero.
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
    conv.get_gradient_for_data (gradient_input, filters(params,0), sub.get_gradient_input());

    // With a zero learning rate multiplier the parameter gradients would
    // never be used, so there is no point computing them.
    if (learning_rate_multiplier == 0)
        return;

    auto filter_grad = filters(params_grad,0);
    conv.get_gradient_for_filters (gradient_input, sub.get_output(), filter_grad);

    // The bias gradients are stored right after the filter gradients.
    auto bias_grad = biases(params_grad, filters.size());
    tt::assign_conv_bias_gradient(bias_grad, gradient_input);
}
// Read-only and mutable access to this layer's params tensor.
const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }
friend void serialize(const con_& item, std::ostream& out)
{
serialize("con_2", out);
serialize("con_3", out);
serialize(item.params, out);
serialize(_num_filters, out);
serialize(_nr, out);
......@@ -143,6 +170,10 @@ namespace dlib
serialize(item.padding_x_, out);
serialize(item.filters, out);
serialize(item.biases, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.bias_learning_rate_multiplier, out);
serialize(item.bias_weight_decay_multiplier, out);
}
friend void deserialize(con_& item, std::istream& in)
......@@ -167,7 +198,7 @@ namespace dlib
item.padding_y_ = nr/2;
item.padding_x_ = nc/2;
}
else if (version == "con_2")
else if (version == "con_2" || version == "con_3")
{
deserialize(item.params, in);
deserialize(num_filters, in);
......@@ -180,6 +211,23 @@ namespace dlib
deserialize(item.filters, in);
deserialize(item.biases, in);
if (version == "con_3")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.bias_learning_rate_multiplier = 1;
item.bias_weight_decay_multiplier = 1;
}
if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_");
if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_");
}
......@@ -207,6 +255,10 @@ namespace dlib
<< ", padding_y="<<item.padding_y_
<< ", padding_x="<<item.padding_x_
<< ")";
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
return out;
}
......@@ -217,6 +269,10 @@ namespace dlib
alias_tensor filters, biases;
tt::tensor_conv conv;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
double bias_weight_decay_multiplier;
// These are here only because older versions of con (which you might encounter
// serialized to disk) used different padding settings.
......@@ -594,20 +650,43 @@ namespace dlib
FC_MODE = 1
};
// Default epsilon for bn_ layers.  It is the default value of the bn_
// constructor's eps_ argument and is also used when deserializing layers that
// were saved before eps was stored with the layer.
const double DEFAULT_BATCH_NORM_EPS = 0.00001;
template <
layer_mode mode
>
class bn_
{
public:
bn_() : num_updates(0), running_stats_window_size(1000)
// Constructs a bn_ layer with no updates recorded yet (num_updates == 0).
// window_size is stored as running_stats_window_size and eps_ as eps.  The
// multipliers start at: learning rate 1, weight decay 0, bias learning rate 1,
// bias weight decay 1.
explicit bn_(
unsigned long window_size,
double eps_ = DEFAULT_BATCH_NORM_EPS
) :
num_updates(0),
running_stats_window_size(window_size),
learning_rate_multiplier(1),
weight_decay_multiplier(0),
bias_learning_rate_multiplier(1),
bias_weight_decay_multiplier(1),
eps(eps_)
{}
explicit bn_(unsigned long window_size) : num_updates(0), running_stats_window_size(window_size)
{}
bn_() : bn_(1000) {}
// Returns the layer_mode this bn_ was instantiated with (the mode template
// argument).
layer_mode get_mode() const { return mode; }
// Returns the stored running_stats_window_size.
unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
// Returns the epsilon that this layer passes to the tt::batch_normalize*
// routines.
double get_eps() const { return eps; }
// Getters and setters for the four multiplier members of this layer.  The
// setters store val as given, without any validation.
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
// NOTE(review): presumably the bias_* multipliers play the same role for the
// bias terms only; how they are consumed is not visible here -- confirm
// against the solvers.
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
template <typename SUBNET>
void setup (const SUBNET& sub)
......@@ -648,16 +727,16 @@ namespace dlib
if (num_updates <running_stats_window_size)
++num_updates;
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_variances);
tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_variances);
tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
}
}
......@@ -668,9 +747,9 @@ namespace dlib
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
if (mode == FC_MODE)
tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
else
tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
}
const tensor& get_layer_params() const { return params; }
......@@ -679,9 +758,9 @@ namespace dlib
friend void serialize(const bn_& item, std::ostream& out)
{
if (mode == CONV_MODE)
serialize("bn_con", out);
serialize("bn_con2", out);
else // if FC_MODE
serialize("bn_fc", out);
serialize("bn_fc2", out);
serialize(item.params, out);
serialize(item.gamma, out);
serialize(item.beta, out);
......@@ -691,6 +770,11 @@ namespace dlib
serialize(item.running_variances, out);
serialize(item.num_updates, out);
serialize(item.running_stats_window_size, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.bias_learning_rate_multiplier, out);
serialize(item.bias_weight_decay_multiplier, out);
serialize(item.eps, out);
}
friend void deserialize(bn_& item, std::istream& in)
......@@ -701,12 +785,12 @@ namespace dlib
{
if (mode == CONV_MODE)
{
if (version != "bn_con")
if (version != "bn_con" && version != "bn_con2")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
}
else // must be in FC_MODE
{
if (version != "bn_fc")
if (version != "bn_fc" && version != "bn_fc2")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
}
}
......@@ -731,16 +815,38 @@ namespace dlib
// We also need to flip the running_variances around since the previous
// format saved the inverse standard deviations instead of variances.
item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
item.running_variances = 1.0f/squared(mat(item.running_variances)) - DEFAULT_BATCH_NORM_EPS;
}
else if (version == "bn_con2" || version == "bn_fc2")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
deserialize(item.eps, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.eps = DEFAULT_BATCH_NORM_EPS;
}
}
friend std::ostream& operator<<(std::ostream& out, const bn_& item)
{
if (mode == CONV_MODE)
out << "bn_con";
out << "bn_con ";
else
out << "bn_fc";
out << "bn_fc ";
out << " eps="<<item.eps;
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
return out;
}
......@@ -754,6 +860,11 @@ namespace dlib
resizable_tensor invstds, running_variances;
unsigned long num_updates;
unsigned long running_stats_window_size;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
double bias_weight_decay_multiplier;
double eps;
};
template <typename SUBNET>
......@@ -784,11 +895,24 @@ namespace dlib
static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0");
public:
fc_() : num_outputs(num_outputs_), num_inputs(0)
{
}
fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0),
learning_rate_multiplier(1),
weight_decay_multiplier(1),
bias_learning_rate_multiplier(1),
bias_weight_decay_multiplier(0)
{}
fc_() : fc_(num_fc_outputs(num_outputs_)) {}
fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0) {}
// Multipliers this layer reports for its parameters as a whole.
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
// Setters store the value as given; no range check is performed here.
void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
void set_weight_decay_multiplier(double val) { weight_decay_multiplier = val; }
// Additional multipliers reported for the bias portion of the parameters.
double get_bias_learning_rate_multiplier () const { return bias_learning_rate_multiplier; }
double get_bias_weight_decay_multiplier () const { return bias_weight_decay_multiplier; }
// Setters store the value as given; no range check is performed here.
void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
void set_bias_weight_decay_multiplier(double val) { bias_weight_decay_multiplier = val; }
unsigned long get_num_outputs (
) const { return num_outputs; }
......@@ -834,6 +958,9 @@ namespace dlib
template <typename SUBNET>
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// no point computing the parameter gradients if they won't be used.
if (learning_rate_multiplier != 0)
{
// compute the gradient of the weight parameters.
auto pw = weights(params_grad, 0);
......@@ -845,6 +972,7 @@ namespace dlib
auto pb = biases(params_grad, weights.size());
tt::assign_bias_gradient(pb, gradient_input);
}
}
// compute the gradient for the data
auto w = weights(params, 0);
......@@ -856,20 +984,24 @@ namespace dlib
friend void serialize(const fc_& item, std::ostream& out)
{
serialize("fc_", out);
serialize("fc_2", out);
serialize(item.num_outputs, out);
serialize(item.num_inputs, out);
serialize(item.params, out);
serialize(item.weights, out);
serialize(item.biases, out);
serialize((int)bias_mode, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.bias_learning_rate_multiplier, out);
serialize(item.bias_weight_decay_multiplier, out);
}
friend void deserialize(fc_& item, std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "fc_")
if (version != "fc_" && version != "fc_2")
throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_.");
deserialize(item.num_outputs, in);
......@@ -880,6 +1012,22 @@ namespace dlib
int bmode = 0;
deserialize(bmode, in);
if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_");
if (version == "fc_2")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.bias_learning_rate_multiplier, in);
deserialize(item.bias_weight_decay_multiplier, in);
}
else
{
// Previous versions didn't have these parameters, so they were
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.bias_learning_rate_multiplier = 1;
item.bias_weight_decay_multiplier = 1;
}
}
friend std::ostream& operator<<(std::ostream& out, const fc_& item)
......@@ -889,12 +1037,18 @@ namespace dlib
out << "fc\t ("
<< "num_outputs="<<item.num_outputs
<< ")";
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
}
else
{
out << "fc_no_bias ("
<< "num_outputs="<<item.num_outputs
<< ")";
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
}
return out;
}
......@@ -905,6 +1059,10 @@ namespace dlib
unsigned long num_inputs;
resizable_tensor params;
alias_tensor weights, biases;
double learning_rate_multiplier;
double weight_decay_multiplier;
double bias_learning_rate_multiplier;
double bias_weight_decay_multiplier;
};
template <
......@@ -1143,7 +1301,7 @@ namespace dlib
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+tt::BATCH_NORM_EPS));
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps()));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
......@@ -1223,7 +1381,7 @@ namespace dlib
{
std::string version;
deserialize(version, in);
if (version == "bn_con")
if (version == "bn_con" || version == "bn_con2")
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
......@@ -1233,7 +1391,7 @@ namespace dlib
item = temp;
return;
}
else if (version == "bn_fc")
else if (version == "bn_fc" || version == "bn_fc2")
{
// Since we can build an affine_ from a bn_ we check if that's what is in
// the stream and if so then just convert it right here.
......@@ -1289,8 +1447,13 @@ namespace dlib
template <typename SUBNET>
void forward(const SUBNET& sub, resizable_tensor& output)
{
output.copy_size(sub.get_output());
tt::add(output, sub.get_output(), layer<tag>(sub).get_output());
auto&& t1 = sub.get_output();
auto&& t2 = layer<tag>(sub).get_output();
output.set_size(std::max(t1.num_samples(),t2.num_samples()),
std::max(t1.k(),t2.k()),
std::max(t1.nr(),t2.nr()),
std::max(t1.nc(),t2.nc()));
tt::add(output, t1, t2);
}
template <typename SUBNET>
......
......@@ -123,6 +123,16 @@ namespace dlib
allow dlib to make some layers execute in-place and therefore run a
little faster and use less memory. Do not implement forward() and
backward().
It should also be noted that layers may define additional layer specific
fields and the solvers can use these fields as they see fit. For example,
some layers define get_learning_rate_multiplier() and
get_weight_decay_multiplier() methods. The solvers that come with dlib
look at these methods, if they exist, and adjust the learning rate or
weight decay for that layer according to the multiplier. Therefore, you
can add these methods to your layer types if you want, or even define new
fields and new solvers that use those fields in some way.
!*/
public:
......@@ -367,6 +377,10 @@ namespace dlib
ensures
- #get_num_outputs() == num_outputs
- #get_bias_mode() == bias_mode
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
unsigned long get_num_outputs (
......@@ -389,6 +403,82 @@ namespace dlib
is added to each of the outputs of this layer.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......@@ -458,6 +548,10 @@ namespace dlib
- #stride_x() == _stride_x
- #padding_y() == _padding_y
- #padding_x() == _padding_x
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
long num_filters(
......@@ -517,6 +611,82 @@ namespace dlib
sides of the image.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......@@ -648,6 +818,8 @@ namespace dlib
FC_MODE = 1 // fully connected mode
};
const double DEFAULT_BATCH_NORM_EPS = 0.00001;
template <
layer_mode mode
>
......@@ -684,16 +856,29 @@ namespace dlib
/*!
ensures
- #get_mode() == mode
- get_running_stats_window_size() == 1000
- #get_running_stats_window_size() == 1000
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == tt::DEFAULT_BATCH_NORM_EPS
!*/
explicit bn_(
unsigned long window_size
unsigned long window_size,
double eps = tt::DEFAULT_BATCH_NORM_EPS
);
/*!
requires
- eps > 0
ensures
- #get_mode() == mode
- get_running_stats_window_size() == window_size
- #get_running_stats_window_size() == window_size
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 1
- #get_eps() == eps
!*/
layer_mode get_mode(
......@@ -712,6 +897,15 @@ namespace dlib
normalization after a convolutional layer you should use CONV_MODE.
!*/
double get_eps(
) const;
/*!
ensures
- When doing batch normalization, we are dividing by the standard
deviation. The epsilon value returned by this function is added to the
variance to prevent the division from dividing by zero.
!*/
unsigned long get_running_stats_window_size (
) const;
/*!
......@@ -725,6 +919,82 @@ namespace dlib
the running average.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......@@ -1330,7 +1600,13 @@ namespace dlib
what layer to add to the output of the previous layer. The result of this
addition is output by add_prev_. Finally, the addition happens pointwise
according to 4D tensor arithmetic. If the dimensions don't match then
missing elements are presumed to be equal to 0.
missing elements are presumed to be equal to 0. Moreover, each dimension
of the output tensor is equal to the maximum dimension of either of the
inputs. That is, if the tensors A and B are being added to produce C then:
- C.num_samples() == max(A.num_samples(), B.num_samples())
- C.k() == max(A.k(), B.k())
- C.nr() == max(A.nr(), B.nr())
- C.nc() == max(A.nc(), B.nc())
!*/
public:
......
......@@ -6,6 +6,7 @@
#include "solvers_abstract.h"
#include "tensor.h"
#include <iostream>
#include "layers.h"
namespace dlib
{
......@@ -49,10 +50,53 @@ namespace dlib
v = 0;
}
//perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
tt::affine_transform(v, v, params, params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
const double lr = learning_rate*get_learning_rate_multiplier(l);
const double wd = weight_decay*get_weight_decay_multiplier(l);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
return v;
}
// SGD step for fully connected layers that have a bias term.  Forwards to
// update_considering_bias() so the bias-specific multipliers of the layer are
// applied to the bias portion of the parameter tensor.
//   learning_rate - nominal learning rate for this step
//   l             - the layer whose parameters are being optimized
//   params_grad   - gradient of the loss with respect to l's parameters
// Returns the update vector v.
template <unsigned long N>
const tensor& operator() (
    const float learning_rate,
    const fc_<N,FC_HAS_BIAS>& l,
    const tensor& params_grad
)
{
    // fc_ places its biases after the weights, so the first bias element sits
    // at size()-num_outputs rather than at num_outputs.
    // NOTE(review): assumes one bias per output -- confirm against fc_::setup().
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
    return v;
}
// SGD step for convolution layers.  Forwards to update_considering_bias() so
// the bias-specific multipliers of the layer are applied to the bias portion
// of the parameter tensor.
//   learning_rate - nominal learning rate for this step
//   l             - the layer whose parameters are being optimized
//   params_grad   - gradient of the loss with respect to l's parameters
// Returns the update vector v.
template <
    long _num_filters,
    long _nr,
    long _nc,
    int _stride_y,
    int _stride_x,
    int _padding_y,
    int _padding_x
    >
const tensor& operator() (
    const float learning_rate,
    const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
    const tensor& params_grad
)
{
    // num_filters() is the number of bias values, not where they start.
    // NOTE(review): assumes con_ stores one bias per filter after all the
    // filter weights -- confirm against con_::setup().
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
    return v;
}
// SGD step for batch normalization layers.  The second half of the parameter
// tensor is treated as the bias portion and handed to update_considering_bias().
// Returns the update vector v.
template < layer_mode mode >
const tensor& operator() (
    const float learning_rate,
    const bn_<mode>& l,
    const tensor& params_grad
)
{
    // Offset of the midpoint of the parameter tensor.
    const auto second_half_begin = params_grad.size()/2;
    update_considering_bias(learning_rate, l, params_grad, second_half_begin);
    return v;
}
......@@ -76,9 +120,49 @@ namespace dlib
}
private:
// Performs one momentum update of v for layer l:
//     v = momentum*v - wd*lr*params - lr*params_grad
// Elements in [0, bias_offset) use the layer's general multipliers.  Elements
// in [bias_offset, v.size()) additionally have the layer's bias multipliers
// folded into lr and wd.
template <typename layer_type>
void update_considering_bias(
    const float learning_rate,
    const layer_type& l,
    const tensor& params_grad,
    unsigned long bias_offset
)
{
    const tensor& params = l.get_layer_params();

    DLIB_CASSERT(params.size() != 0,"");

    // First call: allocate the momentum buffer and start it at zero.
    if (v.size() == 0)
    {
        v.copy_size(params_grad);
        v = 0;
    }

    const double base_lr = learning_rate*get_learning_rate_multiplier(l);
    const double base_wd = weight_decay*get_weight_decay_multiplier(l);

    const bool bias_uses_same_rates =
        l.get_bias_learning_rate_multiplier() == 1 &&
        l.get_bias_weight_decay_multiplier() == 1;

    if (bias_uses_same_rates)
    {
        // Every element shares the same coefficients, so do it in one pass.
        tt::affine_transform(v, v, params, params_grad, momentum, -base_wd*base_lr, -base_lr);
        return;
    }

    // Non-bias part of the parameters.
    tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -base_wd*base_lr, -base_lr);

    // Bias part, with the bias multipliers applied on top.
    const double bias_lr = base_lr*l.get_bias_learning_rate_multiplier();
    const double bias_wd = base_wd*l.get_bias_weight_decay_multiplier();
    tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -bias_wd*bias_lr, -bias_lr);
}
resizable_tensor v;
float weight_decay;
float momentum;
};
// ----------------------------------------------------------------------------------------
......@@ -132,11 +216,57 @@ namespace dlib
++t;
tt::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1, momentum2, params, params_grad);
tt::compute_adam_update(0, params.size(), s, m, v, t,
learning_rate*get_learning_rate_multiplier(l),
weight_decay*get_weight_decay_multiplier(l),
momentum1, momentum2, params, params_grad);
return s;
}
// Adam step for fully connected layers that have a bias term.  Forwards to
// update_considering_bias() so the bias-specific multipliers of the layer are
// applied to the bias portion of the parameter tensor.
//   learning_rate - nominal learning rate for this step
//   l             - the layer whose parameters are being optimized
//   params_grad   - gradient of the loss with respect to l's parameters
// Returns the update vector s.
template <unsigned long N>
const tensor& operator() (
    const float learning_rate,
    const fc_<N,FC_HAS_BIAS>& l,
    const tensor& params_grad
)
{
    // fc_ places its biases after the weights, so the first bias element sits
    // at size()-num_outputs rather than at num_outputs.
    // NOTE(review): assumes one bias per output -- confirm against fc_::setup().
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
    return s;
}
// Adam step for convolution layers.  Forwards to update_considering_bias() so
// the bias-specific multipliers of the layer are applied to the bias portion
// of the parameter tensor.
//   learning_rate - nominal learning rate for this step
//   l             - the layer whose parameters are being optimized
//   params_grad   - gradient of the loss with respect to l's parameters
// Returns the update vector s.
template <
    long _num_filters,
    long _nr,
    long _nc,
    int _stride_y,
    int _stride_x,
    int _padding_y,
    int _padding_x
    >
const tensor& operator() (
    const float learning_rate,
    const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
    const tensor& params_grad
)
{
    // num_filters() is the number of bias values, not where they start.
    // NOTE(review): assumes con_ stores one bias per filter after all the
    // filter weights -- confirm against con_::setup().
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
    return s;
}
// Adam step for batch normalization layers.  The second half of the parameter
// tensor is treated as the bias portion and handed to update_considering_bias().
// Returns the update vector s.
template < layer_mode mode >
const tensor& operator() (
    const float learning_rate,
    const bn_<mode>& l,
    const tensor& params_grad
)
{
    // Offset of the midpoint of the parameter tensor.
    const auto second_half_begin = params_grad.size()/2;
    update_considering_bias(learning_rate, l, params_grad, second_half_begin);
    return s;
}
friend void serialize(const adam& item, std::ostream& out)
{
serialize("adam2", out);
......@@ -165,6 +295,49 @@ namespace dlib
}
private:
// Performs one Adam update for layer l, writing the step into s.  Elements in
// [0, bias_offset) use the layer's general learning rate and weight decay
// multipliers.  Elements in [bias_offset, params.size()) additionally have the
// layer's bias multipliers applied.
template <typename layer_type>
void update_considering_bias(
    const float learning_rate,
    const layer_type& l,
    const tensor& params_grad,
    unsigned long bias_offset
)
{
    const tensor& params = l.get_layer_params();

    DLIB_CASSERT(params.size() != 0,"");

    // First call: size the state tensors to match the gradient.  m and v start
    // at zero; s is only sized here.
    if (v.size() == 0)
    {
        m.copy_size(params_grad);
        m = 0;
        v.copy_size(params_grad);
        v = 0;
        s.copy_size(params_grad);
    }

    ++t;

    const double base_lr = learning_rate*get_learning_rate_multiplier(l);
    const double base_wd = weight_decay*get_weight_decay_multiplier(l);

    const bool bias_uses_same_rates =
        l.get_bias_learning_rate_multiplier() == 1 &&
        l.get_bias_weight_decay_multiplier() == 1;

    if (bias_uses_same_rates)
    {
        // One pass over the whole parameter tensor.
        tt::compute_adam_update(0, params.size(), s, m, v, t,
            base_lr, base_wd,
            momentum1, momentum2, params, params_grad);
        return;
    }

    // Non-bias part of the parameters.
    tt::compute_adam_update(0, bias_offset, s, m, v, t,
        base_lr, base_wd,
        momentum1, momentum2, params, params_grad);

    // Bias part, with the bias multipliers applied on top.
    tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
        base_lr*l.get_bias_learning_rate_multiplier(),
        base_wd*l.get_bias_weight_decay_multiplier(),
        momentum1, momentum2, params, params_grad);
}
resizable_tensor m;
resizable_tensor v;
resizable_tensor s;
......
......@@ -78,6 +78,15 @@ namespace dlib
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate and weight decay multipliers
to be applied to the bias parameters of fc_, con_, and bn_ layers.
!*/
public:
......@@ -123,6 +132,15 @@ namespace dlib
paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate and weight decay multipliers
to be applied to the bias parameters of fc_, con_, and bn_ layers.
!*/
public:
......
......@@ -240,6 +240,42 @@ namespace dlib { namespace tt
#endif
}
// Forwards the range-limited affine transform to the CUDA implementation when
// DLIB_USE_CUDA is defined and to the CPU implementation otherwise.  All
// arguments are passed through unchanged.
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
#ifdef DLIB_USE_CUDA
cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#else
cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
#endif
}
// Three-source affine transform without a constant term.  Implemented as the
// range-limited transform applied to the whole of dest, using the CUDA backend
// when DLIB_USE_CUDA is defined and the CPU backend otherwise.
void affine_transform(
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    const float A,
    const float B,
    const float C
)
{
    // Cover every element of dest: the half open range [0, dest.size()).
    const auto range_end = dest.size();
#ifdef DLIB_USE_CUDA
    cuda::affine_transform_range(0, range_end, dest, src1, src2, src3, A, B, C);
#else
    cpu::affine_transform_range(0, range_end, dest, src1, src2, src3, A, B, C);
#endif
}
// ----------------------------------------------------------------------------------------
void affine_transform(
......@@ -275,6 +311,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -288,10 +326,10 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#else
cpu::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#endif
}
......@@ -299,6 +337,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -308,13 +347,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
......@@ -327,13 +367,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -346,15 +387,16 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -364,13 +406,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
......@@ -383,13 +426,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -402,9 +446,9 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
......
......@@ -229,13 +229,58 @@ namespace dlib { namespace tt
const float D
);
/*!
requires - dest.size()==src1.size()
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3 + D
!*/
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
ensures
- #dest == A*src1 + B*src2 + C*src3
!*/
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- begin <= end <= dest.size()
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the half open range [begin,end) rather than processing the entire tensor.
Specifically, it does this:
- for i in the range [begin, end):
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
!*/
// ----------------------------------------------------------------------------------------
void affine_transform(
......@@ -290,6 +335,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -309,19 +356,22 @@ namespace dlib { namespace tt
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
- begin <= end <= params.size()
ensures
- This function implements the ADAM parameter update method described in the paper:
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
optimization." International Conference on Learning Representations. 2015.
Specifically, it implements the method shown as Algorithm 1.
- #s is the update vector that should be added to the parameters.
- The function only operates in the half open range [begin,end) of the memory
blocks of each tensor. E.g. to make this function run on the entire tensor
set begin to 0 and end to params.size().
!*/
// ----------------------------------------------------------------------------------------
const double BATCH_NORM_EPS = 0.00001;
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -331,6 +381,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == src.nr()
- gamma.nc() == src.nc()
......@@ -342,11 +393,12 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize() would if src had means
and variances as given by running_means and running_variances. That is, this
function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does it in a pointwise fashion over the samples in src.
!*/
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -359,6 +411,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples() == 1
- beta.num_samples() == 1
......@@ -384,6 +437,7 @@ namespace dlib { namespace tt
!*/
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -395,8 +449,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize(dest,means,invstds,src,gamma,beta)
batch_normalize(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
......@@ -410,7 +465,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize(dest,means,invstds,src,gamma,beta))
batch_normalize(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
......@@ -419,6 +474,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -428,6 +484,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == 1
- gamma.nc() == 1
......@@ -439,12 +496,13 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize_conv() would if src had
means and variances as given by running_means and running_variances. That
is, this function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does this in a pointwise fashion over the samples, rows, and
columns in src.
!*/
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -457,6 +515,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples()==gamma.nr()==gamma.nc() == 1
- beta.num_samples() ==beta.nr() ==beta.nc() == 1
......@@ -478,6 +537,7 @@ namespace dlib { namespace tt
!*/
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -489,8 +549,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize_conv(dest,means,invstds,src,gamma,beta)
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
......@@ -502,7 +563,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize_conv(dest,means,invstds,src,gamma,beta))
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
......
......@@ -526,8 +526,7 @@ namespace dlib
label_type pick_which_run_update;
job_t next_job;
std::vector<std::future<double>> losses(devices.size());
std::vector<std::future<void>> update_futs(devices.size());
std::vector<dlib::future<double>> losses(devices.size());
std::vector<tt::multi_device_tensor_averager> averagers;
// An array of all the parameter tensors in the first network. We will
......@@ -536,6 +535,16 @@ namespace dlib
std::vector<tensor*> reference_params;
visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
// We make separate thread pools with just one thread in them because we want
// to make sure each device is always executed on the same thread. We care
// about this because there are thread_local context variables for some cuda
// components and they get regenerated when the current cuda device changes.
// Recreating them over and over is somewhat expensive so we want to avoid
// that.
std::vector<std::shared_ptr<thread_pool>> tp;
for (size_t i = 0; i < devices.size(); ++i)
tp.push_back(std::make_shared<thread_pool>(1));
size_t iteration = 0;
while(job_pipe.dequeue(next_job))
......@@ -545,7 +554,7 @@ namespace dlib
// right version for unsupervised or supervised training based on the type
// of label_type.
for (size_t i = 0; i < devices.size(); ++i)
losses[i] = std::async(std::launch::async,[&,i](){ return compute_parameter_gradients(i, next_job, pick_which_run_update); });
tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
// aggregate loss values from all the network computations.
double theloss = 0;
for (auto&& loss : losses)
......@@ -596,10 +605,10 @@ namespace dlib
// Now apply all the updates to each device.
for (size_t i = 0; i < devices.size(); ++i)
update_futs[i] = std::async(std::launch::async, [&,i](){ if (next_job.have_data[i]) update_parameters(i); });
tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
// and wait for the updates to all happen.
for (auto&& f : update_futs)
f.wait();
for (size_t i = 0; i < devices.size(); ++i)
tp[i]->wait_for_all_tasks();
// Every now and then force all the parameters to be the same just to make
......
......@@ -482,7 +482,7 @@ namespace dlib
<< "\n\t x_upper.size(): " << x_upper.size()
);
DLIB_ASSERT (
min(x_upper-x_lower) > 0,
min(x_upper-x_lower) >= 0,
"\tdouble find_min_box_constrained()"
<< "\n\t You have to supply proper box constraints to this function."
<< "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
......@@ -610,7 +610,7 @@ namespace dlib
<< "\n\t x_upper.size(): " << x_upper.size()
);
DLIB_ASSERT (
min(x_upper-x_lower) > 0,
min(x_upper-x_lower) >= 0,
"\tdouble find_max_box_constrained()"
<< "\n\t You have to supply proper box constraints to this function."
<< "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
......
......@@ -297,7 +297,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) > 0
- min(x_upper-x_lower) >= 0
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
- Performs a box constrained minimization of the function f() using the given
......@@ -391,7 +391,7 @@ namespace dlib
- is_col_vector(x_upper) == true
- x.size() == x_lower.size() == x_upper.size()
(i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
- min(x_upper-x_lower) > 0
- min(x_upper-x_lower) >= 0
(i.e. x_upper must contain upper bounds relative to x_lower)
ensures
- Performs a box constrained maximization of the function f() using the given
......
......@@ -165,13 +165,13 @@ namespace
resizable_tensor running_means;
resizable_tensor running_variances;
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples())/(src.num_samples()-1.0);
// Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
running_variances = mat(running_variances)/scale;
batch_normalize_inference(dest2, src, gamma, beta, running_means, running_variances);
batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST_MSG(max(abs(mat(dest2)-mat(dest))) < 1e-5, max(abs(mat(dest2)-mat(dest))));
cpu::batch_normalize_inference(dest3, src, gamma, beta, running_means, running_variances);
cpu::batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
DLIB_TEST_MSG(max(abs(mat(dest3)-mat(dest))) < 1e-5, max(abs(mat(dest3)-mat(dest))));
......@@ -179,7 +179,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
......@@ -191,7 +191,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
......@@ -203,7 +203,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
......@@ -220,7 +220,7 @@ namespace
gamma_grad = 8;
beta_grad = 8;
batch_normalize_gradient(gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
auto grad_error = compare_gradients(src_grad, grad_src);
dlog << LINFO << "src error: " << grad_error;
......@@ -250,14 +250,14 @@ namespace
resizable_tensor running_means;
resizable_tensor running_variances;
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples()*src.nr()*src.nc())/(src.num_samples()*src.nr()*src.nc()-1.0);
// Turn back into biased variance estimate because that's how
// batch_normalize_conv() works, so if we want to match it this is necessary.
running_variances = mat(running_variances)/scale;
batch_normalize_conv_inference(dest2, src, gamma, beta, running_means, running_variances);
batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST(max(abs(mat(dest2)-mat(dest))) < 1e-5);
cpu::batch_normalize_conv_inference(dest3, src, gamma, beta, running_means, running_variances);
cpu::batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
DLIB_TEST(max(abs(mat(dest3)-mat(dest))) < 1e-5);
......@@ -265,7 +265,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
......@@ -277,7 +277,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
......@@ -289,7 +289,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
......@@ -307,7 +307,7 @@ namespace
gamma_grad = 9;
beta_grad = 9;
batch_normalize_conv_gradient(gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
auto grad_error = compare_gradients(src_grad, grad_src);
......@@ -662,11 +662,11 @@ namespace
rnd.fill_uniform(params_grad);
resizable_tensor mm(m), vv(v);
cpu::compute_adam_update(s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
cpu::compute_adam_update(0,params.size(),s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
matrix<float> s1 = mat(s);
rnd.fill_uniform(s);
cuda::compute_adam_update(s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
cuda::compute_adam_update(0,params.size(),s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
matrix<float> s2 = mat(s);
DLIB_TEST_MSG(max(abs(s1-s2)) < 1e-6, max(abs(s1-s2)));
......@@ -775,6 +775,27 @@ namespace
cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 5);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform(dest, src, srcb, srcc, 2, 3, 4, 0);
cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 0);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform_range(0, dest.size(), dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(0, dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
if (3 < dest.size())
{
dest = 999;
dest2 = 999;
cuda::affine_transform_range(3, dest.size()-1, dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(3, dest2.size()-1, dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::affine_transform_range(dest.size(), dest.size(), dest, src, srcb, srcc, 2, 3, 4);
cpu::affine_transform_range(dest2.size(), dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
DLIB_TEST(equal(mat(dest),mat(dest2)));
}
rnd.fill_uniform(dest);
rnd.fill_uniform(src);
......@@ -863,8 +884,8 @@ namespace
rnd.fill_uniform(src);
cpu::batch_normalize(dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
cuda::batch_normalize(dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);
cpu::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
cuda::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
......@@ -890,8 +911,8 @@ namespace
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
cpu::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
......@@ -917,8 +938,8 @@ namespace
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cpu::batch_normalize_conv(dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
cuda::batch_normalize_conv(dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);
cpu::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
cuda::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
......@@ -942,8 +963,8 @@ namespace
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
cpu::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
......@@ -1318,6 +1339,72 @@ namespace
DLIB_TEST(net2.subnet().subnet().subnet().layer_details().get_num_outputs() == 4);
}
// ----------------------------------------------------------------------------------------
// Core convolutional unit of a residual block.  Reading from SUBNET outwards
// the layers are: 3x3 convolution applied with the given stride, BN, relu,
// 3x3 convolution with stride 1, BN.  N is the number of filters used by both
// convolutions and BN is the layer template wrapped around each of them.
template <int N, template <typename> class BN, int stride, typename SUBNET>
using block =
    BN<con<N, 3, 3, 1, 1,
    relu<
    BN<con<N, 3, 3, stride, stride,
    SUBNET>>>>>;
// Wraps BLOCK in a skip connection.  tag1 marks the tensor coming out of
// SUBNET and add_prev1 adds that tagged tensor to the output of the block.
// The block is always instantiated with a stride of 1 here.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual = add_prev1<BLOCK<N, BN, 1, tag1<SUBNET>>>;
// Downsampling counterpart of residual.  The block runs with a stride of 2 and
// its output is marked with tag2.  skip1 then re-emits the tensor tagged by
// tag1 (the input from SUBNET), avg_pool<2,2,2,2> downsamples it, and add_prev2
// adds the tag2 output to the pooled tensor.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual_down =
    add_prev2<
    avg_pool<2, 2, 2, 2,
    skip1<
    tag2<BLOCK<N, BN, 2,
    tag1<SUBNET>>>>>>;
// Concrete 8-filter residual layers, each followed by a relu.  res and
// res_down are built on bn_con, ares and ares_down on affine; the *_down
// variants use residual_down instead of residual.
template <typename SUBNET>
using res = relu<residual<block, 8, bn_con, SUBNET>>;

template <typename SUBNET>
using ares = relu<residual<block, 8, affine, SUBNET>>;

template <typename SUBNET>
using res_down = relu<residual_down<block, 8, bn_con, SUBNET>>;

template <typename SUBNET>
using ares_down = relu<residual_down<block, 8, affine, SUBNET>>;
// Residual layer written out by hand with prelu activations: two rounds of
// 8-filter 3x3 stride-1 convolution followed by bn_con, a prelu between them,
// the tag1 input added back by add_prev1, and a final prelu on top.
template <typename SUBNET>
using pres =
    prelu<add_prev1<
    bn_con<con<8, 3, 3, 1, 1,
    prelu<
    bn_con<con<8, 3, 3, 1, 1,
    tag1<SUBNET>>>>>>>>>;
void test_visit_funcions()
{
using net_type2 = loss_multiclass_log<fc<10,
avg_pool_everything<
pres<res<res<res_down< // 2 prelu layers here
tag4<repeat<9,pres, // 9 groups, each containing 2 prelu layers
res_down<
res<
input<matrix<unsigned char>>
>>>>>>>>>>>;
net_type2 pnet;
DLIB_CASSERT(pnet.num_layers == 131, pnet.num_layers);
DLIB_CASSERT(pnet.num_computational_layers == 109, pnet.num_computational_layers);
std::vector<bool> hit(pnet.num_computational_layers, false);
size_t count = 0;
visit_layer_parameter_gradients(pnet, [&](size_t i, tensor& ){hit[i] = true; ++count; });
for (auto x : hit)
DLIB_TEST(x);
DLIB_TEST(count == pnet.num_computational_layers);
count = 0;
std::vector<bool> hit2(pnet.num_computational_layers, false);
visit_layer_parameters(pnet, [&](size_t i, tensor& ){hit2[i] = true; ++count; });
for (auto x : hit2)
DLIB_TEST(x);
DLIB_TEST(count == pnet.num_computational_layers);
}
// ----------------------------------------------------------------------------------------
class dnn_tester : public tester
......@@ -1378,6 +1465,7 @@ namespace
test_batch_normalize_conv();
test_basic_tensor_ops();
test_layers();
test_visit_funcions();
}
} a;
......
......@@ -20,29 +20,76 @@ using namespace dlib;
// ----------------------------------------------------------------------------------------
// Let's start by showing how you can conveniently define large networks. The
// most important tool for doing this are C++'s alias templates. These let us
// define new layer types that are combinations of a bunch of other layers.
// These will form the building blocks for more complex networks.
// Let's start by showing how you can conveniently define large and complex
// networks. The most important tools for doing this are C++'s alias templates.
// These let us define new layer types that are combinations of a bunch of other
// layers. These will form the building blocks for more complex networks.
// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun). You can see a few things in this statement. The most obvious is
// that we have combined a bunch of layers into the name "base_res". You can
// also see the use of the tag1 layer. This layer doesn't do any computation.
// It exists solely so other layers can refer to it. In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer. This combination allows us to
// implement skip and residual style networks. We have also made base_res
// parameterized by BN, which will let us insert different batch normalization
// layers.
template <template <typename> class BN, typename SUBNET>
using base_res = relu<add_prev1<BN<con<8,3,3,1,1,relu<BN<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
// We also want a residual block that begins by doing downsampling. We can
// reuse base_res to define it like this:
template <template <typename> class BN, typename SUBNET>
using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// and Sun). We are going to decompose the residual block into a few alias
// statements. First, we define the core block.
// Here we have parameterized the "block" layer on a BN layer (nominally some
// kind of batch normalization), the number of filter outputs N, and the stride
// the block operates at.
template <int N, template <typename> class BN, int stride, typename SUBNET>
using block =
    BN<con<N, 3, 3, 1, 1,               // second 3x3 convolution, always stride 1
    relu<
    BN<con<N, 3, 3, stride, stride,     // first 3x3 convolution, carries the stride
    SUBNET>>>>>;
// Next, we need to define the skip layer mechanism used in the residual network
// paper. They create their blocks by adding the input tensor to the output of
// each block. So we define an alias statement that takes a block and wraps it
// with this skip/add structure.
// Note the tag layer. This layer doesn't do any computation. It exists solely
// so other layers can refer to it. In this case, the add_prev1 layer looks for
// the tag1 layer and will take the tag1 output and add it to the input of the
// add_prev1 layer. This combination allows us to implement skip and residual
// style networks. We have also set the block stride to 1 in this statement.
// The significance of that is explained next.
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual =
    add_prev1<              // adds the tag1 tensor to the block's output
    BLOCK<N, BN, 1,         // the wrapped block, instantiated with stride 1
    tag1<SUBNET>>>;         // marks the input so add_prev1 can find it
// Some residual blocks do downsampling. They do this by using a stride of 2
// instead of 1. However, when downsampling we need to also take care to
// downsample the part of the network that adds the original input to the output
// or the sizes won't make sense (the network will still run, but the results
// aren't as good). So here we define a downsampling version of residual. In
// it, we make use of the skip1 layer. This layer simply outputs whatever is
// output by the tag1 layer. Therefore, the skip1 layer (there are also skip2,
// skip3, etc. in dlib) allows you to create branching network structures.
// residual_down creates a network structure like this:
/*
input from SUBNET
/ \
/ \
block downsample(using avg_pool)
\ /
\ /
add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
|
output
*/
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int N,
    template <typename> class BN,
    typename SUBNET
    >
using residual_down =
    add_prev2<              // adds the tag2 tensor to avg_pool's output
    avg_pool<2, 2, 2, 2,    // downsamples the tensor re-emitted by skip1
    skip1<                  // outputs whatever the tag1 layer output
    tag2<BLOCK<N, BN, 2,    // block run with stride 2, output marked by tag2
    tag1<SUBNET>>>>>>;      // marks the input from SUBNET
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
......@@ -50,10 +97,10 @@ using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// networks.
template <typename SUBNET> using res = base_res<bn_con,SUBNET>;
template <typename SUBNET> using ares = base_res<affine,SUBNET>;
template <typename SUBNET> using res_down = base_res_down<bn_con,SUBNET>;
template <typename SUBNET> using ares_down = base_res_down<affine,SUBNET>;
template <typename SUBNET>
using res = relu<residual<block, 8, bn_con, SUBNET>>;

template <typename SUBNET>
using ares = relu<residual<block, 8, affine, SUBNET>>;

template <typename SUBNET>
using res_down = relu<residual_down<block, 8, bn_con, SUBNET>>;

template <typename SUBNET>
using ares_down = relu<residual_down<block, 8, affine, SUBNET>>;
......@@ -145,39 +192,41 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's
// long, but you get the idea):
/*
The pnet has 127 layers in it.
The pnet has 131 layers in it.
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10)
layer<1> fc (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<5> bn_con
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<5> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<8> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<10> tag1
...
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<34> tag1
layer<35> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<36> tag4
layer<37> prelu (initial_param_value=0.3)
layer<38> add_prev
layer<39> bn_con
layer<34> relu
layer<35> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<36> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<37> tag1
layer<38> tag4
layer<39> prelu (initial_param_value=0.3)
layer<40> add_prev
layer<41> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
...
layer<115> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<116> tag1
layer<117> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<118> relu
layer<119> add_prev
layer<120> bn_con
layer<121> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<119> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<120> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<121> tag1
layer<122> relu
layer<123> bn_con
layer<124> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<125> tag1
layer<126> input<matrix>
layer<123> add_prev
layer<124> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<125> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<126> relu
layer<127> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
layer<128> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
layer<129> tag1
layer<130> input<matrix>
*/
// Now that we know the index numbers for each layer, we can access them
......@@ -195,7 +244,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say:
layer<tag4,1>(pnet); // Equivalent to layer<36+1>(pnet).
layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet).
// Or to access the layer 2 layers after tag4:
layer<tag4,2>(pnet);
......
......@@ -42,6 +42,12 @@ int main()
try
{
cv::VideoCapture cap(0);
if (!cap.isOpened())
{
cerr << "Unable to connect to camera" << endl;
return 1;
}
image_window win;
// Load face detection and pose estimation models.
......
Hi Davis,
thanks for your work on dlib!
I have created a natvis file to have nicer debugger visualization of dlib matrices in Visual Studio (2012 - …) and I just wanted to share it with you.
To test it, copy the file into your folder %USERPROFILE%\My Documents\Visual Studio 2015\Visualizers or %VSINSTALLDIR%\Common7\Packages\Debugger\Visualizers as described here https://msdn.microsoft.com/en-us/library/jj620914.aspx
It’s certainly extendable, especially to include it into image watch, but currently it may help users to debug much faster.
Feel free to share it.
Best,
Johannes Huber
<?xml version="1.0" encoding="utf-8"?>
<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
<!-- dlib matrix debugger visualization in Visual Studio-->
<!-- Johannes Huber, SAFEmine Part of Hexagon -->
<!-- no warranty -->
<!-- general dlib::matrix fixed size: both dimensions are taken from the
     template arguments matched by the wildcards ($T1 is used as the element
     type, $T2 and $T3 as the two dimensions), and $T2 * $T3 elements are
     listed starting at data.data. -->
<Type Name="dlib::matrix&lt;*,*,*,*&gt;">
<DisplayString>{{ size= &lt;{$T2}&gt; x &lt;{$T3}&gt; }}</DisplayString>
<Expand>
<ArrayItems>
<Size>$T2 * $T3</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix with run-time rows and fixed cols: the row count is
     read from data.nr_ and the column count is the template argument $T2.
     The elements are only expanded when data.data is non-null. -->
<Type Name="dlib::matrix&lt;*,0,*,*&gt;">
<DisplayString>{{ size={data.nr_} x &lt;{$T2}&gt; }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>data.nr_ * $T2</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix with fixed rows and run-time cols: the row count is
     the template argument $T2 and the column count is read from data.nc_.
     The elements are only expanded when data.data is non-null. -->
<Type Name="dlib::matrix&lt;*,*,0,*&gt;">
<DisplayString>{{ size= &lt;{$T2}&gt; x {data.nc_} }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>$T2 * data.nc_</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
<!-- general dlib::matrix dynamic size: both dimensions are read at debug time
     from data.nr_ (rows) and data.nc_ (cols).  The summary shows rows x cols,
     and the elements are only expanded when data.data is non-null. -->
<Type Name="dlib::matrix&lt;*,0,0,*&gt;">
<DisplayString>{{ size= {data.nr_} x {data.nc_} }}</DisplayString>
<Expand>
<ArrayItems Condition="data.data != 0">
<Size>data.nr_*data.nc_</Size>
<ValuePointer>($T1*)data.data</ValuePointer>
</ArrayItems>
</Expand>
</Type>
</AutoVisualizer>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment