Commit 7f77ec65 authored by Davis King

Made the batch normalization epsilon user-settable rather than hard-coded.

parent b92b226c
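For context, a minimal usage sketch of what this change enables (hypothetical values; bn_, FC_MODE, get_eps(), and get_running_stats_window_size() are the names that appear in the diff below):

#include <dlib/dnn.h>
#include <iostream>

int main()
{
    // After this commit the bn_ layer takes an optional epsilon in addition
    // to the running-stats window size; 1e-4 here is just an example value.
    dlib::bn_<dlib::FC_MODE> layer(1000, 1e-4);
    std::cout << "window: " << layer.get_running_stats_window_size() << "\n";
    std::cout << "eps:    " << layer.get_eps() << "\n";
    return 0;
}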
......@@ -531,6 +531,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -546,7 +547,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -565,7 +567,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -581,7 +584,7 @@ namespace dlib
{
for (long k = 0; k < num; ++k)
{
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
++d;
++s;
}
......@@ -589,6 +592,7 @@ namespace dlib
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -609,7 +613,8 @@ namespace dlib
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -620,7 +625,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -662,7 +668,7 @@ namespace dlib
else
rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
p_invstds[i] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
......@@ -689,6 +695,7 @@ namespace dlib
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -709,6 +716,7 @@ namespace dlib
DLIB_CASSERT(num == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
beta_grad = 0;
gamma_grad = 0;
......@@ -784,6 +792,7 @@ namespace dlib
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -799,7 +808,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -818,7 +828,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -834,7 +845,7 @@ namespace dlib
{
for (long k = 0; k < src.k(); ++k)
{
const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
const float invstd = 1.0f/std::sqrt(v[k] + eps);
for (long j = 0; j < num; ++j)
{
*d = g[k]*(*s - m[k])*invstd + b[k];
......@@ -846,6 +857,7 @@ namespace dlib
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -868,7 +880,8 @@ namespace dlib
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -879,7 +892,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
......@@ -927,7 +941,7 @@ namespace dlib
else
rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
p_invstds[k] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
......@@ -955,6 +969,7 @@ namespace dlib
}
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -975,6 +990,7 @@ namespace dlib
DLIB_CASSERT(src.k() == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
beta_grad = 0;
gamma_grad = 0;
......
......@@ -131,6 +131,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -140,6 +141,7 @@ namespace dlib
);
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -152,6 +154,7 @@ namespace dlib
);
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -163,6 +166,7 @@ namespace dlib
);
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -172,6 +176,7 @@ namespace dlib
);
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -184,6 +189,7 @@ namespace dlib
);
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......
......@@ -338,6 +338,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -353,7 +354,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -372,7 +374,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -393,10 +396,11 @@ namespace dlib
beta.device(),
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
eps));
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -417,7 +421,8 @@ namespace dlib
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -428,7 +433,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
......@@ -455,12 +461,13 @@ namespace dlib
averaging_factor,
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
void batch_normalize_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -480,6 +487,7 @@ namespace dlib
DLIB_CASSERT(num == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
const float in_scale = 1;
const float out_scale = 1;
......@@ -503,7 +511,7 @@ namespace dlib
gamma.device(),
gamma_grad.device(),
beta_grad.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
......@@ -511,6 +519,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -526,7 +535,8 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances),
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -545,7 +555,8 @@ namespace dlib
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -566,10 +577,11 @@ namespace dlib
beta.device(),
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
eps));
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -592,7 +604,8 @@ namespace dlib
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k(),
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -603,7 +616,8 @@ namespace dlib
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
const float in_scale = 1;
const float out_scale = 0;
......@@ -629,12 +643,13 @@ namespace dlib
averaging_factor,
running_means.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -653,6 +668,7 @@ namespace dlib
DLIB_CASSERT(src.k() == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
DLIB_CASSERT(eps > 0,"");
const float in_scale = 1;
const float out_scale = 1;
......@@ -676,7 +692,7 @@ namespace dlib
gamma.device(),
gamma_grad.device(),
beta_grad.device(),
dlib::tt::BATCH_NORM_EPS,
eps,
means.device(),
invstds.device()));
}
......
......@@ -135,6 +135,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -144,6 +145,7 @@ namespace dlib
);
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -156,6 +158,7 @@ namespace dlib
);
void batch_normalize_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -169,6 +172,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -178,6 +182,7 @@ namespace dlib
);
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -190,6 +195,7 @@ namespace dlib
);
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......
......@@ -650,23 +650,30 @@ namespace dlib
FC_MODE = 1
};
const double DEFAULT_BATCH_NORM_EPS = 0.00001;
template <
layer_mode mode
>
class bn_
{
public:
explicit bn_(unsigned long window_size) :
explicit bn_(
unsigned long window_size,
double eps_ = DEFAULT_BATCH_NORM_EPS
) :
num_updates(0),
running_stats_window_size(window_size),
learning_rate_multiplier(1),
weight_decay_multiplier(0)
weight_decay_multiplier(0),
eps(eps_)
{}
bn_() : bn_(1000) {}
layer_mode get_mode() const { return mode; }
unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
double get_eps() const { return eps; }
double get_learning_rate_multiplier () const { return learning_rate_multiplier; }
double get_weight_decay_multiplier () const { return weight_decay_multiplier; }
......@@ -713,16 +720,16 @@ namespace dlib
if (num_updates < running_stats_window_size)
++num_updates;
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_variances);
tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_variances);
tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
}
}
......@@ -733,9 +740,9 @@ namespace dlib
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
if (mode == FC_MODE)
tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
else
tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
}
const tensor& get_layer_params() const { return params; }
......@@ -758,6 +765,7 @@ namespace dlib
serialize(item.running_stats_window_size, out);
serialize(item.learning_rate_multiplier, out);
serialize(item.weight_decay_multiplier, out);
serialize(item.eps, out);
}
friend void deserialize(bn_& item, std::istream& in)
......@@ -798,12 +806,13 @@ namespace dlib
// We also need to flip the running_variances around since the previous
// format saved the inverse standard deviations instead of variances.
item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
item.running_variances = 1.0f/squared(mat(item.running_variances)) - DEFAULT_BATCH_NORM_EPS;
}
else if (version == "bn_con2" || version == "bn_fc2")
{
deserialize(item.learning_rate_multiplier, in);
deserialize(item.weight_decay_multiplier, in);
deserialize(item.eps, in);
}
else
{
......@@ -811,6 +820,8 @@ namespace dlib
// implicitly 1.
item.learning_rate_multiplier = 1;
item.weight_decay_multiplier = 1;
item.eps = DEFAULT_BATCH_NORM_EPS;
}
}
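A small standalone check (a sketch, not part of the patch) of the legacy conversion handled above: the old serialization format stored 1/sqrt(variance + eps), so the variance is recovered as 1/invstd^2 - eps, with eps fixed at the old hard-coded value of 1e-5.

#include <cmath>
#include <cassert>

int main()
{
    const double eps    = 1e-5;                         // the old hard-coded BATCH_NORM_EPS
    const double var    = 2.5;                          // some true variance
    const double invstd = 1.0 / std::sqrt(var + eps);   // what the old format stored
    const double recovered = 1.0 / (invstd * invstd) - eps;
    assert(std::abs(recovered - var) < 1e-9);            // round-trips back to var
    return 0;
}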
......@@ -820,6 +831,7 @@ namespace dlib
out << "bn_con ";
else
out << "bn_fc ";
out << " eps="<<item.eps;
out << " learning_rate_mult="<<item.learning_rate_multiplier;
out << " weight_decay_mult="<<item.weight_decay_multiplier;
return out;
......@@ -837,6 +849,7 @@ namespace dlib
unsigned long running_stats_window_size;
double learning_rate_multiplier;
double weight_decay_multiplier;
double eps;
};
template <typename SUBNET>
......@@ -1273,7 +1286,7 @@ namespace dlib
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+tt::BATCH_NORM_EPS));
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps()));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
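The hunk above is where a bn_ layer gets folded into an affine transform; a quick numerical sanity check (a sketch with made-up values) that g = gamma/sqrt(v+eps) and b = beta - g*m reproduce inference-mode batch normalization:

#include <cmath>
#include <cassert>

int main()
{
    const double gamma = 1.3, beta = -0.2, m = 0.7, v = 2.0, eps = 1e-5;
    const double g = gamma / std::sqrt(v + eps);   // fused scale
    const double b = beta - g * m;                 // fused offset
    const double x = 3.1;                          // an arbitrary input value
    const double fused = g * x + b;
    const double bn    = gamma * (x - m) / std::sqrt(v + eps) + beta;
    assert(std::abs(fused - bn) < 1e-12);          // the two forms agree
    return 0;
}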
......
......@@ -818,6 +818,8 @@ namespace dlib
FC_MODE = 1 // fully connected mode
};
const double DEFAULT_BATCH_NORM_EPS = 0.00001;
template <
layer_mode mode
>
......@@ -857,17 +859,22 @@ namespace dlib
- #get_running_stats_window_size() == 1000
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_eps() == DEFAULT_BATCH_NORM_EPS
!*/
explicit bn_(
unsigned long window_size
unsigned long window_size,
double eps = DEFAULT_BATCH_NORM_EPS
);
/*!
requires
- eps > 0
ensures
- #get_mode() == mode
- #get_running_stats_window_size() == window_size
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
- #get_eps() == eps
!*/
layer_mode get_mode(
......@@ -886,6 +893,15 @@ namespace dlib
normalization after a convolutional layer you should use CONV_MODE.
!*/
double get_eps(
) const;
/*!
ensures
- When doing batch normalization, we divide by the standard deviation. The
epsilon value returned by this function is added to the variance so that the
division never divides by zero.
!*/
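A tiny illustration (a sketch, not library code) of why the value matters: a constant, never-varying channel has a batch variance of exactly zero, and eps is the only thing keeping the normalization finite.

#include <cmath>
#include <cstdio>

int main()
{
    const double var = 0.0;     // a channel whose activations never vary
    const double eps = 1e-5;    // DEFAULT_BATCH_NORM_EPS in this commit
    const double invstd = 1.0 / std::sqrt(var + eps);
    std::printf("invstd = %g\n", invstd);   // finite (about 316.23) instead of inf
    return 0;
}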
unsigned long get_running_stats_window_size (
) const;
/*!
......
......@@ -337,6 +337,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -346,13 +347,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
......@@ -365,13 +367,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -384,15 +387,16 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -402,13 +406,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
#endif
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& vars,
......@@ -421,13 +426,14 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -440,9 +446,9 @@ namespace dlib { namespace tt
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#else
cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
#endif
}
......
......@@ -370,9 +370,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
const double BATCH_NORM_EPS = 0.00001;
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -382,6 +381,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == src.nr()
- gamma.nc() == src.nc()
......@@ -393,11 +393,12 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize() would if src had means
and variances as given by running_means and running_variances. That is, this
function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does it in a pointwise fashion over the samples in src.
!*/
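A hedged calling sketch for the updated signature (eps now leads the argument list; the tensor sizes are made up but chosen to satisfy the requires clause above):

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;
    resizable_tensor src(2, 3), dest;                   // 2 samples, k == 3
    resizable_tensor gamma(1, 3), beta(1, 3);
    resizable_tensor running_means(1, 3), running_variances(1, 3);
    src = 1; gamma = 1; beta = 0;
    running_means = 0; running_variances = 1;
    tt::batch_normalize_inference(1e-5, dest, src, gamma, beta,
                                  running_means, running_variances);
    return 0;
}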
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -410,6 +411,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples() == 1
- beta.num_samples() == 1
......@@ -435,6 +437,7 @@ namespace dlib { namespace tt
!*/
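And a corresponding training-mode sketch (again illustrative sizes; passing an averaging_factor of 1 simply reinitializes the running statistics):

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;
    resizable_tensor src(4, 3), dest, means, invstds;   // 4 samples, k == 3
    resizable_tensor gamma(1, 3), beta(1, 3);
    resizable_tensor running_means(1, 3), running_variances(1, 3);
    tt::tensor_rand rnd;
    rnd.fill_gaussian(src);                              // non-constant inputs
    gamma = 1; beta = 0;
    running_means = 0; running_variances = 1;
    tt::batch_normalize(1e-5, dest, means, invstds, 1.0,
                        running_means, running_variances, src, gamma, beta);
    return 0;
}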
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -446,8 +449,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize(dest,means,invstds,src,gamma,beta)
batch_normalize(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
......@@ -461,7 +465,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize(dest,means,invstds,src,gamma,beta))
batch_normalize(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
......@@ -470,6 +474,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
......@@ -479,6 +484,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- gamma.num_samples() == 1
- gamma.nr() == 1
- gamma.nc() == 1
......@@ -490,12 +496,13 @@ namespace dlib { namespace tt
- Linearly transforms src as a call to batch_normalize_conv() would if src had
means and variances as given by running_means and running_variances. That
is, this function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
Note that it does this in a pointwise fashion over the samples, rows, and
columns in src.
!*/
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
......@@ -508,6 +515,7 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- src.num_samples() > 1
- gamma.num_samples() == gamma.nr() == gamma.nc() == 1
- beta.num_samples() == beta.nr() == beta.nc() == 1
......@@ -529,6 +537,7 @@ namespace dlib { namespace tt
!*/
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
......@@ -540,8 +549,9 @@ namespace dlib { namespace tt
);
/*!
requires
- eps > 0
- invstds and means should be the output of a call to
batch_normalize_conv(dest,means,invstds,src,gamma,beta)
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
- have_same_dimensions(gradient_input, src) == true
- have_same_dimensions(src, src_grad) == true
- src.num_samples() > 1
......@@ -553,7 +563,7 @@ namespace dlib { namespace tt
- have_same_dimensions(invstds, gamma) == true
ensures
- Let f(src,gamma,beta) == dot(gradient_input, dest output of
batch_normalize_conv(dest,means,invstds,src,gamma,beta))
batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
- Adds the gradient of f() with respect to src to #src_grad.
- Assigns the gradient of f() with respect to gamma to #gamma_grad.
- Assigns the gradient of f() with respect to beta to #beta_grad.
......