Commit 6acddf99 authored by Davis King

Just renamed variables to reflect the new meaning of the batch normalization running variance output.
parent 538de238
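The rename tracks a semantic change: the tensor that used to hold running inverse standard deviations now holds running variances, so inference divides by sqrt(variance + eps) instead of multiplying by a stored invstd. A minimal, self-contained sketch of the new per-element computation (names and values are illustrative, not part of the commit):

// Sketch only: inference-time scaling with a stored running variance.
#include <cmath>
#include <cstdio>

int main()
{
    const float eps   = 1e-5f;   // stands in for dlib::tt::BATCH_NORM_EPS
    const float gamma = 2.0f, beta = 0.5f;
    const float running_mean = 1.0f, running_variance = 4.0f;
    const float x = 3.0f;

    // New convention: derive the scale from the stored variance.
    const float y = gamma*(x - running_mean)/std::sqrt(running_variance + eps) + beta;
    // The old convention stored invstd = 1/sqrt(variance + eps) directly, so the same
    // result was written as gamma*(x - running_mean)*invstd + beta.
    std::printf("%f\n", y);
    return 0;
}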
......@@ -466,7 +466,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
DLIB_CASSERT(
......@@ -476,7 +476,7 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_invstds),
have_same_dimensions(gamma, running_variances),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -489,10 +489,10 @@ namespace dlib
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_invstds.num_samples(): " << running_invstds.num_samples() <<
"\nrunning_invstds.k(): " << running_invstds.k() <<
"\nrunning_invstds.nr(): " << running_invstds.nr() <<
"\nrunning_invstds.nc(): " << running_invstds.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
......@@ -504,14 +504,14 @@ namespace dlib
auto g = gamma.host();
auto b = beta.host();
auto m = running_means.host();
auto i = running_invstds.host();
auto v = running_variances.host();
const long num = src.k()*src.nr()*src.nc();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < num; ++k)
{
*d = g[k]*(*s - m[k])/std::sqrt(i[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
++d;
++s;
}
......@@ -524,7 +524,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -532,7 +532,7 @@ namespace dlib
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_invstds,invstds),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds),"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
......@@ -580,8 +580,9 @@ namespace dlib
invstds.host(); means.host();
// compute variances
running_invstds.copy_size(invstds);
auto rvar = running_invstds.host();
running_variances.copy_size(invstds);
auto rvar = running_variances.host();
// This scale makes the running variances unbiased.
const double scale = (src.num_samples())/(src.num_samples()-1.0);
for (long i = 0; i < num; ++i)
{
......@@ -718,7 +719,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
DLIB_CASSERT(
......@@ -728,7 +729,7 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_invstds),
have_same_dimensions(gamma, running_variances),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -741,10 +742,10 @@ namespace dlib
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_invstds.num_samples(): " << running_invstds.num_samples() <<
"\nrunning_invstds.k(): " << running_invstds.k() <<
"\nrunning_invstds.nr(): " << running_invstds.nr() <<
"\nrunning_invstds.nc(): " << running_invstds.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
......@@ -756,14 +757,14 @@ namespace dlib
auto g = gamma.host();
auto b = beta.host();
auto m = running_means.host();
auto i = running_invstds.host();
auto v = running_variances.host();
const long num = src.nr()*src.nc();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
const float invstd = 1.0f/std::sqrt(i[k] + dlib::tt::BATCH_NORM_EPS);
const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
for (long j = 0; j < num; ++j)
{
*d = g[k]*(*s - m[k])*invstd + b[k];
......@@ -780,7 +781,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -788,7 +789,7 @@ namespace dlib
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_invstds,invstds),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds),"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
......@@ -844,8 +845,9 @@ namespace dlib
p_src = src.host();
// compute variances
running_invstds.copy_size(invstds);
auto rvar = running_invstds.host();
running_variances.copy_size(invstds);
auto rvar = running_variances.host();
// This scale makes the running variances unbiased.
const double scale = (src.num_samples()*num)/(src.num_samples()*num-1.0);
for (long k = 0; k < src.k(); ++k)
{
......
......@@ -120,7 +120,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
void batch_normalize (
......@@ -129,7 +129,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -152,7 +152,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
void batch_normalize_conv (
......@@ -161,7 +161,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......
......@@ -343,7 +343,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
DLIB_CASSERT(
......@@ -353,7 +353,7 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_invstds),
have_same_dimensions(gamma, running_variances),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -366,10 +366,10 @@ namespace dlib
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_invstds.num_samples(): " << running_invstds.num_samples() <<
"\nrunning_invstds.k(): " << running_invstds.k() <<
"\nrunning_invstds.nr(): " << running_invstds.nr() <<
"\nrunning_invstds.nc(): " << running_invstds.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
......@@ -392,7 +392,7 @@ namespace dlib
gamma.device(),
beta.device(),
running_means.device(),
running_invstds.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
}
......@@ -402,7 +402,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -410,7 +410,7 @@ namespace dlib
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_invstds,invstds),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds),"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
......@@ -438,7 +438,7 @@ namespace dlib
means.set_size(1, src.k(), src.nr(), src.nc());
invstds.copy_size(means);
running_means.copy_size(means);
running_invstds.copy_size(means);
running_variances.copy_size(means);
CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
context(),
......@@ -454,7 +454,7 @@ namespace dlib
beta.device(),
averaging_factor,
running_means.device(),
running_invstds.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
means.device(),
invstds.device()));
......@@ -516,7 +516,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
DLIB_CASSERT(
......@@ -526,7 +526,7 @@ namespace dlib
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_invstds),
have_same_dimensions(gamma, running_variances),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
......@@ -539,10 +539,10 @@ namespace dlib
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_invstds.num_samples(): " << running_invstds.num_samples() <<
"\nrunning_invstds.k(): " << running_invstds.k() <<
"\nrunning_invstds.nr(): " << running_invstds.nr() <<
"\nrunning_invstds.nc(): " << running_invstds.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
......@@ -565,7 +565,7 @@ namespace dlib
gamma.device(),
beta.device(),
running_means.device(),
running_invstds.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS));
}
......@@ -575,7 +575,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -583,7 +583,7 @@ namespace dlib
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_invstds,invstds),"");
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds),"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
......@@ -612,7 +612,7 @@ namespace dlib
means.set_size(1, src.k());
invstds.copy_size(means);
running_means.copy_size(means);
running_invstds.copy_size(means);
running_variances.copy_size(means);
CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
context(),
......@@ -628,7 +628,7 @@ namespace dlib
beta.device(),
averaging_factor,
running_means.device(),
running_invstds.device(),
running_variances.device(),
dlib::tt::BATCH_NORM_EPS,
means.device(),
invstds.device()));
......
......@@ -140,7 +140,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
void batch_normalize (
......@@ -149,7 +149,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -174,7 +174,7 @@ namespace dlib
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
void batch_normalize_conv (
......@@ -183,7 +183,7 @@ namespace dlib
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......
......@@ -453,9 +453,9 @@ namespace dlib
beta(params,gamma.size()) = 0;
running_means.copy_size(gamma(params,0));
running_invstds.copy_size(gamma(params,0));
running_variances.copy_size(gamma(params,0));
running_means = 0;
running_invstds = 1;
running_variances = 1;
num_updates = 0;
}
......@@ -470,16 +470,16 @@ namespace dlib
if (num_updates <running_stats_window_size)
++num_updates;
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
tt::batch_normalize(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_invstds);
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_variances);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_invstds);
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_variances);
}
}
......@@ -510,7 +510,7 @@ namespace dlib
serialize(item.means, out);
serialize(item.invstds, out);
serialize(item.running_means, out);
serialize(item.running_invstds, out);
serialize(item.running_variances, out);
serialize(item.num_updates, out);
serialize(item.running_stats_window_size, out);
}
......@@ -539,7 +539,7 @@ namespace dlib
deserialize(item.means, in);
deserialize(item.invstds, in);
deserialize(item.running_means, in);
deserialize(item.running_invstds, in);
deserialize(item.running_variances, in);
deserialize(item.num_updates, in);
deserialize(item.running_stats_window_size, in);
......@@ -551,9 +551,9 @@ namespace dlib
deserialize(_mode, in);
if (mode != (layer_mode)_mode) throw serialization_error("Wrong mode found while deserializing dlib::bn_");
// We also need to flip the running_invstds around since the previous
// We also need to flip the running_variances around since the previous
// format saved the inverse standard deviations instead of variances.
item.running_invstds = 1.0f/squared(mat(item.running_invstds)) - tt::BATCH_NORM_EPS;
item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
}
}
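The back-compatibility conversion above follows from invstd = 1/sqrt(variance + eps), so variance = 1/invstd^2 - eps. A small stand-alone check of that round trip (assumed epsilon value; not part of the commit):

// Sketch only: recovering a variance from an old serialized inverse std.
#include <cmath>
#include <cassert>

int main()
{
    const float eps = 1e-5f;                              // plays the role of tt::BATCH_NORM_EPS
    const float variance = 0.25f;                         // some true variance
    const float invstd = 1.0f/std::sqrt(variance + eps);  // what old models serialized
    const float recovered = 1.0f/(invstd*invstd) - eps;   // the deserialization formula
    assert(std::fabs(recovered - variance) < 1e-4f);
    return 0;
}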
......@@ -564,7 +564,7 @@ namespace dlib
resizable_tensor params;
alias_tensor gamma, beta;
resizable_tensor means, running_means;
resizable_tensor invstds, running_invstds;
resizable_tensor invstds, running_variances;
unsigned long num_updates;
unsigned long running_stats_window_size;
};
......@@ -911,7 +911,7 @@ namespace dlib
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_invstds)+tt::BATCH_NORM_EPS));
g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+tt::BATCH_NORM_EPS));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
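The folding above rewrites the batch-norm transform as a single scale and bias: gamma*(x - m)/sqrt(v + eps) + beta equals g*x + b when g = gamma/sqrt(v + eps) and b = beta - g*m. A tiny stand-alone check with illustrative scalars (not part of the commit):

// Sketch only: verify that folding batch norm into an affine transform is exact.
#include <cmath>
#include <cassert>

int main()
{
    const float eps = 1e-5f, gamma = 1.5f, beta = -0.25f, m = 0.3f, v = 2.0f, x = 0.7f;
    const float g = gamma/std::sqrt(v + eps);   // folded scale
    const float b = beta - g*m;                 // folded bias
    const float bn     = gamma*(x - m)/std::sqrt(v + eps) + beta;
    const float folded = g*x + b;
    assert(std::fabs(bn - folded) < 1e-6f);
    return 0;
}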
......
......@@ -274,13 +274,13 @@ namespace dlib { namespace tt
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_invstds);
cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_invstds);
cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
#endif
}
......@@ -290,16 +290,16 @@ namespace dlib { namespace tt
resizable_tensor& vars,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_invstds,src,gamma,beta);
cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_invstds,src,gamma,beta);
cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
......@@ -330,13 +330,13 @@ namespace dlib { namespace tt
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_invstds);
cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
#else
cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_invstds);
cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
#endif
}
......@@ -346,16 +346,16 @@ namespace dlib { namespace tt
resizable_tensor& vars,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
#ifdef DLIB_USE_CUDA
cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_invstds,src,gamma,beta);
cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#else
cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_invstds,src,gamma,beta);
cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
#endif
}
......
......@@ -294,7 +294,7 @@ namespace dlib { namespace tt
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
/*!
requires
......@@ -304,12 +304,12 @@ namespace dlib { namespace tt
- gamma.k() == src.k()
- have_same_dimensions(gamma, beta)
- have_same_dimensions(gamma, running_means)
- have_same_dimensions(gamma, running_invstds)
- have_same_dimensions(gamma, running_variances)
ensures
- Just linearly transforms src as a call to batch_normalize() would if the resulting
means and invstds were running_means and running_invstds. That is, this function
performs:
dest = gamma*(src-running_means)*running_invstds + beta
- Linearly transforms src as a call to batch_normalize() would if src had means
and variances as given by running_means and running_variances. That is, this
function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
Note that it does it in a pointwise fashion over the samples in src.
!*/
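A small sketch of what that pointwise transform amounts to in the fully connected case, looping over samples and per-channel statistics (a hypothetical plain-array version, not the dlib implementation):

// Sketch only: per-sample, per-element inference-time batch norm.
// k is the number of per-sample elements; gamma, beta, running_means, and
// running_variances have length k; src and dest have length num_samples*k.
#include <cmath>
#include <vector>

void bn_inference_sketch(std::vector<float>& dest, const std::vector<float>& src,
                         const std::vector<float>& gamma, const std::vector<float>& beta,
                         const std::vector<float>& running_means,
                         const std::vector<float>& running_variances,
                         long num_samples, long k, float eps)
{
    dest.resize(src.size());
    for (long n = 0; n < num_samples; ++n)
        for (long i = 0; i < k; ++i)
            dest[n*k + i] = gamma[i]*(src[n*k + i] - running_means[i])
                            / std::sqrt(running_variances[i] + eps) + beta[i];
}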
......@@ -319,7 +319,7 @@ namespace dlib { namespace tt
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -335,7 +335,7 @@ namespace dlib { namespace tt
- 0 <= averaging_factor <= 1
- if (averaging_factor != 1)
- have_same_dimensions(running_means, means) == true
- have_same_dimensions(running_invstds, invstds) == true
- have_same_dimensions(running_variances, invstds) == true
ensures
- have_same_dimensions(#dest, src) == true
- #means.num_samples() == 1
......@@ -347,7 +347,7 @@ namespace dlib { namespace tt
- #means == the mean values of the contents of src.
- #invstds == 1/(the standard deviation values of the contents of src).
- #running_means = (1-averaging_factor)*mat(#running_means) + averaging_factor*mat(#means);
- #running_invstds = (1-averaging_factor)*mat(#running_invstds) + averaging_factor*mat(#invstds);
- #running_variances = (1-averaging_factor)*mat(#running_variances) + averaging_factor*(variance of contents of src);
!*/
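The running-statistics update in the last two bullets is an exponential moving average, and the variance term uses the unbiased estimate (the biased batch variance scaled by N/(N-1), as in the diff above). A compact sketch of just that update step, under assumed names:

// Sketch only: running-statistics update for one channel. batch_var is the biased
// variance used for normalization; the running value stores the unbiased estimate.
void update_running_stats(float& running_mean, float& running_variance,
                          float batch_mean, float batch_var,
                          long num_samples, double averaging_factor)
{
    const double scale = num_samples/(num_samples - 1.0);  // unbiased correction
    running_mean     = (1 - averaging_factor)*running_mean     + averaging_factor*batch_mean;
    running_variance = (1 - averaging_factor)*running_variance + averaging_factor*(scale*batch_var);
}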
void batch_normalize_gradient (
......@@ -391,7 +391,7 @@ namespace dlib { namespace tt
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_invstds
const tensor& running_variances
);
/*!
requires
......@@ -401,13 +401,13 @@ namespace dlib { namespace tt
- gamma.k() == src.k()
- have_same_dimensions(gamma, beta)
- have_same_dimensions(gamma, running_means)
- have_same_dimensions(gamma, running_invstds)
- have_same_dimensions(gamma, running_variances)
ensures
- Just linearly transforms src as a call to batch_normalize_conv() would if the resulting
means and invstds were running_means and running_invstds. That is, this function
performs:
dest = gamma*(src-running_means)*running_invstds + beta
Note that it does it in a pointwise fashion over the samples, rows, and
- Linearly transforms src as a call to batch_normalize_conv() would if src had
means and variances as given by running_means and running_variances. That
is, this function performs:
dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
Note that it does this in a pointwise fashion over the samples, rows, and
columns in src.
!*/
......@@ -417,7 +417,7 @@ namespace dlib { namespace tt
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_invstds,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
......@@ -431,7 +431,7 @@ namespace dlib { namespace tt
- 0 <= averaging_factor <= 1
- if (averaging_factor != 1)
- have_same_dimensions(running_means, means) == true
- have_same_dimensions(running_invstds, invstds) == true
- have_same_dimensions(running_variances, invstds) == true
ensures
- have_same_dimensions(#dest, src) == true
- #means.num_samples()==means.nr()==means.nc() == 1
......@@ -441,7 +441,7 @@ namespace dlib { namespace tt
- #means == the mean values of the contents of src.
- #invstds == 1/(the standard deviation values of the contents of src).
- #running_means = (1-averaging_factor)*mat(#running_means) + averaging_factor*mat(#means);
- #running_invstds = (1-averaging_factor)*mat(#running_invstds) + averaging_factor*mat(#invstds);
- #running_variances = (1-averaging_factor)*mat(#running_variances) + averaging_factor*(variance of contents of src);
!*/
void batch_normalize_conv_gradient (
......
......@@ -164,12 +164,12 @@ namespace
beta = 0;
resizable_tensor running_means;
resizable_tensor running_invstds;
batch_normalize(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
resizable_tensor running_variances;
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples())/(src.num_samples()-1.0);
// Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
running_invstds = mat(running_invstds)/scale;
batch_normalize_inference(dest2, src, gamma, beta, running_means, running_invstds);
running_variances = mat(running_variances)/scale;
batch_normalize_inference(dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST_MSG(max(abs(mat(dest2)-mat(dest))) < 1e-5, max(abs(mat(dest2)-mat(dest))));
......@@ -177,7 +177,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
......@@ -189,7 +189,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
......@@ -201,7 +201,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
......@@ -247,13 +247,13 @@ namespace
beta = 0;
resizable_tensor running_means;
resizable_tensor running_invstds;
batch_normalize_conv(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
resizable_tensor running_variances;
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
const double scale = (src.num_samples()*src.nr()*src.nc())/(src.num_samples()*src.nr()*src.nc()-1.0);
// Turn back into biased variance estimate because that's how
// batch_normalize_conv() works, so if we want to match it this is necessary.
running_invstds = mat(running_invstds)/scale;
batch_normalize_conv_inference(dest2, src, gamma, beta, running_means, running_invstds);
running_variances = mat(running_variances)/scale;
batch_normalize_conv_inference(dest2, src, gamma, beta, running_means, running_variances);
DLIB_TEST(max(abs(mat(dest2)-mat(dest))) < 1e-5);
......@@ -261,7 +261,7 @@ namespace
auto f = [&](float eps) {
const float old = src.host()[idx];
src.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
src.host()[idx] = old;
return result;
......@@ -273,7 +273,7 @@ namespace
auto f = [&](float eps) {
const float old = gamma.host()[idx];
gamma.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
gamma.host()[idx] = old;
return result;
......@@ -285,7 +285,7 @@ namespace
auto f = [&](float eps) {
const float old = beta.host()[idx];
beta.host()[idx] += eps;
batch_normalize_conv(dest, means, vars, 1, running_means, running_invstds, src, gamma, beta);
batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
float result = dot(gradient_input, dest);
beta.host()[idx] = old;
return result;
......@@ -775,7 +775,7 @@ namespace
resizable_tensor means, means2;
resizable_tensor invstds, invstds2;
resizable_tensor running_means, running_means2;
resizable_tensor running_invstds, running_invstds2;
resizable_tensor running_variances, running_variances2;
resizable_tensor src(64,20,100,100);
resizable_tensor gamma(1,20,100,100);
resizable_tensor beta(1,20,100,100);
......@@ -785,20 +785,20 @@ namespace
rnd.fill_uniform(src);
cpu::batch_normalize(dest, means, invstds, 1, running_means, running_invstds, src, gamma, beta);
cuda::batch_normalize(dest2,means2,invstds2, 1, running_means2, running_invstds2, src, gamma, beta);
cpu::batch_normalize(dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
cuda::batch_normalize(dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
dlog << LINFO << "running_means error: "<< max(abs(mat(running_means) -mat(running_means2)));
dlog << LINFO << "running_invstds error: "<< max(abs(mat(running_invstds) -mat(running_invstds2)));
dlog << LINFO << "running_variances error: "<< max(abs(mat(running_variances) -mat(running_variances2)));
DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-4);
DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-4);
DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_means) -mat(running_means2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_invstds) -mat(running_invstds2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_variances) -mat(running_variances2))) < 1e-4);
// now check that the gradients match as well
......@@ -830,7 +830,7 @@ namespace
resizable_tensor means, means2;
resizable_tensor invstds, invstds2;
resizable_tensor running_means, running_means2;
resizable_tensor running_invstds, running_invstds2;
resizable_tensor running_variances, running_variances2;
resizable_tensor src(2,8,10,9);
resizable_tensor gamma(1,8);
resizable_tensor beta(1,8);
......@@ -839,20 +839,20 @@ namespace
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cpu::batch_normalize_conv(dest,means,invstds,1,running_means,running_invstds, src, gamma, beta);
cuda::batch_normalize_conv(dest2,means2,invstds2,1,running_means2,running_invstds2, src, gamma, beta);
cpu::batch_normalize_conv(dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
cuda::batch_normalize_conv(dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
dlog << LINFO << "running_means error: "<< max(abs(mat(running_means) -mat(running_means2)));
dlog << LINFO << "running_invstds error: "<< max(abs(mat(running_invstds) -mat(running_invstds2)));
dlog << LINFO << "running_variances error: "<< max(abs(mat(running_variances) -mat(running_variances2)));
DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-4);
DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-4);
DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_means) -mat(running_means2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_invstds) -mat(running_invstds2))) < 1e-4);
DLIB_TEST(max(abs(mat(running_variances) -mat(running_variances2))) < 1e-4);
resizable_tensor gradient_input;
resizable_tensor src_grad, gamma_grad, beta_grad;
......