Commit 93ab80c7 authored by Davis King

Made the affine_ layer support being constructed from bn_ layers. Also added
unit tests for the routines supporting this feature.
parent 669a1e17
......@@ -418,27 +418,31 @@ namespace dlib
// ----------------------------------------------------------------------------------------
enum batch_normalization_mode
enum layer_mode
{
BATCH_NORM_CONV = 0,
BATCH_NORM_FC = 1
CONV_MODE = 0,
FC_MODE = 1
};
class bn_
{
public:
bn_() : num_updates(0), running_stats_window_size(1000), mode(BATCH_NORM_FC)
bn_() : num_updates(0), running_stats_window_size(1000), mode(FC_MODE)
{}
explicit bn_(batch_normalization_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
explicit bn_(layer_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
{}
batch_normalization_mode get_mode() const { return mode; }
bn_(layer_mode mode_, unsigned long window_size) : num_updates(0), running_stats_window_size(window_size), mode(mode_)
{}
layer_mode get_mode() const { return mode; }
unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
......@@ -473,14 +477,14 @@ namespace dlib
const double decay = 1.0 - num_updates/(num_updates+1.0);
if (num_updates <running_stats_window_size)
++num_updates;
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_invstds);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_invstds);
......@@ -493,7 +497,7 @@ namespace dlib
auto g = gamma(params,0);
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
else
tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
......@@ -534,18 +538,20 @@ namespace dlib
deserialize(item.running_stats_window_size, in);
int mode;
deserialize(mode, in);
item.mode = (batch_normalization_mode)mode;
item.mode = (layer_mode)mode;
}
private:
friend class affine_;
resizable_tensor params;
alias_tensor gamma, beta;
resizable_tensor means, running_means;
resizable_tensor invstds, running_invstds;
unsigned long num_updates;
unsigned long running_stats_window_size;
batch_normalization_mode mode;
layer_mode mode;
};
template <typename SUBNET>
......@@ -770,17 +776,53 @@ namespace dlib
{
public:
affine_(
)
) : mode(FC_MODE)
{
}
explicit affine_(
layer_mode mode_
) : mode(mode_)
{
}
affine_(
const bn_& item
)
{
gamma = item.gamma;
beta = item.beta;
mode = item.mode;
params.copy_size(item.params);
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
resizable_tensor temp(item.params);
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), mat(item.running_invstds));
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
layer_mode get_mode() const { return mode; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
gamma = alias_tensor(1,
sub.get_output().k(),
sub.get_output().nr(),
sub.get_output().nc());
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
sub.get_output().nr(),
sub.get_output().nc());
}
else
{
gamma = alias_tensor(1, sub.get_output().k());
}
beta = gamma;
params.set_size(gamma.size()+beta.size());
......@@ -793,7 +835,10 @@ namespace dlib
{
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
tt::affine_transform(output, input, g, b);
if (mode == FC_MODE)
tt::affine_transform(output, input, g, b);
else
tt::affine_transform_conv(output, input, g, b);
}
void backward_inplace(
......@@ -809,10 +854,18 @@ namespace dlib
auto b_grad = beta(params_grad,gamma.size());
// We are computing the gradient of dot(gradient_input, computed_output*g + b)
tt::multiply(data_grad, gradient_input, g);
tt::multiply(g_grad, gradient_input, computed_output);
tt::assign_bias_gradient(b_grad, gradient_input);
if (mode == FC_MODE)
{
tt::multiply(data_grad, gradient_input, g);
tt::multiply(g_grad, gradient_input, computed_output);
tt::assign_bias_gradient(b_grad, gradient_input);
}
else
{
tt::multiply_conv(data_grad, gradient_input, g);
tt::multiply_conv(g_grad, gradient_input, computed_output);
tt::assign_conv_bias_gradient(b_grad, gradient_input);
}
}
const tensor& get_layer_params() const { return params; }
......@@ -824,6 +877,7 @@ namespace dlib
serialize(item.params, out);
serialize(item.gamma, out);
serialize(item.beta, out);
serialize((int)item.mode, out);
}
friend void deserialize(affine_& item, std::istream& in)
......@@ -835,11 +889,15 @@ namespace dlib
deserialize(item.params, in);
deserialize(item.gamma, in);
deserialize(item.beta, in);
int mode;
deserialize(mode, in);
item.mode = (layer_mode)mode;
}
private:
resizable_tensor params;
alias_tensor gamma, beta;
layer_mode mode;
};
template <typename SUBNET>
......
......@@ -552,29 +552,89 @@ namespace dlib
// ----------------------------------------------------------------------------------------
class affine_
enum layer_mode
{
CONV_MODE = 0, // convolutional mode
FC_MODE = 1 // fully connected mode
};
class bn_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_LAYER_ interface defined above.
In particular, it applies a simple pointwise linear transformation to an
input tensor. You can think of it as having two parameter tensors, A and
B, that each have the same dimensionality as the input tensor (except their
num_samples() dimensions are 1). If the input tensor is called INPUT
then the output of this layer is simply:
A*INPUT+B
where all operations are performed element wise and each sample in the
INPUT tensor is processed separately.
In particular, it defines a batch normalization layer that implements the
method described in the paper:
Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
In particular, this layer produces output tensors with the same
dimensionality as the input tensors, except that the mean and variances of
the elements have been standardized to 0 and 1 respectively.
It should also be noted that when tensors with a num_samples() dimension of
1 are passed to this layer it doesn't perform batch normalization.
Instead, it runs in "inference mode" where the learned linear normalizing
transformation is used to transform the tensor.
Finally, after you finish training a batch normalized network, it is a good
idea to replace each bn_ layer with an affine_ layer because the affine_
layer is faster and will never surprise you by performing batch
normalization on tensors that have a num_samples() dimension > 1. This allows
you to run large mini-batches of samples through your final network without
batch normalization executing at all.
!*/
public:
bn_(
);
/*!
ensures
- #get_mode() == FC_MODE
- get_running_stats_window_size() == 1000
!*/
affine_(
explicit bn_(
layer_mode mode
);
/*!
ensures
- #get_mode() == mode
- get_running_stats_window_size() == 1000
!*/
layer_mode get_mode(
) const;
/*!
ensures
- returns the mode of this layer, either CONV_MODE or FC_MODE.
If the mode is FC_MODE then the normalization is applied across the
samples in a tensor (i.e. k()*nr()*nc() different things will be
normalized). Otherwise, normalization is applied across everything
except for the k() dimension, resulting in there being only k()
normalization equations that are applied spatially over the tensor.
Therefore, if you are putting batch normalization after a fully connected
layer you should use FC_MODE. Otherwise, if you are putting batch
normalization after a convolutional layer you should use CONV_MODE.
!*/
unsigned long get_running_stats_window_size (
) const;
/*!
ensures
- Just as recommended in the batch normalization paper, this object keeps a
running average of the mean and standard deviations of the features.
These averages are used during "inference mode" so you can run a single
object through a batch normalized network. They are also what is used to
initialize an affine_ layer that is constructed from a bn_ layer. This
function returns the effective number of recent samples used to compute
the running average.
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......@@ -582,73 +642,83 @@ namespace dlib
!*/
};
void serialize(const affine_& item, std::ostream& out);
void deserialize(affine_& item, std::istream& in);
void serialize(const bn_& item, std::ostream& out);
void deserialize(bn_& item, std::istream& in);
/*!
provides serialization support
!*/
template <typename SUBNET>
using affine = add_layer<affine_, SUBNET>;
using bn = add_layer<bn_, SUBNET>;
// ----------------------------------------------------------------------------------------
enum batch_normalization_mode
{
BATCH_NORM_CONV = 0,
BATCH_NORM_FC = 1
};
class bn_
class affine_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_LAYER_ interface defined above.
In particular, it defines a batch normalization layer that implements the
method described in the paper:
Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift by Sergey Ioffe and Christian Szegedy
In particular, this layer produces output tensors with the same
dimensionality as the input tensors, except that the mean and variances of
the elements have been standardized.
In particular, it applies a simple pointwise linear transformation to an
input tensor. You can think of it as having two parameter tensors, A and
B. If the input tensor is called INPUT then the output of this layer is:
A*INPUT+B
where all operations are performed element wise and each sample in the
INPUT tensor is processed separately.
Moreover, this object has two modes that effect the dimensionalities of A
and B and how they are applied to compute A*INPUT+B. If
get_mode()==FC_MODE then A and B each have the same dimensionality as the
input tensor, except their num_samples() dimensions are 1. If
get_mode()==CONV_MODE then A and B have all their dimensions set to 1
except for k(), which is equal to INPUT.k().
In either case, the computation of A*INPUT+B is performed pointwise over all
the elements of INPUT using either:
OUTPUT(n,k,r,c) == A(1,k,r,c)*INPUT(n,k,r,c)+B(1,k,r,c)
or
OUTPUT(n,k,r,c) == A(1,k,1,1)*INPUT(n,k,r,c)+B(1,k,1,1)
as appropriate.
!*/
public:
bn_(
affine_(
);
/*!
ensures
- #get_mode() == BATCH_NORM_FC
- #get_mode() == FC_MODE
!*/
explicit bn_(
batch_normalization_mode mode
affine_(
const bn_& layer
);
/*!
ensures
- Constructs affine_ so that it performs the same transformation as the
supplied batch normalization layer. You would want to do this after you
finish training a network with bn_ layers because the affine_ layer will
execute faster.
- #get_mode() == layer.get_mode()
!*/
explicit affine_(
layer_mode mode
);
/*!
ensures
- #get_mode() == mode
!*/
batch_normalization_mode get_mode(
layer_mode get_mode(
) const;
/*!
ensures
- returns the mode of this layer, either BATCH_NORM_CONV or BATCH_NORM_FC.
If the mode is BATCH_NORM_FC then the normalization is applied across the
samples in a tensor (i.e. k()*nr()*nc() different things will be
normalized). Otherwise, normalization is applied across everything
except for the k() dimension, resulting in there being only k()
normalization equations that are applied spatially over the tensor.
Therefore, if you are putting batch normalization after a fully connected
layer you should use BATCH_NORM_FC. Otherwise, if you are putting batch
normalization after a convolutional layer you should use BATCH_NORM_CONV.
- returns the mode of this layer, either CONV_MODE or FC_MODE.
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......@@ -656,14 +726,14 @@ namespace dlib
!*/
};
void serialize(const bn_& item, std::ostream& out);
void deserialize(bn_& item, std::istream& in);
void serialize(const affine_& item, std::ostream& out);
void deserialize(affine_& item, std::istream& in);
/*!
provides serialization support
!*/
template <typename SUBNET>
using bn = add_layer<bn_, SUBNET>;
using affine = add_layer<affine_, SUBNET>;
// ----------------------------------------------------------------------------------------
......
......@@ -763,7 +763,101 @@ namespace
DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
}
#endif
// Cross-checks the CPU and CUDA implementations of the conv-mode tensor
// routines (multiply_conv, affine_transform_conv, assign_conv_bias_gradient)
// against each other on randomly sized tensors.  These are the primitives the
// affine_ layer uses when running in CONV_MODE, so agreement between the two
// backends is what this test pins down.
void test_more_ops2()
{
dlib::rand rnd;
tt::tensor_rand trand;
// Mode 1 of multiply_conv: src2 has shape (1,k,1,1) and is broadcast over
// src1; dest has the same shape as src1.  Then the reverse "reduction"
// mode where dest has shape (1,k,1,1) instead.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src1, src2;
// random tensor dimensions, each in [1,30]
src1.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src1);
dest2.copy_size(src1);
src2.set_size(1,src1.k(),1,1);
// dest tensors are also filled with random values so the test would
// notice if an implementation failed to overwrite its output.
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
// now try it using the other mode of multiply_conv
src2.copy_size(src1);
dest1.set_size(1,src1.k(),1,1);
dest2.set_size(1,src1.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
// The reduction mode sums many float products, so compare with relative
// (scaled) tolerances rather than the absolute 1e-5 used above.
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
// affine_transform_conv: per-channel A and B, shape (1,k,1,1), applied
// pointwise over src.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src, A, B;
src.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src);
dest2.copy_size(src);
A.set_size(1,src.k(),1,1);
B.set_size(1,src.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src);
trand.fill_uniform(A);
trand.fill_uniform(B);
cpu::affine_transform_conv(dest1, src, A, B);
cuda::affine_transform_conv(dest2, src, A, B);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
}
// assign_conv_bias_gradient: reduces a full gradient tensor g down to a
// per-channel (1,k,1,1) bias gradient.
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, g;
g.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.set_size(1,g.k(),1,1);
dest2.set_size(1,g.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(g);
cpu::assign_conv_bias_gradient(dest1, g);
cuda::assign_conv_bias_gradient(dest2, g);
// again a summing reduction, so use relative tolerances
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
}
#endif // DLIB_USE_CUDA
// ----------------------------------------------------------------------------------------
......@@ -883,12 +977,22 @@ namespace
}
{
print_spinner();
affine_ l;
affine_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
affine_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l;
bn_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
......@@ -953,7 +1057,7 @@ namespace
template <typename T> using rcon = max_pool<relu<bn<con<T>>>>;
std::tuple<max_pool_,relu_,bn_,con_> rcon_ (unsigned long n)
{
return std::make_tuple(max_pool_(2,2,2,2),relu_(),bn_(BATCH_NORM_CONV),con_(n,5,5));
return std::make_tuple(max_pool_(2,2,2,2),relu_(),bn_(CONV_MODE),con_(n,5,5));
}
template <typename T> using rfc = relu<bn<fc<T>>>;
......@@ -996,6 +1100,7 @@ namespace
{
test_tagging();
#ifdef DLIB_USE_CUDA
test_more_ops2();
test_more_ops(1,1);
test_more_ops(3,4);
test_more_ops(4,3);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment