Commit 93ab80c7 authored by Davis King's avatar Davis King

Made the affine_ layer support being constructed from bn_ layers. Also added
unit tests for the routines supporting this feature.
parent 669a1e17
......@@ -418,27 +418,31 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// layer_mode: selects how bn_/affine_ layers shape their gamma/beta
// parameters.  In CONV_MODE there is one gamma/beta pair per channel
// (alias_tensor of size (1,k)); in FC_MODE there is one pair per element
// of the input tensor (alias_tensor of size (1,k,nr,nc)).
enum layer_mode
{
    CONV_MODE = 0,
    FC_MODE = 1
};
class bn_
{
public:
bn_() : num_updates(0), running_stats_window_size(1000), mode(BATCH_NORM_FC)
bn_() : num_updates(0), running_stats_window_size(1000), mode(FC_MODE)
{}
explicit bn_(batch_normalization_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
explicit bn_(layer_mode mode_) : num_updates(0), running_stats_window_size(1000), mode(mode_)
{}
batch_normalization_mode get_mode() const { return mode; }
bn_(layer_mode mode_, unsigned long window_size) : num_updates(0), running_stats_window_size(window_size), mode(mode_)
{}
layer_mode get_mode() const { return mode; }
unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
......@@ -473,14 +477,14 @@ namespace dlib
const double decay = 1.0 - num_updates/(num_updates+1.0);
if (num_updates <running_stats_window_size)
++num_updates;
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
else
tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_invstds, sub.get_output(), g, b);
}
else // we are running in testing mode so we just linearly scale the input tensor.
{
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_invstds);
else
tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_invstds);
......@@ -493,7 +497,7 @@ namespace dlib
auto g = gamma(params,0);
auto g_grad = gamma(params_grad, 0);
auto b_grad = beta(params_grad, gamma.size());
if (mode == BATCH_NORM_FC)
if (mode == FC_MODE)
tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
else
tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
......@@ -534,18 +538,20 @@ namespace dlib
deserialize(item.running_stats_window_size, in);
int mode;
deserialize(mode, in);
item.mode = (batch_normalization_mode)mode;
item.mode = (layer_mode)mode;
}
private:
friend class affine_;
resizable_tensor params;
alias_tensor gamma, beta;
resizable_tensor means, running_means;
resizable_tensor invstds, running_invstds;
unsigned long num_updates;
unsigned long running_stats_window_size;
batch_normalization_mode mode;
layer_mode mode;
};
template <typename SUBNET>
......@@ -770,17 +776,53 @@ namespace dlib
{
public:
// Default-construct an affine_ layer; it operates in FC_MODE until
// constructed otherwise.
affine_() : mode(FC_MODE)
{
}
// Construct an affine_ layer that applies its transform in the given
// mode (CONV_MODE or FC_MODE).
explicit affine_(layer_mode mode_) : mode(mode_)
{
}
// Construct an affine_ layer from a bn_ layer by folding the batch-norm
// running statistics into a plain affine transform:
//   new_gamma = gamma * running_invstds
//   new_beta  = beta  - new_gamma * running_means
// so this layer is intended to reproduce the bn_ layer's inference-time
// scaling (bn_'s inference path also uses running_means/running_invstds).
affine_(
const bn_& item
)
{
// Adopt the bn_ layer's parameter layout and mode.
gamma = item.gamma;
beta = item.beta;
mode = item.mode;
params.copy_size(item.params);
// Views into our own parameter blob where the folded values are written.
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
// temp is a copy of the bn_ layer's learned gamma/beta (read via sg/sb)
// so we can read them while writing the folded values into params.
resizable_tensor temp(item.params);
auto sg = gamma(temp,0);
auto sb = beta(temp,gamma.size());
g = pointwise_multiply(mat(sg), mat(item.running_invstds));
// Deliberately uses the freshly computed g (gamma*invstds), not sg.
b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
}
// Returns the mode (CONV_MODE or FC_MODE) this layer operates in.
layer_mode get_mode() const { return mode; }
template <typename SUBNET>
void setup (const SUBNET& sub)
{
if (mode == FC_MODE)
{
gamma = alias_tensor(1,
sub.get_output().k(),
sub.get_output().nr(),
sub.get_output().nc());
}
else
{
gamma = alias_tensor(1, sub.get_output().k());
}
beta = gamma;
params.set_size(gamma.size()+beta.size());
......@@ -793,7 +835,10 @@ namespace dlib
{
auto g = gamma(params,0);
auto b = beta(params,gamma.size());
if (mode == FC_MODE)
tt::affine_transform(output, input, g, b);
else
tt::affine_transform_conv(output, input, g, b);
}
void backward_inplace(
......@@ -809,11 +854,19 @@ namespace dlib
auto b_grad = beta(params_grad,gamma.size());
// We are computing the gradient of dot(gradient_input, computed_output*g + b)
if (mode == FC_MODE)
{
tt::multiply(data_grad, gradient_input, g);
tt::multiply(g_grad, gradient_input, computed_output);
tt::assign_bias_gradient(b_grad, gradient_input);
}
else
{
tt::multiply_conv(data_grad, gradient_input, g);
tt::multiply_conv(g_grad, gradient_input, computed_output);
tt::assign_conv_bias_gradient(b_grad, gradient_input);
}
}
const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }
......@@ -824,6 +877,7 @@ namespace dlib
serialize(item.params, out);
serialize(item.gamma, out);
serialize(item.beta, out);
serialize((int)item.mode, out);
}
friend void deserialize(affine_& item, std::istream& in)
......@@ -835,11 +889,15 @@ namespace dlib
deserialize(item.params, in);
deserialize(item.gamma, in);
deserialize(item.beta, in);
int mode;
deserialize(mode, in);
item.mode = (layer_mode)mode;
}
private:
resizable_tensor params;
alias_tensor gamma, beta;
layer_mode mode;
};
template <typename SUBNET>
......
This diff is collapsed.
......@@ -763,7 +763,101 @@ namespace
DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
}
#endif
void test_more_ops2()
{
dlib::rand rnd;
tt::tensor_rand trand;
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src1, src2;
src1.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src1);
dest2.copy_size(src1);
src2.set_size(1,src1.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
// now try it using the other mode of multiply_conv
src2.copy_size(src1);
dest1.set_size(1,src1.k(),1,1);
dest2.set_size(1,src1.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src1);
trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2);
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, src, A, B;
src.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.copy_size(src);
dest2.copy_size(src);
A.set_size(1,src.k(),1,1);
B.set_size(1,src.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(src);
trand.fill_uniform(A);
trand.fill_uniform(B);
cpu::affine_transform_conv(dest1, src, A, B);
cuda::affine_transform_conv(dest2, src, A, B);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
}
for (int iter = 0; iter < 100; ++iter)
{
print_spinner();
resizable_tensor dest1, dest2, g;
g.set_size(rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1,
rnd.get_random_32bit_number()%30+1);
dest1.set_size(1,g.k(),1,1);
dest2.set_size(1,g.k(),1,1);
trand.fill_uniform(dest1);
trand.fill_uniform(dest2);
trand.fill_uniform(g);
cpu::assign_conv_bias_gradient(dest1, g);
cuda::assign_conv_bias_gradient(dest2, g);
const float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
}
}
#endif // DLIB_USE_CUDA
// ----------------------------------------------------------------------------------------
......@@ -883,12 +977,22 @@ namespace
}
{
print_spinner();
affine_ l;
affine_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
affine_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l(CONV_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
print_spinner();
bn_ l;
bn_ l(FC_MODE);
DLIB_TEST_MSG(test_layer(l), test_layer(l));
}
{
......@@ -953,7 +1057,7 @@ namespace
template <typename T> using rcon = max_pool<relu<bn<con<T>>>>;
// Builds the parameter tuple for the rcon layer stack: a 2x2 max pool
// with stride 2, relu, batch normalization in convolutional mode, and an
// n-filter 5x5 convolution.
std::tuple<max_pool_,relu_,bn_,con_> rcon_ (unsigned long n)
{
    return std::make_tuple(max_pool_(2,2,2,2), relu_(), bn_(CONV_MODE), con_(n,5,5));
}
template <typename T> using rfc = relu<bn<fc<T>>>;
......@@ -996,6 +1100,7 @@ namespace
{
test_tagging();
#ifdef DLIB_USE_CUDA
test_more_ops2();
test_more_ops(1,1);
test_more_ops(3,4);
test_more_ops(4,3);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment