Commit 4bc6c1e5 authored by Juha Reunanen, committed by Davis E. King

Add new loss layer for semantic segmentation (pixel-wise classification) (#540)

* #288 - add new layer loss_multiclass_log_matrixoutput for semantic-segmentation purposes

* In semantic segmentation, add capability to ignore individual pixels when computing gradients

* In semantic segmentation, 65535 classes ought to be enough for anybody

* Divide matrix output loss by the matrix dimensions too, in order to make losses for differently sized matrices more comparable
- note that this affects the required learning rate as well (see the scaling sketch after this list)!

* Review fix: avoid matrix copy

* Review fix: rename to loss_multiclass_log_per_pixel

* Review fix: just use uint16_t as the label type

* Add more tests: check that network params and outputs are correct

* Improve error message when output and truth matrix dimensions do not match

* Add test case verifying that a single call of loss_multiclass_log_per_pixel equals multiple corresponding calls of loss_multiclass_log

* Fix test failure by training longer

* Remove the test case that fails on Travis for some reason, even though it works on AppVeyor and locally
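
Because the per-pixel loss is averaged over the sample count and over the output rows and columns, the per-element gradient shrinks as the output matrices grow, which is why the bullet above warns that the required learning rate changes too. A minimal sketch of the arithmetic, with made-up dimensions purely for illustration:

#include <cstdio>

int main()
{
    // Hypothetical mini-batch and output sizes, chosen only to illustrate the scaling.
    const long num_samples = 32;
    const long nr_small = 28,  nc_small = 28;    // small label map
    const long nr_large = 512, nc_large = 512;   // full-resolution label map

    // Mirrors the "scale" factor used in compute_loss_value_and_gradient():
    // every element's loss and gradient are multiplied by 1/(num_samples*nr*nc).
    const double scale_small = 1.0 / (num_samples * nr_small * nc_small);
    const double scale_large = 1.0 / (num_samples * nr_large * nc_large);

    std::printf("per-element scale for a 28x28 output:   %g\n", scale_small);
    std::printf("per-element scale for a 512x512 output: %g\n", scale_large);
    // The 512x512 case scales each element's gradient roughly 334x smaller,
    // so a learning rate tuned on small outputs may need to be raised.
    return 0;
}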
parent 37a77ad8
@@ -1529,6 +1529,174 @@ namespace dlib
template <typename SUBNET>
using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
// ----------------------------------------------------------------------------------------
// In semantic segmentation, if you don't know the ground-truth of some pixel,
// set the label of that pixel to this value. When you do so, the pixel will be
// ignored when computing gradients.
static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
class loss_multiclass_log_per_pixel_
{
public:
// In semantic segmentation, 65535 classes ought to be enough for anybody.
typedef matrix<uint16_t> training_label_type;
typedef matrix<uint16_t> output_label_type;
template <
typename SUB_TYPE,
typename label_iterator
>
void to_label (
const tensor& input_tensor,
const SUB_TYPE& sub,
label_iterator iter
) const
{
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
const tensor& output_tensor = sub.get_output();
DLIB_CASSERT(output_tensor.k() >= 1); // Note that output_tensor.k() should match the number of labels.
DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
const float* const out_data = output_tensor.host();
// The index of the largest output for each element is the label.
const auto find_label = [&](long sample, long r, long c) {
uint16_t label = 0;
float max_value = out_data[tensor_index(output_tensor, sample, r, c, 0)];
for (long k = 1; k < output_tensor.k(); ++k) {
const float value = out_data[tensor_index(output_tensor, sample, r, c, k)];
if (value > max_value) {
label = static_cast<uint16_t>(k);
max_value = value;
}
}
return label;
};
for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter) {
iter->set_size(output_tensor.nr(), output_tensor.nc());
for (long r = 0; r < output_tensor.nr(); ++r) {
for (long c = 0; c < output_tensor.nc(); ++c) {
// The index of the largest output for this element is the label.
iter->operator()(r, c) = find_label(i, r, c);
}
}
}
}
template <
typename const_label_iterator,
typename SUBNET
>
double compute_loss_value_and_gradient (
const tensor& input_tensor,
const_label_iterator truth,
SUBNET& sub
) const
{
const tensor& output_tensor = sub.get_output();
tensor& grad = sub.get_gradient_input();
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
DLIB_CASSERT(input_tensor.num_samples() != 0);
DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
DLIB_CASSERT(output_tensor.k() >= 1);
DLIB_CASSERT(output_tensor.k() < std::numeric_limits<uint16_t>::max());
DLIB_CASSERT(output_tensor.nr() == grad.nr() &&
output_tensor.nc() == grad.nc() &&
output_tensor.k() == grad.k());
for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
{
const_label_iterator truth_matrix_ptr = (truth + idx);
DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() &&
truth_matrix_ptr->nc() == output_tensor.nc(),
"truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", "
"output size = " << output_tensor.nr() << " x " << output_tensor.nc());
}
tt::softmax(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
double loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const uint16_t y = truth->operator()(r, c);
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, r, c, k);
if (k == y)
{
loss += scale*-std::log(g[idx]);
g[idx] = scale*(g[idx] - 1);
}
else if (y == label_to_ignore)
{
g[idx] = 0.f;
}
else
{
g[idx] = scale*g[idx];
}
}
}
}
}
return loss;
}
friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
{
serialize("loss_multiclass_log_per_pixel_", out);
}
friend void deserialize(loss_multiclass_log_per_pixel_& , std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "loss_multiclass_log_per_pixel_")
throw serialization_error("Unexpected version found while deserializing dlib::loss_multiclass_log_per_pixel_.");
}
friend std::ostream& operator<<(std::ostream& out, const loss_multiclass_log_per_pixel_& )
{
out << "loss_multiclass_log_per_pixel";
return out;
}
friend void to_xml(const loss_multiclass_log_per_pixel_& /*item*/, std::ostream& out)
{
out << "<loss_multiclass_log_per_pixel/>";
}
private:
static size_t tensor_index(const tensor& t, long sample, long row, long column, long k)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
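// For example, in a tensor with k() == 3, nr() == 4 and nc() == 5, the element
// (sample=1, row=2, column=3, k=1) is at index ((1*3 + 1)*4 + 2)*5 + 3 == 93.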
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
};
template <typename SUBNET>
using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>;
// ----------------------------------------------------------------------------------------
}
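
For context, here is a minimal sketch of how the new layer might be wired up and trained; the architecture, sizes and trainer settings are illustrative only and are not part of this commit. The label_to_ignore constant is the one declared above, which marks pixels to exclude from the gradient:

#include <dlib/dnn.h>
#include <cstdint>
#include <vector>

using namespace dlib;

int main()
{
    // A toy network: a 1x1 convolution maps each input pixel to num_classes
    // scores, and the new loss turns those scores into per-pixel labels.
    constexpr int num_classes = 6;
    using net_type = loss_multiclass_log_per_pixel<
                         con<num_classes,1,1,1,1,
                         input<matrix<float>>>>;

    std::vector<matrix<float>>    x(1);
    std::vector<matrix<uint16_t>> y(1);
    x[0] = ones_matrix<float>(4, 4);           // dummy 4x4 "image"
    y[0] = uniform_matrix<uint16_t>(4, 4, 2);  // every pixel is class 2...
    y[0](0, 0) = label_to_ignore;              // ...except one unknown pixel,
                                               // which is skipped by the gradient

    net_type net;
    dnn_trainer<net_type> trainer(net, sgd(0, 0.9));
    trainer.set_learning_rate(0.1);
    trainer.set_max_num_epochs(10);
    trainer.train(x, y);

    // Inference returns one uint16_t label matrix per input image.
    const std::vector<matrix<uint16_t>> labels = net(x);
    return labels.size() == 1 ? 0 : 1;
}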
@@ -798,6 +798,66 @@ namespace dlib
template <typename SUBNET>
using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
// ----------------------------------------------------------------------------------------
class loss_multiclass_log_per_pixel_
{
/*!
WHAT THIS OBJECT REPRESENTS
This object implements the loss layer interface defined above by
EXAMPLE_LOSS_LAYER_. In particular, it implements the multiclass logistic
regression loss (e.g. negative log-likelihood loss), which is appropriate
for multiclass classification problems. It is basically just like
loss_multiclass_log_ except that it works on matrix outputs, producing one
label per element instead of a single scalar label. This is useful, for
example, in semantic segmentation, where we want to classify each pixel of
an image.
!*/
public:
// In semantic segmentation, 65535 classes ought to be enough for anybody.
typedef matrix<uint16_t> training_label_type;
typedef matrix<uint16_t> output_label_type;
template <
typename SUB_TYPE,
typename label_iterator
>
void to_label (
const tensor& input_tensor,
const SUB_TYPE& sub,
label_iterator iter
) const;
/*!
This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
it has the additional calling requirements that:
- sub.get_output().num_samples() == input_tensor.num_samples()
- sub.sample_expansion_factor() == 1
and the output label is the predicted class for each classified element. The number
of possible output classes is sub.get_output().k().
!*/
template <
typename const_label_iterator,
typename SUBNET
>
double compute_loss_value_and_gradient (
const tensor& input_tensor,
const_label_iterator truth,
SUBNET& sub
) const;
/*!
This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
except it has the additional calling requirements that:
- sub.get_output().num_samples() == input_tensor.num_samples()
- sub.sample_expansion_factor() == 1
- all values pointed to by truth are < sub.get_output().k() (or std::numeric_limits<uint16_t>::max() to ignore)
!*/
};
template <typename SUBNET>
using loss_multiclass_log_per_pixel = add_loss_layer<loss_multiclass_log_per_pixel_, SUBNET>;
// ----------------------------------------------------------------------------------------
}
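
The calling requirements above mostly concern how the truth matrices are filled in: every label must be below sub.get_output().k(), and unknown pixels get std::numeric_limits<uint16_t>::max(). A small helper sketch along those lines; the int-coded class-index image and the negative-means-unknown convention are assumptions made for illustration:

#include <dlib/matrix.h>
#include <cstdint>
#include <limits>

// Convert a class-index image into a truth matrix usable as the per-pixel
// loss's training_label_type. Pixels coded as negative (unknown) become the
// "ignore" value so they are excluded from the loss and the gradient.
dlib::matrix<uint16_t> make_truth(const dlib::matrix<int>& class_index_image)
{
    dlib::matrix<uint16_t> truth(class_index_image.nr(), class_index_image.nc());
    for (long r = 0; r < truth.nr(); ++r)
    {
        for (long c = 0; c < truth.nc(); ++c)
        {
            const int idx = class_index_image(r, c);
            truth(r, c) = (idx < 0) ? std::numeric_limits<uint16_t>::max()
                                    : static_cast<uint16_t>(idx);
        }
    }
    return truth;
}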
@@ -8,6 +8,7 @@
#include <ctime>
#include <vector>
#include <random>
#include <numeric>
#include "../dnn.h"
#include "tester.h"
@@ -1968,6 +1969,340 @@ namespace
}
// ----------------------------------------------------------------------------------------
void test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task()
{
print_spinner();
constexpr uint16_t num_classes = 7;
constexpr uint16_t true_label = num_classes / 2;
::std::vector<matrix<float>> x({ matrix<float,1,1>({ 1 }) });
::std::vector<matrix<uint16_t>> y({ matrix<uint16_t,1,1>({ true_label }) });
using net_type = loss_multiclass_log_per_pixel<con<num_classes,1,1,1,1,input<matrix<float>>>>;
net_type net;
dnn_trainer<net_type> trainer(net, sgd(0,0));
trainer.set_learning_rate(1e7);
trainer.set_max_num_epochs(1);
trainer.train(x, y);
const tensor& learned_params = layer<1>(net).layer_details().get_layer_params();
const float* learned_params_data = learned_params.host();
for (int is_bias = 0; is_bias <= 1; ++is_bias) {
for (uint16_t k = 0; k < num_classes; ++k) {
size_t index = k + is_bias * num_classes;
DLIB_CASSERT(index < learned_params.size());
if (k == true_label) {
DLIB_TEST(learned_params_data[index] > 1e5);
}
else {
DLIB_TEST(learned_params_data[index] < -1e5);
}
}
}
}
// ----------------------------------------------------------------------------------------
void test_loss_multiclass_per_pixel_activations_on_trivial_single_pixel_task()
{
print_spinner();
constexpr int input_height = 35;
constexpr int input_width = 27;
constexpr int output_height = input_height;
constexpr int output_width = input_width;
constexpr int num_samples = 7;
constexpr int num_classes = 5;
::std::vector<matrix<float>> x(num_samples);
::std::vector<matrix<uint16_t>> y(num_samples);
matrix<float> xtmp(input_height, input_width);
matrix<uint16_t> ytmp(output_height, output_width);
::std::default_random_engine generator(16);
::std::bernoulli_distribution coinflip(0.5);
using filter_type = con<num_classes,1,1,1,1,input<matrix<float>>>;
// Define a "truth" filter
filter_type truth_filter;
truth_filter(xtmp); // Set up the convolutional layer
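// (the first call runs the untrained filter once, which initializes its random
// weights; the targets generated below therefore come from one fixed filter)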
// Generate training data
for (int ii = 0; ii < num_samples; ++ii) {
// Generate random inputs x
for (int jj = 0; jj < input_height; ++jj)
for (int kk = 0; kk < input_width; ++kk)
xtmp(jj, kk) = coinflip(generator) ? 1.f : -1.f;
x[ii] = xtmp;
// Generate target output y by applying the truth filter on x
const tensor& output = truth_filter(xtmp);
const float* const out_data = output.host();
const auto out_element = [&](int row, int column, int k) {
return out_data[(k * output.nr() + row) * output.nc() + column];
};
for (int jj = 0; jj < output_height; ++jj) {
for (int kk = 0; kk < output_width; ++kk) {
uint16_t label = 0;
float max_value = out_element(jj, kk, 0);
for (long k = 1; k < num_classes; ++k) {
const float value = out_element(jj, kk, k);
if (value > max_value) {
label = static_cast<uint16_t>(k);
max_value = value;
}
}
ytmp(jj, kk) = label;
}
}
y[ii] = ytmp;
}
using net_type = loss_multiclass_log_per_pixel<filter_type>;
net_type net;
dnn_trainer<net_type> trainer(net, sgd(0,0));
trainer.set_learning_rate(1e6);
trainer.set_max_num_epochs(1);
trainer.train(x, y);
// Feed forward the training samples.
resizable_tensor temp_tensor;
net.subnet().to_tensor(&x[0], &x[0] + num_samples, temp_tensor);
net.subnet().forward(temp_tensor);
const dimpl::subnet_wrapper<filter_type> wsub(net.subnet());
const tensor& output_tensor = wsub.get_output();
const float* const out_data = output_tensor.host();
// Let's have a look at the activations before softmax. They should be pretty high
// (in terms of absolute value), because the learning task is trivial.
for (int ii = 0; ii < num_samples; ++ii) {
for (int jj = 0; jj < output_height; ++jj) {
for (int kk = 0; kk < output_width; ++kk) {
const uint16_t true_label = y[ii](jj, kk);
for (long k = 0; k < num_classes; ++k) {
const size_t index = ((ii * output_tensor.k() + k) * output_tensor.nr() + jj) * output_tensor.nc() + kk;
DLIB_CASSERT(index < output_tensor.size());
if (k == true_label) {
DLIB_TEST_MSG(out_data[index] > 1e4, "");
}
else {
DLIB_TEST_MSG(out_data[index] < -1e4, "");
}
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void test_loss_multiclass_per_pixel_outputs_on_trivial_task()
{
print_spinner();
constexpr int input_height = 7;
constexpr int input_width = 5;
constexpr int output_height = input_height;
constexpr int output_width = input_width;
constexpr int num_samples = 7;
constexpr int num_classes = 5;
constexpr int filter_height = 3;
constexpr int filter_width = 3;
::std::vector<matrix<float>> x(num_samples);
::std::vector<matrix<uint16_t>> y(num_samples);
matrix<float> xtmp(input_height, input_width);
matrix<uint16_t> ytmp(output_height, output_width);
::std::default_random_engine generator(16);
::std::bernoulli_distribution coinflip(0.5);
using filter_type = con<num_classes, filter_height, filter_width, 1, 1, input<matrix<float>>>;
// Define a "truth" filter
filter_type truth_filter;
truth_filter(xtmp); // Set up the convolutional layer
// Generate training data
for (int ii = 0; ii < num_samples; ++ii) {
// Generate random inputs x
for (int jj = 0; jj < input_height; ++jj)
for (int kk = 0; kk < input_width; ++kk)
xtmp(jj, kk) = coinflip(generator) ? 1.f : -1.f;
x[ii] = xtmp;
// Generate target output y by applying the truth filter on x
const tensor& output = truth_filter(xtmp);
const float* const out_data = output.host();
const auto out_element = [&](int row, int column, int k) {
return out_data[(k * output.nr() + row) * output.nc() + column];
};
for (int jj = 0; jj < output_height; ++jj) {
for (int kk = 0; kk < output_width; ++kk) {
uint16_t label = 0;
float max_value = out_element(jj, kk, 0);
for (long k = 1; k < num_classes; ++k) {
const float value = out_element(jj, kk, k);
if (value > max_value) {
label = static_cast<uint16_t>(k);
max_value = value;
}
}
ytmp(jj, kk) = label;
}
}
y[ii] = ytmp;
}
using net_type = loss_multiclass_log_per_pixel<filter_type>;
net_type net;
dnn_trainer<net_type> trainer(net, sgd(0, 0.9));
trainer.set_learning_rate(1);
trainer.set_max_num_epochs(2000);
trainer.train(x, y);
// The learning task is separable, so the net should have no problem
// getting all the outputs right.
DLIB_TEST(net(x) == y);
}
// ----------------------------------------------------------------------------------------
void test_loss_multiclass_per_pixel_with_noise_and_pixels_to_ignore()
{
// "Semantic segmentation" - see https://github.com/davisking/dlib/issues/288
// Test learning when some pixels are to be ignored, etc.
print_spinner();
constexpr int input_height = 5;
constexpr int input_width = 7;
constexpr int output_height = input_height;
constexpr int output_width = input_width;
const int num_samples = 1000;
const int num_classes = 6;
const double ignore_probability = 0.5;
const double noise_probability = 0.05;
::std::default_random_engine generator(16);
::std::bernoulli_distribution ignore(ignore_probability);
::std::bernoulli_distribution noise_occurrence(noise_probability);
::std::uniform_int_distribution<uint16_t> noisy_label(0, num_classes - 1);
::std::vector<matrix<double>> x(num_samples);
::std::vector<matrix<uint16_t>> y(num_samples);
::std::vector<int> truth_histogram(num_classes);
matrix<double> xtmp(input_height, input_width);
matrix<uint16_t> ytmp(output_height, output_width);
// The function to be learned.
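// Each output pixel's class is the integer part of the sum of the (up to three)
// horizontally adjacent inputs; since every input lies in [0, 2), that sum is
// always below 6 == num_classes.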
const auto ground_truth = [num_classes](const matrix<double>& x, int row, int column) {
double sum = 0.0;
const int first_column = std::max(0, column - 1);
const int last_column = std::min(static_cast<int>(x.nc() - 1), column + 1);
for (int c = first_column; c <= last_column; ++c) {
sum += x(row, c);
}
DLIB_TEST(sum < num_classes);
return static_cast<uint16_t>(sum);
};
for ( int ii = 0; ii < num_samples; ++ii ) {
for ( int jj = 0; jj < input_height; ++jj ) {
for ( int kk = 0; kk < input_width; ++kk ) {
// Generate numbers between 0 and 2.
double value = static_cast<double>(ii + jj + kk) / 10.0;
value -= (static_cast<int>(value) / 2) * 2;
DLIB_TEST(value >= 0.0 && value < 2.0);
xtmp(jj, kk) = value;
}
}
x[ii] = xtmp;
for ( int jj = 0; jj < output_height; ++jj ) {
for ( int kk = 0; kk < output_width; ++kk ) {
uint16_t truth = ground_truth(x[ii], jj, kk);
DLIB_TEST(truth < num_classes);
++truth_histogram[truth];
if (ignore(generator)) {
ytmp(jj, kk) = label_to_ignore;
}
else if (noise_occurrence(generator)) {
ytmp(jj, kk) = noisy_label(generator);
}
else {
ytmp(jj, kk) = truth;
}
}
}
y[ii] = ytmp;
}
const int num_total_elements = num_samples * output_height * output_width;
{ // Require a reasonably balanced truth histogram in order to make sure that a trivial classifier is not enough
const int required_min_histogram_value = static_cast<int>(::std::ceil(num_total_elements / num_classes * 0.375));
for (auto histogram_value : truth_histogram) {
DLIB_TEST_MSG(histogram_value >= required_min_histogram_value,
"Histogram value = " << histogram_value << ", required = " << required_min_histogram_value);
}
}
using net_type = loss_multiclass_log_per_pixel<bn_con<con<num_classes,1,input_width,1,1,input<matrix<double>>>>>;
net_type net;
sgd defsolver(0,0.9);
dnn_trainer<net_type> trainer(net, defsolver);
trainer.set_learning_rate(0.1);
trainer.set_min_learning_rate(0.01);
trainer.set_mini_batch_size(50);
trainer.set_max_num_epochs(170);
trainer.train(x, y);
const ::std::vector<matrix<uint16_t>> predictions = net(x);
int num_correct = 0;
for ( int ii = 0; ii < num_samples; ++ii ) {
const matrix<uint16_t>& prediction = predictions[ii];
DLIB_TEST(prediction.nr() == output_height);
DLIB_TEST(prediction.nc() == output_width);
for ( int jj = 0; jj < output_height; ++jj )
for ( int kk = 0; kk < output_width; ++kk )
if ( prediction(jj, kk) == ground_truth(x[ii], jj, kk) )
++num_correct;
}
// First some sanity checks.
const int num_correct_max = num_total_elements;
DLIB_TEST(num_correct_max == ::std::accumulate(truth_histogram.begin(), truth_histogram.end(), 0));
DLIB_TEST_MSG(num_correct <= num_correct_max,
"Number of correctly classified elements = " << num_correct << ", max = " << num_correct_max);
// This is the real test, verifying that we have actually learned something.
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
}
// ----------------------------------------------------------------------------------------
class dnn_tester : public tester
@@ -2038,6 +2373,10 @@ namespace
test_concat();
test_simple_linear_regression();
test_multioutput_linear_regression();
test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task();
test_loss_multiclass_per_pixel_activations_on_trivial_single_pixel_task();
test_loss_multiclass_per_pixel_outputs_on_trivial_task();
test_loss_multiclass_per_pixel_with_noise_and_pixels_to_ignore();
}
void perform_test()