Add semantic segmentation example (#943)

* Add example of semantic segmentation using the PASCAL VOC2012 dataset

* Add note about Debug Information Format when using MSVC

* Make the upsampling layers residual as well

* Fix declaration order

* Use a wider net

* trainer.set_iterations_without_progress_threshold(5000); // (was 20000)

* Add residual_up

* Process entire directories of images (just easier to use)

* Simplify network structure so that builds finish even on Visual Studio (faster, or at all)

* Remove the training example from CMakeLists, because it's too much for the 32-bit MSVC++ compiler to handle

* Remove the probably-now-unnecessary set_dnn_prefer_smallest_algorithms call

* Review fix: remove the batch normalization layer from right before the loss

* Review fix: point out that only the Visual C++ compiler has problems.
Also expand the instructions how to run MSBuild.exe to circumvent the problems.

* Review fix: use dlib::match_endings

* Review fix: use dlib::join_rows. Also add some comments, and instructions where to download the pre-trained net from.

* Review fix: make formatting comply with dlib style conventions.

* Review fix: output training parameters.

* Review fix: remove #ifndef __INTELLISENSE__

* Review fix: use std::string instead of char*

* Review fix: update interpolation_abstract.h to say that extract_image_chips can now take the interpolation method as a parameter

* Fix whitespace formatting

* Add more comments

* Fix finding image files for inference

* Resize inference test output to the size of the input; add clarifying remarks

* Resize net output even in calculate_accuracy

* Crop the net output after all, instead of resizing it by interpolation

* For clarity, add an empty line in the console output
......@@ -1856,12 +1856,14 @@ namespace dlib
template <
typename image_type1,
typename image_type2
typename image_type2,
typename interpolation_type
void extract_image_chips (
const image_type1& img,
const std::vector<chip_details>& chip_locations,
dlib::array<image_type2>& chips
dlib::array<image_type2>& chips,
const interpolation_type& interp
// make sure requires clause is not broken
......@@ -1957,9 +1959,9 @@ namespace dlib
// now extract the actual chip
if (level == -1)
......@@ -1970,10 +1972,27 @@ namespace dlib
typename image_type1,
typename image_type2
void extract_image_chips(
const image_type1& img,
const std::vector<chip_details>& chip_locations,
dlib::array<image_type2>& chips
extract_image_chips(img, chip_locations, chips, interpolate_bilinear());
// ----------------------------------------------------------------------------------------
template <
typename image_type1,
typename image_type2,
typename interpolation_type
void extract_image_chip (
const image_type1& img,
const chip_details& location,
image_type2& chip
image_type2& chip,
const interpolation_type& interp
// If the chip doesn't have any rotation or scaling then use the basic version of
......@@ -1988,11 +2007,26 @@ namespace dlib
std::vector<chip_details> chip_locations(1,location);
dlib::array<image_type2> chips;
extract_image_chips(img, chip_locations, chips);
extract_image_chips(img, chip_locations, chips, interp);
swap(chips[0], chip);
// ----------------------------------------------------------------------------------------
template <
typename image_type1,
typename image_type2
void extract_image_chip (
const image_type1& img,
const chip_details& location,
image_type2& chip
extract_image_chip(img, location, chip, interpolate_bilinear());
// ----------------------------------------------------------------------------------------
inline chip_details get_face_chip_details (
......@@ -1163,12 +1163,14 @@ namespace dlib
template <
typename image_type1,
typename image_type2
typename image_type2,
typename interpolation_type
void extract_image_chips (
const image_type1& img,
const std::vector<chip_details>& chip_locations,
dlib::array<image_type2>& chips
dlib::array<image_type2>& chips,
const interpolation_type& interp
......@@ -1185,6 +1187,7 @@ namespace dlib
rectangular sub-windows (i.e. chips) within an image and extracts those
sub-windows, storing each into its own image. It also scales and rotates the
image chips according to the instructions inside each chip_details object.
It uses the interpolation method supplied as a parameter.
- #chips == the extracted image chips
- #chips.size() == chip_locations.size()
- for all valid i:
......@@ -1198,16 +1201,33 @@ namespace dlib
- Any pixels in an image chip that go outside img are set to 0 (i.e. black).
template <
typename image_type1,
typename image_type2
void extract_image_chips (
const image_type1& img,
const std::vector<chip_details>& chip_locations,
dlib::array<image_type2>& chips
- This function is a simple convenience / compatibility wrapper that calls the
above-defined extract_image_chips function using bilinear interpolation.
// ----------------------------------------------------------------------------------------
template <
typename image_type1,
typename image_type2
typename image_type2,
typename interpolation_type
void extract_image_chip (
const image_type1& img,
const chip_details& chip_location,
image_type2& chip
image_type2& chip,
const interpolation_type& interp
......@@ -1215,6 +1235,21 @@ namespace dlib
and stores the single output chip into #chip.
template <
typename image_type1,
typename image_type2
void extract_image_chip (
const image_type1& img,
const chip_details& chip_location,
image_type2& chip
- This function is a simple convenience / compatibility wrapper that calls the
above-defined extract_image_chip function using bilinear interpolation.
// ----------------------------------------------------------------------------------------
template <
......@@ -124,12 +124,22 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
# Don't try to compile these programs using Visual Studio since it causes the
# compiler to run out of RAM and to crash. Maybe someday Visual Studio
# won't be broken :(
# (NB: While the 32-bit VC++ compiler launched by the Visual Studio IDE will
# run out of memory, running a 64-bit MSBuild.exe on the Command Prompt
# seems to work fine. So you can try something like this:
# "C:\Program Files (x86)\MSBuild\14.0\Bin\amd64\MSBuild.exe" C:\path\to\examples.sln /p:Configuration=Release /p:Platform=x64 /t:dnn_imagenet_train_ex
# It does take quite a while to build these examples, though!
# Note that you may additionally need to set Debug Information Format to
# C7 compatible (/Z7), in case you get compiler error "cannot update
# program database".)
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
This example shows how to do semantic segmentation on an image using a net pretrained
on the PASCAL VOC2012 dataset. For an introduction to what segmentation is, see the
accompanying header file dnn_semantic_segmentation_ex.h.
Instructions on how to run the example:
1. Download the PASCAL VOC2012 data, and untar it somewhere.
2. Build the dnn_semantic_segmentation_train_ex example program.
3. Run:
./dnn_semantic_segmentation_train_ex /path/to/VOC2012
4. Wait while the network is being trained.
5. Build the dnn_semantic_segmentation_ex example program.
6. Run:
./dnn_semantic_segmentation_ex /path/to/VOC2012-or-other-images
An alternative to steps 2-4 above is to download a pre-trained network
from here:
It would be a good idea to become familiar with dlib's DNN tooling before reading this
example. So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
before reading this example program.
#include "dnn_semantic_segmentation_ex.h"
#include <iostream>
#include <dlib/data_io.h>
#include <dlib/gui_widgets.h>
using namespace std;
using namespace dlib;
// ----------------------------------------------------------------------------------------
// The PASCAL VOC2012 dataset contains 20 ground-truth classes + background. Each class
// is represented using an RGB color value. We associate each class also to an index in the
// range [0, 20], used internally by the network. To generate nice RGB representations of
// inference results, we need to be able to convert the index values to the corresponding
// RGB values.
// Given an index in the range [0, 20], find the corresponding PASCAL VOC2012 class
// (e.g., 'dog').
//
// Delegates to the predicate-based find_voc2012_class overload, matching on the
// `index` member of each class entry. Whatever that overload does when nothing
// matches also applies here.
const Voc2012class& find_voc2012_class(const uint16_t& index_label)
{
    return find_voc2012_class(
        [&index_label](const Voc2012class& voc2012class)
        {
            return index_label == voc2012class.index;
        }
    );
}
// Convert an index in the range [0, 20] to a corresponding RGB class label.
// The lookup is done by find_voc2012_class; the result is that entry's rgb_label.
inline rgb_pixel index_label_to_rgb_label(uint16_t index_label)
{
    return find_voc2012_class(index_label).rgb_label;
}
// Convert an image containing indexes in the range [0, 20] to a corresponding
// image containing RGB class labels.
void index_label_image_to_rgb_label_image(
const matrix<uint16_t>& index_label_image,
matrix<rgb_pixel>& rgb_label_image
const long nr =;
const long nc =;
rgb_label_image.set_size(nr, nc);
for (long r = 0; r < nr; ++r)
for (long c = 0; c < nc; ++c)
rgb_label_image(r, c) = index_label_to_rgb_label(index_label_image(r, c));
// Find the most prominent class label from amongst the per-pixel predictions.
std::string get_most_prominent_non_background_classlabel(const matrix<uint16_t>& index_label_image)
const long nr =;
const long nc =;
std::vector<unsigned int> counters(class_count);
for (long r = 0; r < nr; ++r)
for (long c = 0; c < nc; ++c)
const uint16_t label = index_label_image(r, c);
const auto max_element = std::max_element(counters.begin() + 1, counters.end());
const uint16_t most_prominent_index_label = max_element - counters.begin();
return find_voc2012_class(most_prominent_index_label).classlabel;
// ----------------------------------------------------------------------------------------
int main(int argc, char** argv) try
if (argc != 2)
cout << "You call this program like this: " << endl;
cout << "./dnn_semantic_segmentation_train_ex /path/to/images" << endl;
cout << endl;
cout << "You will also need a trained 'voc2012net.dnn' file." << endl;
cout << "You can either train it yourself (see example program" << endl;
cout << "dnn_semantic_segmentation_train_ex), or download a" << endl;
cout << "copy from here:" << endl;
return 1;
// Read the file containing the trained network from the working directory.
anet_type net;
deserialize("voc2012net.dnn") >> net;
// Show inference results in a window.
image_window win;
matrix<rgb_pixel> input_image;
matrix<uint16_t> index_label_image;
matrix<rgb_pixel> rgb_label_image;
// Find supported image files.
const std::vector<file> files = dlib::get_files_in_directory_tree(argv[1],
dlib::match_endings(".jpeg .jpg .png"));
cout << "Found " << files.size() << " images, processing..." << endl;
for (const file& file : files)
// Load the input image.
load_image(input_image, file.full_name());
// Create predictions for each pixel. At this point, the type of each prediction
// is an index (a value between 0 and 20). Note that the net may return an image
// that is not exactly the same size as the input.
const matrix<uint16_t> temp = net(input_image);
// Crop the returned image to be exactly the same size as the input.
const chip_details chip_details(
centered_rect( / 2, / 2,,,
extract_image_chip(temp, chip_details, index_label_image, interpolate_nearest_neighbor());
// Convert the indexes to RGB values.
index_label_image_to_rgb_label_image(index_label_image, rgb_label_image);
// Show the input image on the left, and the predicted RGB labels on the right.
win.set_image(join_rows(input_image, rgb_label_image));
// Find the most prominent class label from amongst the per-pixel predictions.
const std::string classlabel = get_most_prominent_non_background_classlabel(index_label_image);
cout << << " : " << classlabel << " - hit enter to process the next image";
catch(std::exception& e)
cout << e.what() << endl;
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
Semantic segmentation using the PASCAL VOC2012 dataset.
In segmentation, the task is to assign each pixel of an input image
a label - for example, 'dog'. Then, the idea is that neighboring
pixels having the same label can be connected together to form a
larger region, representing a complete (or partially occluded) dog.
So technically, segmentation can be viewed as classification of
individual pixels (using the relevant context in the input images),
however the goal usually is to identify meaningful regions that
represent complete entities of interest (such as dogs).
Instructions on how to run the example:
1. Download the PASCAL VOC2012 data, and untar it somewhere.
2. Build the dnn_semantic_segmentation_train_ex example program.
3. Run:
./dnn_semantic_segmentation_train_ex /path/to/VOC2012
4. Wait while the network is being trained.
5. Build the dnn_semantic_segmentation_ex example program.
6. Run:
./dnn_semantic_segmentation_ex /path/to/VOC2012-or-other-images
An alternative to steps 2-4 above is to download a pre-trained network
from here:
It would be a good idea to become familiar with dlib's DNN tooling before reading this
example. So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
before reading this example program.
#include <dlib/dnn.h>
// ----------------------------------------------------------------------------------------
// Equality for RGB pixels: two pixels are equal when all three color channels match.
// Needed for looking up a class by its RGB label.
inline bool operator == (const dlib::rgb_pixel& a, const dlib::rgb_pixel& b)
{
    return a.red == b.red && a.green == b.green && a.blue == b.blue;
}
// ----------------------------------------------------------------------------------------
// The PASCAL VOC2012 dataset contains 20 ground-truth classes + background.  Each class
// is represented using an RGB color value.  We associate each class also to an index in the
// range [0, 20], used internally by the network.
//
// Voc2012class bundles the three representations of a single class. All members are
// const, so an instance cannot be modified (or assigned to) after construction.
struct Voc2012class {
    Voc2012class(uint16_t index, const dlib::rgb_pixel& rgb_label, const std::string& classlabel)
        : index(index), rgb_label(rgb_label), classlabel(classlabel)
    {}

    // The index of the class. In the PASCAL VOC 2012 dataset, indexes from 0 to 20 are valid.
    const uint16_t index = 0;

    // The corresponding RGB representation of the class.
    const dlib::rgb_pixel rgb_label;

    // The label of the class in plain text.
    const std::string classlabel;
};
// Lookup data for the example: the number of classes, and a table that ties each class
// index to its RGB color and its plain-text label. Both live in an unnamed namespace.
namespace {
constexpr int class_count = 21; // background + 20 classes
// One entry per class; searched linearly by find_voc2012_class.
const std::vector<Voc2012class> classes = {
Voc2012class(0, dlib::rgb_pixel(0, 0, 0), ""), // background
// The cream-colored `void' label is used in border regions and to mask difficult objects
// (see
// NOTE(review): the entry below shows only two arguments, whereas the Voc2012class
// constructor takes three (index, rgb_label, classlabel). The leading part of the entry,
// including its index value, appears to be missing from this view - confirm against upstream.
dlib::rgb_pixel(224, 224, 192), "border"),
Voc2012class(1, dlib::rgb_pixel(128, 0, 0), "aeroplane"),
Voc2012class(2, dlib::rgb_pixel( 0, 128, 0), "bicycle"),
Voc2012class(3, dlib::rgb_pixel(128, 128, 0), "bird"),
Voc2012class(4, dlib::rgb_pixel( 0, 0, 128), "boat"),
Voc2012class(5, dlib::rgb_pixel(128, 0, 128), "bottle"),
Voc2012class(6, dlib::rgb_pixel( 0, 128, 128), "bus"),
Voc2012class(7, dlib::rgb_pixel(128, 128, 128), "car"),
Voc2012class(8, dlib::rgb_pixel( 64, 0, 0), "cat"),
Voc2012class(9, dlib::rgb_pixel(192, 0, 0), "chair"),
Voc2012class(10, dlib::rgb_pixel( 64, 128, 0), "cow"),
Voc2012class(11, dlib::rgb_pixel(192, 128, 0), "diningtable"),
Voc2012class(12, dlib::rgb_pixel( 64, 0, 128), "dog"),
Voc2012class(13, dlib::rgb_pixel(192, 0, 128), "horse"),
Voc2012class(14, dlib::rgb_pixel( 64, 128, 128), "motorbike"),
Voc2012class(15, dlib::rgb_pixel(192, 128, 128), "person"),
Voc2012class(16, dlib::rgb_pixel( 0, 64, 0), "pottedplant"),
Voc2012class(17, dlib::rgb_pixel(128, 64, 0), "sheep"),
Voc2012class(18, dlib::rgb_pixel( 0, 192, 0), "sofa"),
Voc2012class(19, dlib::rgb_pixel(128, 192, 0), "train"),
Voc2012class(20, dlib::rgb_pixel( 0, 64, 128), "tvmonitor"),
// Return the first entry of `classes` for which `predicate` returns true.
//
// predicate: a callable taking a const Voc2012class& and returning something
//            convertible to bool.
// Throws std::runtime_error when no entry matches.
template <typename Predicate>
const Voc2012class& find_voc2012_class(Predicate predicate)
{
    const auto i = std::find_if(classes.begin(), classes.end(), predicate);

    if (i != classes.end())
    {
        return *i;
    }

    throw std::runtime_error("Unable to find a matching VOC2012 class");
}
// ----------------------------------------------------------------------------------------
// Introduce the building blocks used to define the segmentation network.
// The network first does residual downsampling (similar to the dnn_imagenet_(train_)ex
// example program), and then residual upsampling. The network could be improved e.g.
// by introducing skip connections from the input image, and/or the first layers, to the
// last layer(s). (See Long et al., Fully Convolutional Networks for Semantic Segmentation,
// Basic convolutional block: a 3x3 convolution with the given stride, the normalization
// layer, a relu, a second 3x3 convolution with stride 1, and the normalization layer again.
// Both convolutions use the same number of filters.
template <int num_filters, template <typename> class NORM, int first_stride, typename INPUT>
using block =
    NORM<dlib::con<num_filters, 3, 3, 1, 1,
    dlib::relu<
    NORM<dlib::con<num_filters, 3, 3, first_stride, first_stride, INPUT>>>>>;
// Same layout as `block`, but built from dlib::cont layers instead of dlib::con layers.
// Only the first (innermost) layer uses the given stride; the second uses stride 1.
template <int num_filters, template <typename> class NORM, int first_stride, typename INPUT>
using blockt =
    NORM<dlib::cont<num_filters, 3, 3, 1, 1,
    dlib::relu<
    NORM<dlib::cont<num_filters, 3, 3, first_stride, first_stride, INPUT>>>>>;
// Residual wrapper without resampling: tags the input (tag1), runs it through BLOCK with
// stride 1, and adds the tagged input back onto the result (add_prev1).
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int num_filters,
    template <typename> class NORM,
    typename INPUT
>
using residual = dlib::add_prev1<BLOCK<num_filters, NORM, 1, dlib::tag1<INPUT>>>;
// Residual wrapper for the downsampling path. The main branch runs BLOCK with stride 2 on
// the tagged input (tag1) and is itself tagged (tag2). The shortcut branch goes back to the
// input (skip1) and applies a 2x2 average pooling with stride 2. Finally the main branch
// is added to the shortcut (add_prev2).
template <
    template <int, template <typename> class, int, typename> class BLOCK,
    int num_filters,
    template <typename> class NORM,
    typename INPUT
>
using residual_down =
    dlib::add_prev2<
    dlib::avg_pool<2, 2, 2, 2,
    dlib::skip1<
    dlib::tag2<BLOCK<num_filters, NORM, 2, dlib::tag1<INPUT>>>>>>;
// Residual wrapper for the upsampling path. The main branch runs `blockt` with stride 2 on
// the tagged input (tag1) and is itself tagged (tag2). The shortcut branch goes back to the
// input (skip1) and applies a 2x2 dlib::cont layer with stride 2. Finally the main branch
// is added to the shortcut (add_prev2).
//
// Note that the first template parameter is accepted only so that the signature parallels
// `residual` and `residual_down`: it is not used, and the file-level `blockt` is always
// the block that gets instantiated.
template <
    template <int, template <typename> class, int, typename> class UNUSED_BLOCK,
    int num_filters,
    template <typename> class NORM,
    typename INPUT
>
using residual_up =
    dlib::add_prev2<
    dlib::cont<num_filters, 2, 2, 2, 2,
    dlib::skip1<
    dlib::tag2<blockt<num_filters, NORM, 2, dlib::tag1<INPUT>>>>>>;
// Convenience aliases: each residual wrapper followed by a relu. The plain names use
// dlib::bn_con as the normalization layer; the a-prefixed names use dlib::affine instead.

// Same-resolution residual units.
template <int N, typename SUBNET>
using res = dlib::relu<residual<block, N, dlib::bn_con, SUBNET>>;

template <int N, typename SUBNET>
using ares = dlib::relu<residual<block, N, dlib::affine, SUBNET>>;

// Downsampling residual units.
template <int N, typename SUBNET>
using res_down = dlib::relu<residual_down<block, N, dlib::bn_con, SUBNET>>;

template <int N, typename SUBNET>
using ares_down = dlib::relu<residual_down<block, N, dlib::affine, SUBNET>>;

// Upsampling residual units.
template <int N, typename SUBNET>
using res_up = dlib::relu<residual_up<block, N, dlib::bn_con, SUBNET>>;

template <int N, typename SUBNET>
using ares_up = dlib::relu<residual_up<block, N, dlib::affine, SUBNET>>;
// ----------------------------------------------------------------------------------------
// Each level stacks three residual units with the same filter count. Reading from the
// input side, the first unit is the one that may resample; the other two keep the
// resolution. Names starting with 'a' use the affine variants, and names ending in 't'
// use an upsampling unit in place of the downsampling one.

// Downsampling levels (batch normalization). Note that level4 does not downsample.
template <typename SUBNET> using level1 = res<512, res<512, res_down<512, SUBNET>>>;
template <typename SUBNET> using level2 = res<256, res<256, res_down<256, SUBNET>>>;
template <typename SUBNET> using level3 = res<128, res<128, res_down<128, SUBNET>>>;
template <typename SUBNET> using level4 = res<64, res<64, res<64, SUBNET>>>;

// Downsampling levels (affine). Note that alevel4 does not downsample.
template <typename SUBNET> using alevel1 = ares<512, ares<512, ares_down<512, SUBNET>>>;
template <typename SUBNET> using alevel2 = ares<256, ares<256, ares_down<256, SUBNET>>>;
template <typename SUBNET> using alevel3 = ares<128, ares<128, ares_down<128, SUBNET>>>;
template <typename SUBNET> using alevel4 = ares<64, ares<64, ares<64, SUBNET>>>;

// Upsampling levels (batch normalization).
template <typename SUBNET> using level1t = res<512, res<512, res_up<512, SUBNET>>>;
template <typename SUBNET> using level2t = res<256, res<256, res_up<256, SUBNET>>>;
template <typename SUBNET> using level3t = res<128, res<128, res_up<128, SUBNET>>>;
template <typename SUBNET> using level4t = res<64, res<64, res_up<64, SUBNET>>>;

// Upsampling levels (affine).
template <typename SUBNET> using alevel1t = ares<512, ares<512, ares_up<512, SUBNET>>>;
template <typename SUBNET> using alevel2t = ares<256, ares<256, ares_up<256, SUBNET>>>;
template <typename SUBNET> using alevel3t = ares<128, ares<128, ares_up<128, SUBNET>>>;
template <typename SUBNET> using alevel4t = ares<64, ares<64, ares_up<64, SUBNET>>>;
// ----------------------------------------------------------------------------------------
// training network type
using net_type = dlib::loss_multiclass_log_per_pixel<
// testing network type (replaced batch normalization with fixed affine transforms)
using anet_type = dlib::loss_multiclass_log_per_pixel<
// ----------------------------------------------------------------------------------------
\ No newline at end of file
