Commit 902a2bee authored by Davis King

Fleshed out these examples more.

parent 02b844ea
...@@ -57,9 +57,9 @@ int main(int argc, char** argv) try
// even define your own types by creating custom input layers.
//
// Then the middle layers define the computation the network will do to transform the
// input into whatever we want. Here we run the image through multiple convolutions,
// ReLU units, max pooling operations, and then finally a fully connected layer that
// converts the whole thing into just 10 numbers.
//
// Finally, the loss layer defines the relationship between the network outputs, our 10
// numbers, and the labels in our dataset. Since we selected loss_multiclass_log it
...
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
This is an example illustrating the use of the deep learning tools from the
dlib C++ Library. I'm assuming you have already read the dnn_mnist_ex.cpp
example. So in this example program I'm going to go over a number of more
advanced parts of the API, including:
- Training on large datasets that don't fit in memory
- Defining large networks
- Accessing and configuring layers in a network
*/
#include <dlib/dnn.h>
...@@ -9,19 +19,58 @@ using namespace dlib;
// ----------------------------------------------------------------------------------------
// Let's start by showing how you can conveniently define large networks. The
// most important tools for doing this are C++'s alias templates. These let us
// define new layer types that are combinations of a bunch of other layers.
// These will form the building blocks for more complex networks.
// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun). You can see a few things in this statement. The most obvious is
// that we have combined a bunch of layers into the name "base_res". You can
// also see the use of the tag1 layer. This layer doesn't do any computation.
// It exists solely so other layers can refer to it. In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer. This combination allows us to
// implement skip and residual style networks.
template <int stride, typename SUBNET>
using base_res = relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
// Let's also define the same block but with all the batch normalization layers
// replaced with affine transform layers. We will use this type of construction
// when testing our networks.
template <int stride, typename SUBNET>
using base_ares = relu<add_prev1<affine<con<8,3,3,1,1,relu<affine<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
// And of course we can define more alias templates based on previously defined
// alias templates. The _down versions downsample the inputs by a factor of 2
// while the res and ares layer types don't.
template <typename SUBNET> using res = base_res<1,SUBNET>;
template <typename SUBNET> using res_down = base_res<2,SUBNET>;
template <typename SUBNET> using ares = base_ares<1,SUBNET>;
template <typename SUBNET> using ares_down = base_ares<2,SUBNET>;
// Now that we have these convenient aliases, we can define a residual network
// without a lot of typing. Note the use of a repeat layer. This special layer
// type allows us to type repeat<9,res<SUBNET>> instead of
// res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>.
const unsigned long number_of_classes = 10;
using net_type = loss_multiclass_log<fc<number_of_classes,
avg_pool<11,11,11,11,
res<res<res<res_down<
repeat<9,res, // repeat this layer 9 times
res_down<
res<
input<matrix<unsigned char>
>>>>>>>>>>>;
// And finally, let's define a residual network building block that uses
// parametric ReLU units instead of regular ReLU.
template <typename SUBNET>
using pres = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
// ----------------------------------------------------------------------------------------
...@@ -29,7 +78,10 @@ int main(int argc, char** argv) try
{
if (argc != 2)
{
cout << "This example needs the MNIST dataset to run!" << endl;
cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
cout << "put them in a folder. Then give that folder as input to this program." << endl;
return 1;
}
...@@ -40,71 +92,88 @@ int main(int argc, char** argv) try
load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);
// dlib uses cuDNN under the covers. One of the features of cuDNN is the
// option to use slower methods that use less RAM or faster methods that use
// a lot of RAM. If you find that you run out of RAM on your graphics card
// then you can call this function and we will request the slower but more
// RAM frugal cuDNN algorithms.
set_dnn_prefer_smallest_algorithms();
// Create a network as defined above. This network will produce 10 outputs
// because that's how we defined net_type. However, fc layers can have the
// number of outputs they produce changed at runtime.
net_type net;
// So if you wanted to use the same network but override the number of
// outputs at runtime you can do so like this:
net_type net2(num_fc_outputs(15));
// Now, let's imagine we wanted to replace some of the relu layers with
// prelu layers. We might do it like this:
using net_type2 = loss_multiclass_log<fc<number_of_classes,
avg_pool<11,11,11,11,
pres<res<res<res_down< // 2 prelu layers here
tag4<repeat<9,pres, // 9 groups, each containing 2 prelu layers
res_down<
res<
input<matrix<unsigned char>
>>>>>>>>>>>>;
// prelu layers have a floating point parameter. If you want to set it to
// something other than its default value you can do so like this:
net_type2 pnet(prelu_(0.2),
prelu_(0.2),
repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat
// layer. repeat_group() is needed to group the
// things that are part of repeat's block.
);
// As you can see, a network will greedily assign things given to its
// constructor to the layers inside itself. The assignment is done in the
// order the layers are defined, but it will skip layers where the
// assignment doesn't make sense.
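// As a purely illustrative sketch (not part of the original example): if you
// pass fewer arguments than there are prelu layers, the remaining layers
// simply keep their default settings. So this hypothetical pnet2 sets only
// the first prelu layer's parameter and leaves all the others at 0.25's
// default counterpart.
net_type2 pnet2(prelu_(0.25));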
// The API shown above lets you modify layers at construction time. But
// what about after that? There are a number of ways to access layers
// inside a net object.

// You can access sub layers of the network like this to get their output
// tensors. The following 3 statements are all equivalent and access the
// same layer's output.
pnet.subnet().subnet().subnet().get_output();
layer<3>(pnet).get_output();
layer<prelu>(pnet).get_output();
// Similarly, to get access to the prelu_ object that defines the layer's
// behavior we can say:
pnet.subnet().subnet().subnet().layer_details();
// or
layer<prelu>(pnet).layer_details();
// So for example, to print the prelu parameter:
cout << "first prelu layer's initial param value: "
<< pnet.subnet().subnet().subnet().layer_details().get_initial_param_value() << endl;
// From this it should be clear that layer() is a general tool for accessing
// sub layers. It makes repeated calls to subnet() so you don't have to.
// One of its most important uses is to access tagged layers. For example,
// to access the first tag1 layer we can say:
layer<tag1>(pnet);
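// As an illustrative addition (not in the original example), layer() and
// get_output() can be combined to inspect a tagged layer's output tensor.
// Note that the tensor will be empty until the network has processed data.
const tensor& tag1_out = layer<tag1>(pnet).get_output();
cout << "tag1 output tensor size: " << tag1_out.size() << endl;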
// To further illustrate the use of layer(), let's loop over the repeated
// prelu layers and print out their parameters. But first, let's grab a
// reference to the repeat layer. Since we tagged the repeat layer we can
// access it using the layer() method. layer<tag4>(pnet) returns the tag4
// layer, but we want the repeat layer right after it so we can give an
// integer as the second argument and it will jump that many layers down the
// network. In our case we need to jump just 1 layer down to get to repeat.
auto&& repeat_layer = layer<tag4,1>(pnet);
for (size_t i = 0; i < repeat_layer.num_repetitions(); ++i)
{
// The repeat layer just instantiates the network block a bunch of
// times. get_repeated_layer() allows us to grab each of these
// instances.
auto&& repeated_layer = repeat_layer.get_repeated_layer(i);
// Now that we have the i-th layer inside our repeat layer we can look
// at its properties. Recall that we repeated the "pres" network block,
// which is itself a network with a bunch of layers. So we can again
// use layer() to jump to the prelu layers we are interested in like so:
prelu_ prelu1 = layer<prelu>(repeated_layer).layer_details();
prelu_ prelu2 = layer<prelu>(repeated_layer.subnet()).layer_details();
cout << "first prelu layer parameter value: " << prelu1.get_initial_param_value() << endl;
...@@ -114,13 +183,34 @@ int main(int argc, char** argv) try
// Ok, so that's enough talk about defining networks. Let's talk about
// training networks!
// The dnn_trainer will use SGD by default, but you can tell it to use
// different solvers like adam.
dnn_trainer<net_type,adam> trainer(net,adam(0.001));
trainer.be_verbose();
trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
// While the trainer is running it keeps an eye on the training error. If
// it looks like the error hasn't decreased for the last 2000 iterations it
// will automatically shrink the step size by a factor of 0.1. You can
// change these default parameters to other values by calling these
// functions. Or disable the automatic shrinking entirely by setting the
// shrink amount to 1.
trainer.set_iterations_without_progress_threshold(2000);
trainer.set_step_size_shrink_amount(0.1);
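// For instance (an illustrative setting, not used in this program), the
// automatic shrinking could be turned off entirely like this:
//   trainer.set_step_size_shrink_amount(1);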
// Now, what if your training dataset is so big it doesn't fit in RAM? You
// make mini-batches yourself, any way you like, and you send them to the
// trainer by repeatedly calling trainer.train_one_step().
//
// For example, the loop below streams MNIST data to our trainer.
std::vector<matrix<unsigned char>> mini_batch_samples;
std::vector<unsigned long> mini_batch_labels;
dlib::rand rnd;
// Loop until the trainer's automatic shrinking has shrunk the step size to
// less than 1e-3. For the default shrink amount of 0.1 this means stop
// after it shrinks it 3 times.
while(trainer.get_step_size() >= 1e-3)
{
mini_batch_samples.clear();
...@@ -136,28 +226,42 @@ int main(int argc, char** argv) try
trainer.train_one_step(mini_batch_samples, mini_batch_labels);
}
// When you call train_one_step(), the trainer will do its processing in a
// separate thread. This allows the main thread to work on loading data
// while the trainer is busy executing the mini-batches in parallel.
// However, this also means we need to wait for any mini-batches that are
// still executing to stop before we mess with the net object. Calling
// get_net() performs the necessary synchronization.
trainer.get_net();
net.clean();
serialize("mnist_res_network.dat") << net;
// Now we have a trained network. However, it has batch normalization
// layers in it. As is customary, we should replace these with simple
// affine layers before we use the network. This can be accomplished by
// making a network type which is identical to net_type but with the batch
// normalization layers replaced with affine. For example:
using test_net_type = loss_multiclass_log<fc<number_of_classes,
avg_pool<11,11,11,11,
ares<ares<ares<ares_down<
repeat<9,ares,
ares_down<
ares<
input<matrix<unsigned char>
>>>>>>>>>>>;
// Then we can simply assign our trained net to our testing net.
test_net_type tnet = net;
// Or if you only had a file with your trained network you could deserialize
// it directly into your testing network.
deserialize("mnist_res_network.dat") >> tnet;
// And finally, we can run the testing network over our data.
std::vector<unsigned long> predicted_labels = tnet(training_images);
int num_right = 0;
int num_wrong = 0;
...