Commit a362305e authored Aug 27, 2017 by Davis King

    cleanup

parent 5e9f094d

Showing 1 changed file with 45 additions and 42 deletions:
  examples/dnn_mmod_find_cars_ex.cpp  (+45, -42)
@@ -2,9 +2,9 @@
 /*
     This example shows how to run a CNN based vehicle detector using dlib.  The
     example loads a pretrained model and uses it to find the rear ends of cars in
-    images.  We will also visualize some of the detector's processing steps by
+    an image.  We will also visualize some of the detector's processing steps by
     plotting various intermediate images on the screen.  Viewing these can help
-    understand how the detector works.
+    you understand how the detector works.

     The model used by this example was trained by the dnn_mmod_train_find_cars_ex.cpp
     example.  Also, since this is a CNN, you really should use a GPU to get the
@@ -34,7 +34,7 @@ using namespace dlib;

-// the dnn rear view vehicle detector network
+// The rear view vehicle detector network
 template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
 template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
 template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
@@ -63,9 +63,9 @@ int main() try
     for (auto&& d : net(img))
     {
         // We use a shape_predictor to refine the exact shape and location of the detection
-        // box.  This shape_predictor is trained to simply output the 4 corner points.  So
-        // all we do is make a rectangle that tightly contains those 4 points and that
-        // rectangle is our refined detection position.
+        // box.  This shape_predictor is trained to simply output the 4 corner points of
+        // the box.  So all we do is make a rectangle that tightly contains those 4 points
+        // and that rectangle is our refined detection position.
         auto fd = sp(img,d);
         rectangle rect;
         for (unsigned long j = 0; j < fd.num_parts(); ++j)
@@ -79,18 +79,18 @@ int main() try
     cin.get();

-    // Now let's look at how the detector works.  The top level processing steps look like:
+    // Now let's look at how the detector works.  The high level processing steps look like:
     //   1. Create an image pyramid and pack the pyramid into one big image.  We call this
     //      the "tiled pyramid image".
     //   2. Run the tiled pyramid image through the CNN.  The CNN outputs a new image where
     //      bright pixels in the output image indicate the presence of cars.
-    //   3. Find pixels in the CNN output image with a value > 0.  Those locations are your
+    //   3. Find pixels in the CNN's output image with a value > 0.  Those locations are your
     //      preliminary car detections.
     //   4. Perform non-maximum suppression on the preliminary detections to produce the
     //      final output.
     //
     // We will be plotting the images from steps 1 and 2 so you can visualize what's
-    // happening.  For the CNN output image, we will use the jet colormap so that "bright"
+    // happening.  For the CNN's output image, we will use the jet colormap so that "bright"
     // outputs, i.e. pixels with big values, appear in red and "dim" outputs appear as a
     // cold blue color.  To do this we pick a range of CNN output values for the color
     // mapping.  The specific values don't matter.  They are just selected to give a nice
@@ -104,24 +104,28 @@ int main() try
     // Create a tiled pyramid image and display it on the screen.
     std::vector<rectangle> rects;
     matrix<rgb_pixel> tiled_img;
-    create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(img,
-        tiled_img, rects, input_layer(net).get_pyramid_padding(),
-        input_layer(net).get_pyramid_outer_padding());
+    // Get the type of pyramid the CNN used
+    using pyramid_type = std::remove_reference<decltype(input_layer(net))>::type::pyramid_type;
+    // And tell create_tiled_pyramid to create the pyramid using that pyramid type.
+    create_tiled_pyramid<pyramid_type>(img, tiled_img, rects,
+        input_layer(net).get_pyramid_padding(),
+        input_layer(net).get_pyramid_outer_padding());
     image_window winpyr(tiled_img, "Tiled pyramid image");

-    // This CNN detector represents a sliding window detector with 3 sliding windows, one
-    // for each aspect ratio of vehicle box.  The aspect ratio of a detection is determined
-    // by which channel in the output image triggers the detection.  Here we are just going
-    // to max pool the channels together to get one final image for our display.  In this
-    // image, a pixel will be bright if any of the sliding window detectors thinks there is
-    // a car at that location.
+    // This CNN detector represents a sliding window detector with 3 sliding windows.  Each
+    // of the 3 windows has a different aspect ratio, allowing it to find vehicles which
+    // are either tall and skinny, squarish, or short and wide.  The aspect ratio of a
+    // detection is determined by which channel in the output image triggers the detection.
+    // Here we are just going to max pool the channels together to get one final image for
+    // our display.  In this image, a pixel will be bright if any of the sliding window
+    // detectors thinks there is a car at that location.
     cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
     matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
     for (long k = 1; k < net.subnet().get_output().k(); ++k)
         network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));

-    // We will also upsample the CNN output image.  The CNN we defined has an 8x
+    // We will also upsample the CNN's output image.  The CNN we defined has an 8x
     // downsampling layer at the beginning.  In the code below we are going to overlay this
     // CNN output image on top of the raw input image.  To make that look nice it helps to
     // upsample the CNN output image back to the same resolution as the input image, which
@@ -135,10 +139,9 @@ int main() try
     // Also, overlay network_output on top of the tiled image pyramid and display it.
-    matrix<rgb_pixel> tiled_img_sal = tiled_img;
-    for (long r = 0; r < tiled_img_sal.nr(); ++r)
+    for (long r = 0; r < tiled_img.nr(); ++r)
     {
-        for (long c = 0; c < tiled_img_sal.nc(); ++c)
+        for (long c = 0; c < tiled_img.nc(); ++c)
         {
             dpoint tmp(c,r);
             tmp = input_tensor_to_output_tensor(net, tmp);

@@ -151,16 +154,16 @@ int main() try
                 rgb_alpha_pixel p;
                 assign_pixel(p , colormap_jet(val,lower,upper));
                 p.alpha = 120;
-                assign_pixel(tiled_img_sal(r,c), p);
+                assign_pixel(tiled_img(r,c), p);
             }
         }
     }
-    // If you look at this image you can see that the vehicles get bright red blobs on
-    // them.  That's the CNN saying "there is a car here!".  You will also notice that
-    // there is a certain scale it finds cars at.  They have to be not too big or too
-    // small, which is why we have an image pyramid.  The pyramid allows us to find cars of
-    // all scales.
-    image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
+    // If you look at this image you can see that the vehicles have bright red blobs on
+    // them.  That's the CNN saying "there is a car here!".  You will also notice there is
+    // a certain scale at which it finds cars.  They have to be not too big or too small,
+    // which is why we have an image pyramid.  The pyramid allows us to find cars of all
+    // scales.
+    image_window win_pyr_overlay(tiled_img, "Detection scores on image pyramid");
@@ -169,17 +172,17 @@ int main() try
     // actually do this step, since it's enough to threshold the tiled pyramid image to get
     // the detections.  However, it makes a nice visualization and clearly indicates that
     // the detector is firing for all the cars.
-    matrix<float> collapsed_saliency(img.nr(), img.nc());
+    matrix<float> collapsed(img.nr(), img.nc());
     resizable_tensor input_tensor;
     input_layer(net).to_tensor(&img, &img+1, input_tensor);
-    for (long r = 0; r < collapsed_saliency.nr(); ++r)
+    for (long r = 0; r < collapsed.nr(); ++r)
     {
-        for (long c = 0; c < collapsed_saliency.nc(); ++c)
+        for (long c = 0; c < collapsed.nc(); ++c)
         {
-            // Loop over a bunch of scale values and look up what part of network_output corresponds to
-            // the point(c,r) in the original image, then take the max saliency value over
-            // all the scales and save it at pixel point(c,r).
-            float max_sal = -1e30;
+            // Loop over a bunch of scale values and look up what part of network_output
+            // corresponds to the point(c,r) in the original image, then take the max
+            // detection score over all the scales and save it at pixel point(c,r).
+            float max_score = -1e30;
             for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
             {
                 // Map from input image coordinates to tiled pyramid coordinates.
@@ -190,24 +193,24 @@ int main() try
                 if (get_rect(network_output).contains(tmp))
                 {
                     float val = network_output(tmp.y(),tmp.x());
-                    if (val > max_sal)
-                        max_sal = val;
+                    if (val > max_score)
+                        max_score = val;
                 }
             }
-            collapsed_saliency(r,c) = max_sal;
+            collapsed(r,c) = max_score;

-            // Also blend the saliency into the original input image so we can view it as
+            // Also blend the scores into the original input image so we can view it as
             // an overlay on the cars.
             rgb_alpha_pixel p;
-            assign_pixel(p , colormap_jet(max_sal,lower,upper));
+            assign_pixel(p , colormap_jet(max_score,lower,upper));
             p.alpha = 120;
             assign_pixel(img(r,c), p);
         }
     }
-    image_window win_collapsed(jet(collapsed_saliency, upper, lower), "collapsed saliency map");
-    image_window win_img_and_sal(img);
+    image_window win_collapsed(jet(collapsed, upper, lower), "Collapsed output tensor from the network");
+    image_window win_img_and_sal(img, "Collapsed detection scores on raw image");

     cout << "Hit enter to end program" << endl;