Commit 954612b6 authored by Davis King

Cleaned up the code a bit. Still more cleaning to do.

parent b191400a
dlib/image_processing/shape_predictor.h
@@ -9,6 +9,7 @@
#include "../matrix.h"
#include "../geometry.h"
#include "../pixel.h"
#include "../console_progress_indicator.h"
namespace dlib
{
@@ -392,34 +393,131 @@ namespace dlib
!*/
public:
shape_predictor_trainer (
)
{
_cascade_depth = 10;
_tree_depth = 2;
_num_trees_per_cascade_level = 500;
_nu = 0.1;
_oversampling_amount = 20;
_feature_pool_size = 400;
_lambda = 0.1;
_num_test_splits = 20;
_feature_pool_region_padding = 0;
_verbose = false;
}
unsigned long get_cascade_depth (
) const { return _cascade_depth; }
void set_cascade_depth (
unsigned long depth
)
{
DLIB_CASSERT(depth > 0, "");
_cascade_depth = depth;
}
unsigned long get_tree_depth (
) const { return _tree_depth; }
void set_tree_depth (
unsigned long depth
)
{
DLIB_CASSERT(depth > 0, "");
_tree_depth = depth;
}
unsigned long get_num_trees_per_cascade_level (
) const { return _num_trees_per_cascade_level; }
void set_num_trees_per_cascade_level (
unsigned long num
)
{
DLIB_CASSERT( num > 0, "");
_num_trees_per_cascade_level = num;
}
double get_nu (
) const { return _nu; }
void set_nu (
double nu
)
{
DLIB_CASSERT(nu > 0,"");
_nu = nu;
}
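// Note: nu acts like the shrinkage (learning rate) term in gradient boosting.
// It is typically chosen in (0,1]; smaller values need more trees per cascade
// level but are less prone to overfitting.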
std::string get_random_seed (
) const { return rnd.get_seed(); }
void set_random_seed (
const std::string& seed
) { rnd.set_seed(seed); }
unsigned long get_oversampling_amount (
) const { return _oversampling_amount; }
void set_oversampling_amount (
unsigned long amount
)
{
DLIB_CASSERT(amount > 0, "");
_oversampling_amount = amount;
}
unsigned long get_feature_pool_size (
) const { return _feature_pool_size; }
void set_feature_pool_size (
unsigned long size
)
{
DLIB_CASSERT(size > 1, "");
_feature_pool_size = size;
}
double get_lambda (
) const { return _lambda; }
void set_lambda (
double lambda
)
{
DLIB_CASSERT(lambda > 0, "");
_lambda = lambda;
}
unsigned long get_num_test_splits (
) const { return _num_test_splits; }
void set_num_test_splits (
unsigned long num
)
{
DLIB_CASSERT(num > 0, "");
_num_test_splits = num;
}
double get_feature_pool_region_padding (
) const { return _feature_pool_region_padding; }
void set_feature_pool_region_padding (
double padding
)
{
_feature_pool_region_padding = padding;
}
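// Note: the padding expands the box from which the candidate feature pixels
// are sampled (see randomly_sample_pixel_coordinates() below).  A padding of
// 0 samples pixels from the tight bounding box of the mean shape.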
void be_verbose (
)
{
_verbose = true;
}
void be_quiet (
)
{
_verbose = false;
}
template <typename image_array>
shape_predictor train (
@@ -430,17 +528,21 @@ namespace dlib
using namespace impl;
DLIB_CASSERT(images.size() == objects.size() && images.size() > 0, "");
rnd.set_seed(get_random_seed());
std::vector<training_sample> samples;
const matrix<float,0,1> initial_shape = populate_training_sample_shapes(objects, samples);
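// Note: initial_shape is the mean of the training shapes.  It doubles as the
// starting shape from which the cascade iteratively moves toward the target.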
const std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape);
unsigned long trees_fit_so_far = 0;
console_progress_indicator pbar(get_cascade_depth()*get_num_trees_per_cascade_level());
if (_verbose)
std::cout << "Fitting trees..." << std::endl;
std::vector<std::vector<impl::regression_tree> > forests(get_cascade_depth());
// Now start doing the actual training by filling in the forests
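// Each tree is fit to the current residuals (target_shape - current_shape),
// so the cascade as a whole is trained by gradient boosting: every tree
// nudges the running shape estimate toward the annotated target.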
for (unsigned long cascade = 0; cascade < get_cascade_depth(); ++cascade)
{
// Each cascade uses a different set of pixels for its features. We compute
// their representations relative to the initial shape first.
std::vector<unsigned long> anchor_idx;
@@ -457,11 +559,20 @@
}
// Now start building the trees at this cascade level.
for (unsigned long i = 0; i < get_num_trees_per_cascade_level(); ++i)
{
forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade]));
if (_verbose)
{
++trees_fit_so_far;
pbar.print_status(trees_fit_so_far);
}
}
}
if (_verbose)
std::cout << "Training complete " << std::endl;
return shape_predictor(initial_shape, forests, pixel_coordinates);
}
@@ -488,7 +599,7 @@ namespace dlib
/*!
CONVENTION
- feature_pixel_values.size() == get_feature_pool_size()
- feature_pixel_values[j] == the value of the j-th feature pool
pixel when you look it up relative to the shape in current_shape.
@@ -527,7 +638,7 @@ namespace dlib
impl::regression_tree tree;
// walk the tree in breadth first order
const unsigned long num_split_nodes = static_cast<unsigned long>(std::pow(2.0, (double)get_tree_depth())-1);
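// (A complete binary tree of depth D has 2^D-1 internal split nodes and 2^D
// leaves, so the sums array below needs num_split_nodes*2+1 entries in all.)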
std::vector<matrix<float,0,1> > sums(num_split_nodes*2+1);
for (unsigned long i = 0; i < samples.size(); ++i)
sums[0] += samples[i].target_shape - samples[i].current_shape;
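// sums[0] now holds the total shape residual over all samples; the leaf
// values of the tree are averages of these residuals for the samples that
// end up in each leaf.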
@@ -574,8 +685,8 @@ namespace dlib
double accept_prob;
do
{
feat.idx1 = rnd.get_random_32bit_number()%get_feature_pool_size();
feat.idx2 = rnd.get_random_32bit_number()%get_feature_pool_size();
const double dist = length(pixel_coordinates[feat.idx1]-pixel_coordinates[feat.idx2]);
accept_prob = std::exp(-dist/lambda);
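// Following Kazemi and Sullivan, a candidate pixel pair is accepted with
// probability exp(-dist/lambda), which biases the split features toward
// comparisons between nearby pixels.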
}
@@ -699,7 +810,7 @@ namespace dlib
sample.image_idx = i;
sample.rect = objects[i][j].get_rect();
sample.target_shape = object_to_shape(objects[i][j]);
for (unsigned long itr = 0; itr < get_oversampling_amount(); ++itr)
samples.push_back(sample);
mean_shape += sample.target_shape;
++count;
@@ -711,7 +822,7 @@ namespace dlib
// now go pick random initial shapes
for (unsigned long i = 0; i < samples.size(); ++i)
{
if ((i%get_oversampling_amount()) == 0)
{
// The mean shape is what we really use as an initial shape so always
// include it in the training set as an example starting shape.
@@ -742,13 +853,13 @@ namespace dlib
) const
/*!
ensures
- #pixel_coordinates.size() == get_feature_pool_size()
- for all valid i:
- pixel_coordinates[i] == a point in the box defined by the min/max x/y arguments.
!*/
{
pixel_coordinates.resize(get_feature_pool_size());
for (unsigned long i = 0; i < get_feature_pool_size(); ++i)
{
pixel_coordinates[i].x() = rnd.get_random_double()*(max_x-min_x) + min_x;
pixel_coordinates[i].y() = rnd.get_random_double()*(max_y-min_y) + min_y;
@@ -769,15 +880,26 @@ namespace dlib
const double max_y = max(colm(temp,1))+padding;
std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates;
pixel_coordinates.resize(get_cascade_depth());
for (unsigned long i = 0; i < get_cascade_depth(); ++i)
randomly_sample_pixel_coordinates(pixel_coordinates[i], min_x, min_y, max_x, max_y);
return pixel_coordinates;
}
mutable dlib::rand rnd;
unsigned long _cascade_depth;
unsigned long _tree_depth;
unsigned long _num_trees_per_cascade_level;
double _nu;
unsigned long _oversampling_amount;
unsigned long _feature_pool_size;
double _lambda;
unsigned long _num_test_splits;
double _feature_pool_region_padding;
bool _verbose;
};
// ----------------------------------------------------------------------------------------
dlib/image_processing/shape_predictor_abstract.h
@@ -16,6 +16,18 @@ namespace dlib
class shape_predictor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is a tool that takes in an image region containing some object
and outputs a "shape" or set of point locations that define the pose of the
object. The classic example of this is human face pose prediction, where
you take an image of a human face as input and are expected to identify the
locations of important facial landmarks such as the corners of the mouth
and eyes, tip of the nose, and so forth.
To create useful instantiations of this object you need to use the
shape_predictor_trainer object defined below to train a shape_predictor
using a set of training images, each annotated with shapes you want to
predict.
!*/
public:
@@ -23,13 +35,15 @@
shape_predictor (
);
/*!
ensures
- #num_parts() == 0
!*/
unsigned long num_parts (
) const;
/*!
ensures
- returns the number of parts in the shapes predicted by this object.
!*/
template <typename image_type>
@@ -42,10 +56,18 @@
- image_type == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
ensures
- Runs the shape prediction algorithm on the part of the image contained in
the given bounding rectangle. That is, it will try to fit the shape model to
the contents of the given rectangle in the image. For example, if there
is a human face inside the rectangle and you use a face landmarking shape
model then this function will return the locations of the face landmarks
as the parts. So the return value is a full_object_detection DET such
that:
- DET.get_rect() == rect
- DET.num_parts() == num_parts()
- for all valid i:
- DET.part(i) == the location in img for the i-th part of the shape
predicted by this object.
!*/
};
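// A minimal usage sketch (not part of this commit; the call-operator syntax,
// file name, and rectangle below are illustrative assumptions):
//
//     #include <dlib/image_processing.h>
//     #include <dlib/image_io.h>
//     #include <iostream>
//
//     dlib::array2d<unsigned char> img;
//     dlib::load_image(img, "face.jpg");   // hypothetical image file
//     dlib::shape_predictor sp;            // e.g. produced by
//                                          // shape_predictor_trainer::train()
//     dlib::full_object_detection det = sp(img, dlib::rectangle(50,50,200,200));
//     for (unsigned long i = 0; i < det.num_parts(); ++i)
//         std::cout << "part " << i << ": " << det.part(i) << std::endl;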
@@ -61,46 +83,208 @@ namespace dlib
class shape_predictor_trainer
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is a tool for training shape_predictors based on annotated training
images. Its implementation uses the algorithm described in:
One Millisecond Face Alignment with an Ensemble of Regression Trees
by Vahid Kazemi and Josephine Sullivan, CVPR 2014
!*/
public:
shape_predictor_trainer (
)
{
_cascade_depth = 10;
_tree_depth = 2;
_num_trees_per_cascade_level = 500;
_nu = 0.1;
_oversampling_amount = 20;
_feature_pool_size = 400;
_lambda = 0.1;
_num_test_splits = 20;
_feature_pool_region_padding = 0;
_verbose = false;
}
unsigned long get_cascade_depth (
) const;
/*!
!*/
void set_cascade_depth (
unsigned long depth
);
/*!
requires
- depth > 0
ensures
- #get_cascade_depth() == depth
!*/
unsigned long get_tree_depth (
) const;
/*!
!*/
void set_tree_depth (
unsigned long depth
);
/*!
requires
- depth > 0
ensures
- #get_tree_depth() == depth
!*/
unsigned long get_num_trees_per_cascade_level (
) const;
/*!
!*/
void set_num_trees_per_cascade_level (
unsigned long num
);
/*!
requires
- num > 0
ensures
- #get_num_trees_per_cascade_level() == num
!*/
double get_nu (
) const;
/*!
!*/
void set_nu (
double nu
);
/*!
requires
- nu > 0
ensures
- #get_nu() == nu
!*/
std::string get_random_seed (
) const;
/*!
!*/
void set_random_seed (
const std::string& seed
);
/*!
ensures
- #get_random_seed() == seed
!*/
unsigned long get_oversampling_amount (
) const;
/*!
!*/
void set_oversampling_amount (
unsigned long amount
);
/*!
requires
- amount > 0
ensures
- #get_oversampling_amount() == amount
!*/
unsigned long get_feature_pool_size (
) const;
/*!
!*/
void set_feature_pool_size (
unsigned long size
);
/*!
requires
- size > 1
ensures
- #get_feature_pool_size() == size
!*/
double get_lambda (
) const;
/*!
!*/
void set_lambda (
double lambda
);
/*!
requires
- lambda > 0
ensures
- #get_lambda() == lambda
!*/
unsigned long get_num_test_splits (
) const;
/*!
!*/
void set_num_test_splits (
unsigned long num
);
/*!
requires
- num > 0
ensures
- #get_num_test_splits() == num
!*/
double get_feature_pool_region_padding (
) const;
/*!
!*/
void set_feature_pool_region_padding (
double padding
);
/*!
ensures
- #get_feature_pool_region_padding() == padding
!*/
void be_verbose (
);
/*!
ensures
- This object will print status messages to standard out so that a
user can observe the progress of the algorithm.
!*/
void be_quiet (
);
/*!
ensures
- this object will not print anything to standard out
!*/
template <typename image_array>
shape_predictor train (
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects
) const;
/*!
requires
- images.size() == objects.size()
- images.size() > 0
ensures
- This object will try to learn to predict the locations of an object's parts
based on the object bounding box (i.e. full_object_detection::get_rect())
and the image pixels in that box. That is, we will try to learn a
shape_predictor, SP, such that:
SP(images[i], objects[i][j].get_rect()) == objects[i][j]
This learned SP object is then returned.
!*/
};
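// A minimal training sketch (not part of this commit; the container types and
// the data-loading step are illustrative assumptions):
//
//     dlib::array<dlib::array2d<unsigned char> > images;
//     std::vector<std::vector<dlib::full_object_detection> > objects;
//     // ... fill images and objects with annotated training data ...
//
//     dlib::shape_predictor_trainer trainer;
//     trainer.set_oversampling_amount(20);
//     trainer.set_nu(0.1);
//     trainer.set_tree_depth(2);
//     trainer.be_verbose();
//     dlib::shape_predictor sp = trainer.train(images, objects);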
// ----------------------------------------------------------------------------------------
@@ -134,16 +318,16 @@ namespace dlib
valid i and j we perform:
sp(images[i], objects[i][j].get_rect())
and compare the result with the truth part positions in objects[i][j]. We
then return the average distance (measured in pixels) between a predicted
part location and its true position.
- if (scales.size() != 0) then
- Each time we compute the distance between a predicted part location and
its true location in objects[i][j] we divide the distance by
scales[i][j]. Therefore, if you want the reported error to be the
average pixel distance then give an empty scales vector, but if you want
the returned value to be something else like the average distance
normalized by some feature of each object (e.g. the interocular distance)
then you can supply those normalizing values via scales.
!*/
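// For example, to normalize by the interocular distance one could build the
// scales argument like this (LEFT_EYE and RIGHT_EYE are hypothetical part
// indices):
//
//     std::vector<std::vector<double> > scales(objects.size());
//     for (unsigned long i = 0; i < objects.size(); ++i)
//         for (unsigned long j = 0; j < objects[i].size(); ++j)
//             scales[i].push_back(dlib::length(objects[i][j].part(LEFT_EYE) -
//                                              objects[i][j].part(RIGHT_EYE)));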
template <