Shape predictor trainer optimizations (#126)

* Shape predictor trainer optimizations * Fixed performance leak in single thread mode & made VS2010 support

Shape predictor trainer optimizations (#126)
* Shape predictor trainer optimizations * Fixed performance leak in single thread mode & made VS2010 support
bbeac285 · Evgeniy Fominov · Davis E. King · 6eb5bd80 · bbeac285 · bbeac285
Commit bbeac285 authored Jul 22, 2016 by Evgeniy Fominov Committed by Davis E. King Jul 22, 2016
3 changed files
--- a/dlib/image_processing/shape_predictor.h
+++ b/dlib/image_processing/shape_predictor.h
@@ -80,7 +80,7 @@ namespace dlib
                i = 0;
                while (i < splits.size())
                {
-                    if (feature_pixel_values[splits[i].idx1] - feature_pixel_values[splits[i].idx2] > splits[i].thresh)
+                    if ((float)feature_pixel_values[splits[i].idx1] - (float)feature_pixel_values[splits[i].idx2] > splits[i].thresh)
                        i = left_child(i);
                    else
                        i = right_child(i);
@@ -235,7 +235,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
-        template <typename image_type>
+        template <typename image_type, typename feature_type>
        void extract_feature_pixel_values (
            const image_type& img_,
            const rectangle& rect,
@@ -243,7 +243,7 @@ namespace dlib
            const matrix<float,0,1>& reference_shape,
            const std::vector<unsigned long>& reference_pixel_anchor_idx,
            const std::vector<dlib::vector<float,2> >& reference_pixel_deltas,
-            std::vector<float>& feature_pixel_values
+            std::vector<feature_type>& feature_pixel_values
        )
        /*!
            requires
@@ -453,6 +453,7 @@ namespace dlib
            _num_test_splits = 20;
            _feature_pool_region_padding = 0;
            _verbose = false;
+            _num_threads = 0;
        }
        unsigned long get_cascade_depth (
@@ -605,6 +606,15 @@ namespace dlib
            _verbose = false;
        }
+        unsigned long get_num_threads (
+        ) const { return _num_threads; }
+        void set_num_threads (
+                unsigned long num
+        )
+        {
+            _num_threads = num;
+        }
        template <typename image_array>
        shape_predictor train (
            const image_array& images,
@@ -661,13 +671,17 @@ namespace dlib
                << "\n\t you can't have a part that is always set to OBJECT_PART_NOT_PRESENT."
            );
+            // Create the thread pool.  If _num_threads <= 1, the trainer does all work in the caller's thread.
+            thread_pool tp(_num_threads > 1 ? _num_threads : 0);
+            // Determine the feature type to use for this kind of image.
+            typedef typename std::remove_const<typename std::remove_reference<decltype(images[0])>::type>::type image_type;
+            typedef typename image_traits<image_type>::pixel_type pixel_type;
+            typedef typename pixel_traits<pixel_type>::basic_pixel_type feature_type;
            rnd.set_seed(get_random_seed());
-            std::vector<training_sample> samples;
+            std::vector<training_sample<feature_type>> samples;
            const matrix<float,0,1> initial_shape = populate_training_sample_shapes(objects, samples);
            const std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape);
@@ -688,17 +702,17 @@ namespace dlib
                // First compute the feature_pixel_values for each training sample at this
                // level of the cascade.
-                for (unsigned long i = 0; i < samples.size(); ++i)
+                parallel_for(tp, 0, samples.size(), [&](unsigned long i)
                {
-                    extract_feature_pixel_values(images[samples[i].image_idx], samples[i].rect,
+                    impl::extract_feature_pixel_values(images[samples[i].image_idx], samples[i].rect,
-                        samples[i].current_shape, initial_shape, anchor_idx,
+                                                 samples[i].current_shape, initial_shape, anchor_idx,
-                        deltas, samples[i].feature_pixel_values);
+                                                 deltas, samples[i].feature_pixel_values);
-                }
+                }, 1);
                // Now start building the trees at this cascade level.
                for (unsigned long i = 0; i < get_num_trees_per_cascade_level(); ++i)
                {
-                    forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade]));
+                    forests[cascade].push_back(make_regression_tree(tp, samples, pixel_coordinates[cascade]));
                    if (_verbose)
                    {
@@ -745,7 +759,8 @@ namespace dlib
            }
        }
-        struct training_sample 
+        template<typename feature_type>
+        struct training_sample
        {
            /*!
@@ -760,15 +775,18 @@ namespace dlib
                - present == 0/1 mask saying which parts of target_shape are present.
                - rect == the position of the object in the image_idx-th image.  All shape
                  coordinates are coded relative to this rectangle.
+                - diff_shape == temporary storage for the difference between the target
+                  shape and the current shape (i.e. target_shape - current_shape)
            !*/
            unsigned long image_idx;
            rectangle rect;
-            matrix<float,0,1> target_shape; 
+            matrix<float,0,1> target_shape;
-            matrix<float,0,1> present; 
+            matrix<float,0,1> present;
-            matrix<float,0,1> current_shape;  
+            matrix<float,0,1> current_shape;
-            std::vector<float> feature_pixel_values;
+            matrix<float,0,1> diff_shape;
+            std::vector<feature_type> feature_pixel_values;
            void swap(training_sample& item)
            {
@@ -777,12 +795,15 @@ namespace dlib
                target_shape.swap(item.target_shape);
                present.swap(item.present);
                current_shape.swap(item.current_shape);
+                diff_shape.swap(item.diff_shape);
                feature_pixel_values.swap(item.feature_pixel_values);
            }
        };
+        template<typename feature_type>
        impl::regression_tree make_regression_tree (
-            std::vector<training_sample>& samples,
+            thread_pool& tp,
+            std::vector<training_sample<feature_type>>& samples,
            const std::vector<dlib::vector<float,2> >& pixel_coordinates
        ) const
        {
@@ -795,19 +816,53 @@ namespace dlib
            // walk the tree in breadth first order
            const unsigned long num_split_nodes = static_cast<unsigned long>(std::pow(2.0, (double)get_tree_depth())-1);
            std::vector<matrix<float,0,1> > sums(num_split_nodes*2+1);
-            for (unsigned long i = 0; i < samples.size(); ++i)
+            if (tp.num_threads_in_pool() > 1)
-                sums[0] += samples[i].target_shape - samples[i].current_shape;
+            {
+                // Here we need to calculate the shape differences and store their sum in sums[0].
+                // To do this, the samples are split into blocks, each block is processed as a
+                // separate parallel task, and the sum of differences for each block is stored
+                // in its own slot in block_sums.
+                const unsigned long num_workers = std::max(1UL, tp.num_threads_in_pool());
+                const unsigned long num =  samples.size();
+                const unsigned long block_size = std::max(1UL, (num + num_workers - 1) / num_workers);
+                std::vector<matrix<float,0,1> > block_sums(num_workers);
+                parallel_for(tp, 0, num_workers, [&](unsigned long block)
+                {
+                    const unsigned long block_begin = block * block_size;
+                    const unsigned long block_end =  std::min(num, block_begin + block_size);
+                    for (unsigned long i = block_begin; i < block_end; ++i)
+                    {
+                        samples[i].diff_shape = samples[i].target_shape - samples[i].current_shape;
+                        block_sums[block] += samples[i].diff_shape;
+                    }
+                }, 1);
-            for (unsigned long i = 0; i < num_split_nodes; ++i) 
+                // now calculate the total result from separate blocks
+                for (unsigned long i = 0; i < block_sums.size(); ++i)
+                    sums[0] += block_sums[i];
+            }
+            else
+            {
+                // synchronous implementation
+                for (unsigned long i = 0; i < samples.size(); ++i)
+                {
+                    samples[i].diff_shape = samples[i].target_shape - samples[i].current_shape;
+                    sums[0] += samples[i].diff_shape;
+                }
+            }
+            for (unsigned long i = 0; i < num_split_nodes; ++i)
            {
                std::pair<unsigned long,unsigned long> range = parts.front();
                parts.pop_front();
-                const impl::split_feature split = generate_split(samples, range.first,
+                const impl::split_feature split = generate_split(tp, samples, range.first,
                    range.second, pixel_coordinates, sums[i], sums[left_child(i)],
                    sums[right_child(i)]);
                tree.splits.push_back(split);
-                const unsigned long mid = partition_samples(split, samples, range.first, range.second); 
+                const unsigned long mid = partition_samples(split, samples, range.first, range.second);
                parts.push_back(std::make_pair(range.first, mid));
                parts.push_back(std::make_pair(mid, range.second));
@@ -821,7 +876,7 @@ namespace dlib
            {
                // Get the present counts for each dimension so we can divide each
                // dimension by the number of observations we have on it to find the mean
-                // displacement in each leaf. 
+                // displacement in each leaf.
                present_counts = 0;
                for (unsigned long j = parts[i].first; j < parts[i].second; ++j)
                    present_counts += samples[j].present;
@@ -833,7 +888,7 @@ namespace dlib
                    tree.leaf_values[i] = zeros_matrix(samples[0].target_shape);
                // now adjust the current shape based on these predictions
-                for (unsigned long j = parts[i].first; j < parts[i].second; ++j)
+                parallel_for(tp, parts[i].first, parts[i].second, [&](unsigned long j)
                {
                    samples[j].current_shape += tree.leaf_values[i];
                    // For parts that aren't present in the training data, we just make
@@ -846,7 +901,7 @@ namespace dlib
                        if (samples[j].present(k) == 0)
                            samples[j].target_shape(k) = samples[j].current_shape(k);
                    }
-                }
+                }, 1);
            }
            return tree;
@@ -873,8 +928,10 @@ namespace dlib
            return feat;
        }
+        template<typename feature_type>
        impl::split_feature generate_split (
-            const std::vector<training_sample>& samples,
+            thread_pool& tp,
+            const std::vector<training_sample<feature_type>>& samples,
            unsigned long begin,
            unsigned long end,
            const std::vector<dlib::vector<float,2> >& pixel_coordinates,
@@ -896,24 +953,33 @@ namespace dlib
            std::vector<matrix<float,0,1> > left_sums(num_test_splits);
            std::vector<unsigned long> left_cnt(num_test_splits);
+            const unsigned long num_workers = std::max(1UL, tp.num_threads_in_pool());
+            const unsigned long block_size = std::max(1UL, (num_test_splits + num_workers - 1) / num_workers);
            // now compute the sums of vectors that go left for each feature
-            matrix<float,0,1> temp;
+            parallel_for(tp, 0, num_workers, [&](unsigned long block)
-            for (unsigned long j = begin; j < end; ++j)
            {
-                temp = samples[j].target_shape-samples[j].current_shape;
+                const unsigned long block_begin = block * block_size;
-                for (unsigned long i = 0; i < num_test_splits; ++i)
+                const unsigned long block_end   = std::min(block_begin + block_size, num_test_splits);
+                for (unsigned long j = begin; j < end; ++j)
                {
-                    if (samples[j].feature_pixel_values[feats[i].idx1] - samples[j].feature_pixel_values[feats[i].idx2] > feats[i].thresh)
+                    for (unsigned long i = block_begin; i < block_end; ++i)
                    {
-                        left_sums[i] += temp;
+                        if ((float)samples[j].feature_pixel_values[feats[i].idx1] - (float)samples[j].feature_pixel_values[feats[i].idx2] > feats[i].thresh)
-                        ++left_cnt[i];
+                        {
+                            left_sums[i] += samples[j].diff_shape;
+                            ++left_cnt[i];
+                        }
                    }
                }
-            }
+            }, 1);
            // now figure out which feature is the best
            double best_score = -1;
            unsigned long best_feat = 0;
+            matrix<float,0,1> temp;
            for (unsigned long i = 0; i < num_test_splits; ++i)
            {
                // check how well the feature splits the space.
@@ -944,9 +1010,10 @@ namespace dlib
            return feats[best_feat];
        }
+        template<typename feature_type>
        unsigned long partition_samples (
            const impl::split_feature& split,
-            std::vector<training_sample>& samples,
+            std::vector<training_sample<feature_type>>& samples,
            unsigned long begin,
            unsigned long end
        ) const
@@ -958,7 +1025,7 @@ namespace dlib
            unsigned long i = begin;
            for (unsigned long j = begin; j < end; ++j)
            {
-                if (samples[j].feature_pixel_values[split.idx1] - samples[j].feature_pixel_values[split.idx2] > split.thresh)
+                if ((float)samples[j].feature_pixel_values[split.idx1] - (float)samples[j].feature_pixel_values[split.idx2] > split.thresh)
                {
                    samples[i].swap(samples[j]);
                    ++i;
@@ -969,9 +1036,10 @@ namespace dlib
+        template<typename feature_type>
        matrix<float,0,1> populate_training_sample_shapes(
            const std::vector<std::vector<full_object_detection> >& objects,
-            std::vector<training_sample>& samples
+            std::vector<training_sample<feature_type>>& samples
        ) const
        {
            samples.clear();
@@ -982,7 +1050,7 @@ namespace dlib
            {
                for (unsigned long j = 0; j < objects[i].size(); ++j)
                {
-                    training_sample sample;
+                    training_sample<feature_type> sample;
                    sample.image_idx = i;
                    sample.rect = objects[i][j].get_rect();
                    object_to_shape(objects[i][j], sample.target_shape, sample.present);
@@ -1099,6 +1167,7 @@ namespace dlib
        unsigned long _num_test_splits;
        double _feature_pool_region_padding;
        bool _verbose;
+        unsigned long _num_threads;
    };
 // ----------------------------------------------------------------------------------------

--- a/dlib/image_processing/shape_predictor_abstract.h
+++ b/dlib/image_processing/shape_predictor_abstract.h
@@ -148,6 +148,7 @@ namespace dlib
                - #get_num_test_splits() == 20
                - #get_feature_pool_region_padding() == 0
                - #get_random_seed() == ""
+                - #get_num_threads() == 0
                - This object will not be verbose
        !*/
@@ -367,6 +368,26 @@ namespace dlib
                - #get_num_test_splits() == num
        !*/
+        unsigned long get_num_threads (
+        ) const;
+        /*!
+            ensures
+                - Returns the number of threads the trainer will use.  Some parts of the training
+                  process are run in parallel using parallel_for() and an internally created thread_pool.
+                  When get_num_threads() is 0 or 1, the trainer will not create threads and all
+                  processing will be done in the calling thread.
+        !*/
+        void set_num_threads (
+            unsigned long num
+        );
+        /*!
+            requires
+                - num >= 0
+            ensures
+                - #get_num_threads() == num
+        !*/
        void be_verbose (
        );
        /*!

--- a/examples/train_shape_predictor_ex.cpp
+++ b/examples/train_shape_predictor_ex.cpp
@@ -39,7 +39,7 @@ std::vector<std::vector<double> > get_interocular_distances (
 // ----------------------------------------------------------------------------------------
 int main(int argc, char** argv)
-{  
+{
    try
    {
        // In this example we are going to train a shape_predictor based on the
@@ -108,6 +108,9 @@ int main(int argc, char** argv)
        trainer.set_nu(0.05);
        trainer.set_tree_depth(2);
+        // Some parts of the training process can be parallelized.
+        // The trainer will use this many threads when possible.
+        trainer.set_num_threads(2);
        // Tell the trainer to print status messages to the console so we can
        // see how long the training will take.