Refactored the interfaces and objects related to object detection so that they can support movable object part models. Now all that needs to be done is to implement the TODO inside the scan_image_pyramid object and the movable part model support should be up and working.

Refactored the interfaces and objects related to object detection so that
they can support movable object part models. Now all that needs to be done is to implement the TODO inside the scan_image_pyramid object and the movable part model support should be up and working.
838caffb · Davis King · 6f57d405 · 838caffb · 838caffb · 838caffb
Commit 838caffb authored Aug 12, 2012 by Davis King
8 changed files
--- a/dlib/image_processing/object_detector_abstract.h
+++ b/dlib/image_processing/object_detector_abstract.h
@@ -157,8 +157,8 @@ namespace dlib
                      minus the threshold, therefore this is a value > 0.
                    - #dets[i].second == the bounding box for the i-th detection.
                - #get_scanner() will have been loaded with img. Therefore, you can call
-                  #get_scanner().get_feature_vector() to obtain the feature vectors for
-                  the resulting object detection boxes.
+                  #get_scanner().get_feature_vector() to obtain the feature vectors or
+                  full_object_detections for the resulting object detection boxes.
                - The detection threshold is adjusted by having adjust_threshold added
                  to it.  Therefore, an adjust_threshold value > 0 makes detecting
                  objects harder while a negative one makes it easier.

--- a/dlib/image_processing/scan_image_pyramid.h
+++ b/dlib/image_processing/scan_image_pyramid.h
@@ -9,6 +9,7 @@
 #include "../image_processing.h"
 #include "../array2d.h"
 #include <vector>
+#include "full_object_detection.h"

 namespace dlib
 {
@@ -52,12 +53,24 @@ namespace dlib

        void add_detection_template (
            const rectangle& object_box,
-            const std::vector<rectangle>& feature_extraction_regions 
+            const std::vector<rectangle>& stationary_feature_extraction_regions,
+            const std::vector<rectangle>& movable_feature_extraction_regions
+        );
+
+        void add_detection_template (
+            const rectangle& object_box,
+            const std::vector<rectangle>& stationary_feature_extraction_regions
        );

        inline unsigned long get_num_detection_templates (
        ) const;

+        inline unsigned long get_num_movable_components_per_detection_template (
+        ) const;
+
+        inline unsigned long get_num_stationary_components_per_detection_template (
+        ) const;
+
        inline unsigned long get_num_components_per_detection_template (
        ) const;

@@ -96,7 +109,13 @@ namespace dlib
        ) const;

        void get_feature_vector (
+            const full_object_detection& obj,
+            feature_vector_type& psi
+        ) const;
+
+        full_object_detection get_feature_vector (
            const rectangle& rect,
+            const feature_vector_type& w,
            feature_vector_type& psi
        ) const;

@@ -129,6 +148,7 @@ namespace dlib
        {
            rectangle object_box; // always centered at (0,0)
            std::vector<rectangle> rects; // template with respect to (0,0)
+            std::vector<rectangle> movable_rects; 
        };

        friend void serialize(const detection_template& item, std::ostream& out)
@@ -394,27 +414,61 @@ namespace dlib
    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
    add_detection_template (
        const rectangle& object_box,
-        const std::vector<rectangle>& feature_extraction_regions 
+        const std::vector<rectangle>& stationary_feature_extraction_regions,
+        const std::vector<rectangle>& movable_feature_extraction_regions
    )
    {
+#ifdef ENABLE_ASSERTS
        // make sure requires clause is not broken
        DLIB_ASSERT((get_num_detection_templates() == 0 || 
-                        get_num_components_per_detection_template() == feature_extraction_regions.size()) &&
+                        (get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() &&
+                        get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size())) &&
                        center(object_box) == point(0,0),
            "\t void scan_image_pyramid::add_detection_template()"
            << "\n\t The number of rects in this new detection template doesn't match "
            << "\n\t the number in previous detection templates."
-            << "\n\t get_num_components_per_detection_template(): " << get_num_components_per_detection_template()
-            << "\n\t feature_extraction_regions.size(): " << feature_extraction_regions.size()
+            << "\n\t get_num_stationary_components_per_detection_template(): " << get_num_stationary_components_per_detection_template()
+            << "\n\t stationary_feature_extraction_regions.size():           " << stationary_feature_extraction_regions.size()
+            << "\n\t get_num_movable_components_per_detection_template():    " << get_num_movable_components_per_detection_template()
+            << "\n\t movable_feature_extraction_regions.size():              " << movable_feature_extraction_regions.size()
            << "\n\t this: " << this
            );

+        for (unsigned long i = 0; i < movable_feature_extraction_regions.size(); ++i)
+        {
+            DLIB_ASSERT(center(movable_feature_extraction_regions[i]) == point(0,0),
+                        "Invalid inputs were given to this function."
+                        << "\n\t center(movable_feature_extraction_regions["<<i<<"]): " << center(movable_feature_extraction_regions[i]) 
+                        << "\n\t this: " << this
+            );
+        }
+#endif
+
        detection_template temp;
        temp.object_box = object_box;
-        temp.rects = feature_extraction_regions;
+        temp.rects = stationary_feature_extraction_regions;
+        temp.movable_rects = movable_feature_extraction_regions;
        det_templates.push_back(temp);
    }

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    add_detection_template (
+        const rectangle& object_box,
+        const std::vector<rectangle>& stationary_feature_extraction_regions
+    )
+    {
+        // an empty set of movable feature regions
+        const std::vector<rectangle> movable_feature_extraction_regions;
+        add_detection_template(object_box, stationary_feature_extraction_regions,
+                               movable_feature_extraction_regions);
+    }
+
 // ----------------------------------------------------------------------------------------

    template <
@@ -428,6 +482,48 @@ namespace dlib
        return det_templates.size();
    }

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_num_stationary_components_per_detection_template (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(get_num_detection_templates() > 0 ,
+            "\t unsigned long scan_image_pyramid::get_num_stationary_components_per_detection_template()"
+            << "\n\t You need to give some detection templates before calling this function. "
+            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
+            << "\n\t this: " << this
+            );
+
+        return det_templates[0].rects.size();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_num_movable_components_per_detection_template (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(get_num_detection_templates() > 0 ,
+            "\t unsigned long scan_image_pyramid::get_num_movable_components_per_detection_template()"
+            << "\n\t You need to give some detection templates before calling this function. "
+            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
+            << "\n\t this: " << this
+            );
+
+        return det_templates[0].movable_rects.size();
+    }
+
 // ----------------------------------------------------------------------------------------

    template <
@@ -446,7 +542,8 @@ namespace dlib
            << "\n\t this: " << this
            );

-        return det_templates[0].rects.size();
+        return get_num_movable_components_per_detection_template() +
+               get_num_stationary_components_per_detection_template();
    }

 // ----------------------------------------------------------------------------------------
@@ -697,25 +794,48 @@ namespace dlib
        typename Pyramid_type,
        typename Feature_extractor_type
        >
-    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    full_object_detection scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
    get_feature_vector (
        const rectangle& rect,
+        const feature_vector_type&,// w,
+        feature_vector_type& psi
+    ) const
+    {
+        // TODO
+        get_feature_vector(full_object_detection(rect), psi);
+        return full_object_detection(rect);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_feature_vector (
+        const full_object_detection& obj,
        feature_vector_type& psi
    ) const
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(get_num_detection_templates() > 0 &&
                    is_loaded_with_image() &&
-                    psi.size() >= get_num_dimensions(), 
+                    psi.size() >= get_num_dimensions() &&
+                    obj.movable_parts.size() == get_num_movable_components_per_detection_template(),
            "\t void scan_image_pyramid::get_feature_vector()"
            << "\n\t Invalid inputs were given to this function "
            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
            << "\n\t is_loaded_with_image(): " << is_loaded_with_image()
            << "\n\t psi.size():             " << psi.size()
            << "\n\t get_num_dimensions():   " << get_num_dimensions()
+            << "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template()
+            << "\n\t obj.movable_parts.size():                            " << obj.movable_parts.size()
            << "\n\t this: " << this
            );

+        const rectangle rect = obj.rect;
+
        pyramid_type pyr;
        rectangle mapped_rect;
        detection_template best_template;

--- a/dlib/image_processing/scan_image_pyramid_abstract.h
+++ b/dlib/image_processing/scan_image_pyramid_abstract.h
--- a/dlib/svm/structural_object_detection_trainer.h
+++ b/dlib/svm/structural_object_detection_trainer.h
@@ -9,6 +9,7 @@
 #include "structural_svm_object_detection_problem.h"
 #include "../image_processing/object_detector.h"
 #include "../image_processing/box_overlap_testing.h"
+#include "../image_processing/full_object_detection.h"


 namespace dlib
@@ -54,6 +55,12 @@ namespace dlib
            auto_overlap_tester = is_same_type<overlap_tester_type,test_box_overlap>::value;
        }

+        const image_scanner_type& get_scanner (
+        ) const
+        {
+            return scanner;
+        }
+
        bool auto_set_overlap_tester (
        ) const 
        { 
@@ -239,29 +246,45 @@ namespace dlib
            >
        const trained_function_type train (
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections
        ) const
        {
+#ifdef ENABLE_ASSERTS
            // make sure requires clause is not broken
-            DLIB_ASSERT(is_learning_problem(images,truth_rects) == true,
-                "\t trained_function_type structural_object_detection_trainer::train(x,y)"
+            DLIB_ASSERT(is_learning_problem(images,truth_object_detections) == true,
+                "\t trained_function_type structural_object_detection_trainer::train()"
                << "\n\t invalid inputs were given to this function"
                << "\n\t images.size():      " << images.size()
-                << "\n\t truth_rects.size(): " << truth_rects.size()
-                << "\n\t is_learning_problem(images,truth_rects): " << is_learning_problem(images,truth_rects)
+                << "\n\t truth_object_detections.size(): " << truth_object_detections.size()
+                << "\n\t is_learning_problem(images,truth_object_detections): " << is_learning_problem(images,truth_object_detections)
                );
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
+            {
+                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
+                {
+                    DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template(),
+                        "\t trained_function_type structural_object_detection_trainer::train()"
+                        << "\n\t invalid inputs were given to this function"
+                        << "\n\t truth_object_detections["<<i<<"]["<<j<<"].movable_parts.size():                " << 
+                            truth_object_detections[i][j].movable_parts.size()
+                        << "\n\t get_scanner().get_num_movable_components_per_detection_template(): " << 
+                            get_scanner().get_num_movable_components_per_detection_template()
+                    );
+                }
+            }
+#endif

            overlap_tester_type local_overlap_tester;

            if (auto_overlap_tester)
            {
-                std::vector<std::vector<rectangle> > mapped_rects(truth_rects.size());
-                for (unsigned long i = 0; i < truth_rects.size(); ++i)
+                std::vector<std::vector<rectangle> > mapped_rects(truth_object_detections.size());
+                for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
                {
-                    mapped_rects[i].resize(truth_rects[i].size());
-                    for (unsigned long j = 0; j < truth_rects[i].size(); ++j)
+                    mapped_rects[i].resize(truth_object_detections[i].size());
+                    for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
                    {
-                        mapped_rects[i][j] = scanner.get_best_matching_rect(truth_rects[i][j]);
+                        mapped_rects[i][j] = scanner.get_best_matching_rect(truth_object_detections[i][j].rect);
                    }
                }

@@ -273,7 +296,7 @@ namespace dlib
            }

            structural_svm_object_detection_problem<image_scanner_type,overlap_tester_type,image_array_type > 
-                svm_prob(scanner, local_overlap_tester, images, truth_rects, num_threads);
+                svm_prob(scanner, local_overlap_tester, images, truth_object_detections, num_threads);

            if (verbose)
                svm_prob.be_verbose();
@@ -293,6 +316,25 @@ namespace dlib
            return object_detector<image_scanner_type,overlap_tester_type>(scanner, local_overlap_tester, w);
        }

+        template <
+            typename image_array_type
+            >
+        const trained_function_type train (
+            const image_array_type& images,
+            const std::vector<std::vector<rectangle> >& truth_object_detections
+        ) const
+        {
+            std::vector<std::vector<full_object_detection> > truth_dets(truth_object_detections.size());
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
+            {
+                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
+                {
+                    truth_dets[i].push_back(full_object_detection(truth_object_detections[i][j]));
+                }
+            }
+
+            return train(images, truth_dets);
+        }

    private:


--- a/dlib/svm/structural_object_detection_trainer_abstract.h
+++ b/dlib/svm/structural_object_detection_trainer_abstract.h
@@ -6,6 +6,7 @@
 #include "structural_svm_object_detection_problem_abstract.h"
 #include "../image_processing/object_detector_abstract.h"
 #include "../image_processing/box_overlap_testing_abstract.h"
+#include "../image_processing/full_object_detection_abstract.h"


 namespace dlib
@@ -60,12 +61,22 @@ namespace dlib
                - #get_loss_per_false_alarm() == 1
                - This object will attempt to learn a model for the given
                  scanner object when train() is called.
+                - #get_scanner() == scanner
+                  (note that only the "configuration" of scanner is copied.
+                  I.e. the copy is done using copy_configuration())
                - if (overlap_tester_type == test_box_overlap) then
                    - #auto_set_overlap_tester() == true
                - else
                    - #auto_set_overlap_tester() == false
        !*/

+        const image_scanner_type& get_scanner (
+        ) const;
+        /*!
+            ensures
+                - returns the image scanner used by this object.  
+        !*/
+
        bool auto_set_overlap_tester (
        ) const;
        /*!
@@ -74,7 +85,7 @@ namespace dlib
                  state for the overlap tester used for non-max suppression.) then
                    - returns true
                    - In this case, it is determined using the find_tight_overlap_tester() 
-                      routine based on the truth_rects given to the 
+                      routine based on the truth_object_detections given to the 
                      structural_object_detection_trainer::train() method.  
                - else
                    - returns false
@@ -276,20 +287,43 @@ namespace dlib
            >
        const trained_function_type train (
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections
        ) const;
        /*!
            requires
-                - is_learning_problem(images, truth_rects) == true
+                - is_learning_problem(images, truth_object_detections) == true
                - it must be valid to pass images[0] into the image_scanner_type::load() method.
                  (also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
+                - for all valid i, j:
+                    - truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template() 
            ensures
                - Uses the structural_svm_object_detection_problem to train an object_detector 
-                  on the given images and truth_rects.  
+                  on the given images and truth_object_detections.  
                - returns a function F with the following properties:
                    - F(new_image) == A prediction of what objects are present in new_image.  This
                      is a set of rectangles indicating their positions.
        !*/
+
+        template <
+            typename image_array_type
+            >
+        const trained_function_type train (
+            const image_array_type& images,
+            const std::vector<std::vector<rectangle> >& truth_object_detections
+        ) const;
+        /*!
+            requires
+                - is_learning_problem(images, truth_object_detections) == true
+                - it must be valid to pass images[0] into the image_scanner_type::load() method.
+                  (also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
+                - get_scanner().get_num_movable_components_per_detection_template() == 0
+            ensures
+                - This function is identical to the above train(), except that it converts 
+                  each element of truth_object_detections into a full_object_detection by 
+                  passing it to full_object_detection's constructor taking only a rectangle.
+                  Therefore, this version of train() is a convenience function for the 
+                  case where you don't have any movable components of the detection templates.
+        !*/
    }; 

 // ----------------------------------------------------------------------------------------

--- a/dlib/svm/structural_svm_object_detection_problem.h
+++ b/dlib/svm/structural_svm_object_detection_problem.h
--- a/dlib/svm/structural_svm_object_detection_problem_abstract.h
+++ b/dlib/svm/structural_svm_object_detection_problem_abstract.h
@@ -6,6 +6,7 @@
 #include "../matrix.h"
 #include "structural_svm_problem_threaded_abstract.h"
 #include <sstream>
+#include "../image_processing/full_object_detection_abstract.h"

 namespace dlib
 {
@@ -81,23 +82,25 @@ namespace dlib
            const image_scanner_type& scanner,
            const overlap_tester_type& overlap_tester,
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects,
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections,
            unsigned long num_threads = 2
        );
        /*!
            requires
-                - is_learning_problem(images, truth_rects)
+                - is_learning_problem(images, truth_object_detections)
                - scanner.get_num_detection_templates() > 0
                - scanner.load(images[0]) must be a valid expression.
+                - for all valid i, j:
+                    - truth_object_detections[i][j].movable_parts.size() == scanner.get_num_movable_components_per_detection_template() 
            ensures
                - This object attempts to learn a mapping from the given images to the 
-                  object locations given in truth_rects.  In particular, it attempts to 
-                  learn to predict truth_rects[i] based on images[i].
+                  object locations given in truth_object_detections.  In particular, it attempts to 
+                  learn to predict truth_object_detections[i] based on images[i].
                  Or in other words, this object can be used to learn a parameter vector, w, such that 
                  an object_detector declared as:
                    object_detector<image_scanner_type,overlap_tester_type> detector(scanner,overlap_tester,w)
                  results in a detector object which attempts to compute the following mapping:
-                    truth_rects[i] == detector(images[i])
+                    truth_object_detections[i].rect == detector(images[i])
                - #get_match_eps() == 0.5
                - This object will use num_threads threads during the optimization 
                  procedure.  You should set this parameter equal to the number of 

--- a/dlib/test/object_detector.cpp
+++ b/dlib/test/object_detector.cpp
@@ -57,6 +57,7 @@ namespace
            detector(images[i], dets2);

            matrix<double,0,1> psi(detector.get_w().size());
+            matrix<double,0,1> psi2(detector.get_w().size());
            const double thresh = detector.get_w()(detector.get_w().size()-1);

            DLIB_TEST(dets.size() == dets2.size());
@@ -65,10 +66,19 @@ namespace
                DLIB_TEST(dets[j] == dets2[j].second);

                psi = 0;
-                detector.get_scanner().get_feature_vector(dets[j], psi);
+                const full_object_detection fdet = detector.get_scanner().get_feature_vector(dets[j], detector.get_w(), psi);

-                const double check_score = dot(psi,detector.get_w()) - thresh;
+                double check_score = dot(psi,detector.get_w()) - thresh;
                DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
+
+
+                // Make sure fdet works the way it is supposed to with get_feature_vector().
+                psi2 = 0;
+                detector.get_scanner().get_feature_vector(fdet, psi2);
+
+                check_score = dot(psi2,detector.get_w()) - thresh;
+                DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
+                DLIB_TEST(max(abs(psi-psi2)) < 1e-10);
            }

        }