Refactored the interfaces and objects related to object detection so that
they can support movable object part models. Now all that needs to be done is to implement the TODO inside the scan_image_pyramid object and the movable part model support should be up and working.

Refactored the interfaces and objects related to object detection so that
they can support movable object part models. Now all that needs to be done is to implement the TODO inside the scan_image_pyramid object and the movable part model support should be up and working.
838caffb · Davis King · 6f57d405 · 838caffb · 838caffb · 838caffb
Commit 838caffb authored Aug 12, 2012 by Davis King
8 changed files
--- a/dlib/image_processing/object_detector_abstract.h
+++ b/dlib/image_processing/object_detector_abstract.h
@@ -157,8 +157,8 @@ namespace dlib
                      minus the threshold, therefore this is a value > 0.
                    - #dets[i].second == the bounding box for the i-th detection.
                - #get_scanner() will have been loaded with img. Therefore, you can call
-                  #get_scanner().get_feature_vector() to obtain the feature vectors for
-                  the resulting object detection boxes.
+                  #get_scanner().get_feature_vector() to obtain the feature vectors or
+                  full_object_detections for the resulting object detection boxes.
                - The detection threshold is adjusted by having adjust_threshold added
                  to it.  Therefore, an adjust_threshold value > 0 makes detecting
                  objects harder while a negative one makes it easier.

--- a/dlib/image_processing/scan_image_pyramid.h
+++ b/dlib/image_processing/scan_image_pyramid.h
@@ -9,6 +9,7 @@
 #include "../image_processing.h"
 #include "../array2d.h"
 #include <vector>
+#include "full_object_detection.h"

 namespace dlib
 {
@@ -52,12 +53,24 @@ namespace dlib

        void add_detection_template (
            const rectangle& object_box,
-            const std::vector<rectangle>& feature_extraction_regions 
+            const std::vector<rectangle>& stationary_feature_extraction_regions,
+            const std::vector<rectangle>& movable_feature_extraction_regions
+        );
+
+        void add_detection_template (
+            const rectangle& object_box,
+            const std::vector<rectangle>& stationary_feature_extraction_regions
        );

        inline unsigned long get_num_detection_templates (
        ) const;

+        inline unsigned long get_num_movable_components_per_detection_template (
+        ) const;
+
+        inline unsigned long get_num_stationary_components_per_detection_template (
+        ) const;
+
        inline unsigned long get_num_components_per_detection_template (
        ) const;

@@ -96,7 +109,13 @@ namespace dlib
        ) const;

        void get_feature_vector (
+            const full_object_detection& obj,
+            feature_vector_type& psi
+        ) const;
+
+        full_object_detection get_feature_vector (
            const rectangle& rect,
+            const feature_vector_type& w,
            feature_vector_type& psi
        ) const;

@@ -129,6 +148,7 @@ namespace dlib
        {
            rectangle object_box; // always centered at (0,0)
            std::vector<rectangle> rects; // template with respect to (0,0)
+            std::vector<rectangle> movable_rects; 
        };

        friend void serialize(const detection_template& item, std::ostream& out)
@@ -394,27 +414,61 @@ namespace dlib
    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
    add_detection_template (
        const rectangle& object_box,
-        const std::vector<rectangle>& feature_extraction_regions 
+        const std::vector<rectangle>& stationary_feature_extraction_regions,
+        const std::vector<rectangle>& movable_feature_extraction_regions
    )
    {
+#ifdef ENABLE_ASSERTS
        // make sure requires clause is not broken
        DLIB_ASSERT((get_num_detection_templates() == 0 || 
-                        get_num_components_per_detection_template() == feature_extraction_regions.size()) &&
+                        (get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() &&
+                        get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size())) &&
                        center(object_box) == point(0,0),
            "\t void scan_image_pyramid::add_detection_template()"
            << "\n\t The number of rects in this new detection template doesn't match "
            << "\n\t the number in previous detection templates."
-            << "\n\t get_num_components_per_detection_template(): " << get_num_components_per_detection_template()
-            << "\n\t feature_extraction_regions.size(): " << feature_extraction_regions.size()
+            << "\n\t get_num_stationary_components_per_detection_template(): " << get_num_stationary_components_per_detection_template()
+            << "\n\t stationary_feature_extraction_regions.size():           " << stationary_feature_extraction_regions.size()
+            << "\n\t get_num_movable_components_per_detection_template():    " << get_num_movable_components_per_detection_template()
+            << "\n\t movable_feature_extraction_regions.size():              " << movable_feature_extraction_regions.size()
            << "\n\t this: " << this
            );

+        for (unsigned long i = 0; i < movable_feature_extraction_regions.size(); ++i)
+        {
+            DLIB_ASSERT(center(movable_feature_extraction_regions[i]) == point(0,0),
+                        "Invalid inputs were given to this function."
+                        << "\n\t center(movable_feature_extraction_regions["<<i<<"]): " << center(movable_feature_extraction_regions[i]) 
+                        << "\n\t this: " << this
+            );
+        }
+#endif
+
        detection_template temp;
        temp.object_box = object_box;
-        temp.rects = feature_extraction_regions;
+        temp.rects = stationary_feature_extraction_regions;
+        temp.movable_rects = movable_feature_extraction_regions;
        det_templates.push_back(temp);
    }

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    add_detection_template (
+        const rectangle& object_box,
+        const std::vector<rectangle>& stationary_feature_extraction_regions
+    )
+    {
+        // an empty set of movable feature regions
+        const std::vector<rectangle> movable_feature_extraction_regions;
+        add_detection_template(object_box, stationary_feature_extraction_regions,
+                               movable_feature_extraction_regions);
+    }
+
 // ----------------------------------------------------------------------------------------

    template <
@@ -428,6 +482,48 @@ namespace dlib
        return det_templates.size();
    }

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_num_stationary_components_per_detection_template (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(get_num_detection_templates() > 0 ,
+            "\t unsigned long scan_image_pyramid::get_num_stationary_components_per_detection_template()"
+            << "\n\t You need to give some detection templates before calling this function. "
+            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
+            << "\n\t this: " << this
+            );
+
+        return det_templates[0].rects.size();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_num_movable_components_per_detection_template (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(get_num_detection_templates() > 0 ,
+            "\t unsigned long scan_image_pyramid::get_num_movable_components_per_detection_template()"
+            << "\n\t You need to give some detection templates before calling this function. "
+            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
+            << "\n\t this: " << this
+            );
+
+        return det_templates[0].movable_rects.size();
+    }
+
 // ----------------------------------------------------------------------------------------

    template <
@@ -446,7 +542,8 @@ namespace dlib
            << "\n\t this: " << this
            );

-        return det_templates[0].rects.size();
+        return get_num_movable_components_per_detection_template() +
+               get_num_stationary_components_per_detection_template();
    }

 // ----------------------------------------------------------------------------------------
@@ -697,25 +794,48 @@ namespace dlib
        typename Pyramid_type,
        typename Feature_extractor_type
        >
-    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    full_object_detection scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
    get_feature_vector (
        const rectangle& rect,
+        const feature_vector_type&,// w,
+        feature_vector_type& psi
+    ) const
+    {
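+        // TODO: w is currently ignored; use w to choose the movable part locations instead of building the result from rect alone.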
+        get_feature_vector(full_object_detection(rect), psi);
+        return full_object_detection(rect);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Pyramid_type,
+        typename Feature_extractor_type
+        >
+    void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
+    get_feature_vector (
+        const full_object_detection& obj,
        feature_vector_type& psi
    ) const
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(get_num_detection_templates() > 0 &&
                    is_loaded_with_image() &&
-                    psi.size() >= get_num_dimensions(), 
+                    psi.size() >= get_num_dimensions() &&
+                    obj.movable_parts.size() == get_num_movable_components_per_detection_template(),
            "\t void scan_image_pyramid::get_feature_vector()"
            << "\n\t Invalid inputs were given to this function "
            << "\n\t get_num_detection_templates(): " << get_num_detection_templates()
            << "\n\t is_loaded_with_image(): " << is_loaded_with_image()
            << "\n\t psi.size():             " << psi.size()
            << "\n\t get_num_dimensions():   " << get_num_dimensions()
+            << "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template()
+            << "\n\t obj.movable_parts.size():                            " << obj.movable_parts.size()
            << "\n\t this: " << this
            );

+        const rectangle rect = obj.rect;
+
        pyramid_type pyr;
        rectangle mapped_rect;
        detection_template best_template;

--- a/dlib/image_processing/scan_image_pyramid_abstract.h
+++ b/dlib/image_processing/scan_image_pyramid_abstract.h
@@ -8,6 +8,7 @@
 #include "../image_processing.h"
 #include "../array2d.h"
 #include <vector>
+#include "full_object_detection_abstract.h"

 namespace dlib
 {
@@ -56,30 +57,39 @@ namespace dlib
                      objects, which associate a vector with each location in an image.

                   2. A detection template.  This is a rectangle which defines the shape of a 
-                      sliding window (the object_box), as well as a set of rectangles which
-                      envelop it.  This set of enveloping rectangles defines the spatial
-                      structure of the overall feature extraction within a sliding window.  
-                      In particular, each location of a sliding window has a feature vector
+                      sliding window (i.e. the object_box), as well as a set of rectangular feature 
+                      extraction regions inside it.  This set of regions defines the spatial 
+                      structure of the overall feature extraction within a sliding window.  In 
+                      particular, each location of a sliding window has a feature vector 
                      associated with it.  This feature vector is defined as follows:
-                        - Let N denote the number of enveloping rectangles.
+                        - Let N denote the number of feature extraction zones.
                        - Let M denote the dimensionality of the vectors output by Feature_extractor_type
                          objects.
                        - Let F(i) == the M dimensional vector which is the sum of all vectors 
-                          given by our Feature_extractor_type object inside the ith enveloping 
-                          rectangle.
+                          given by our Feature_extractor_type object inside the ith feature extraction
+                          zone.
                        - Then the feature vector for a sliding window is an M*N dimensional vector
                          [F(1) F(2) F(3) ... F(N)] (i.e. it is a concatenation of the N vectors).
                          This feature vector can be thought of as a collection of N "bags of features",
-                          each bag coming from a spatial location determined by one of the enveloping 
-                          rectangles. 
+                          each bag coming from a spatial location determined by one of the rectangular
+                          feature extraction zones.
                          
                   3. A weight vector and a threshold value.  The dot product between the weight
                      vector and the feature vector for a sliding window location gives the score 
                      of the window.  If this score is greater than the threshold value then the 
                      window location is output as a detection.

-                Finally, the sliding window classifiers described above are applied to every level 
-                of an image pyramid.  
+                Finally, the sliding window classifiers described above are applied to every level of
+                an image pyramid.  Moreover, some of the feature extraction zones are allowed to move
+                freely within the object box.  This means that when we are sliding the classifier over
+                an image, some feature extraction zones are stationary (i.e. always in the same place
+                relative to the object box) while others are allowed to move anywhere within the object
+                box.  In particular, the movable regions are placed at the locations that maximize the
+                score of the classifier.  Note further that each of the movable feature extraction
+                zones must pass a threshold test for it to be included.  That is, if the score that a
+                movable zone would contribute to the overall score for a sliding window location is not
+                positive then that zone is not included in the feature vector (i.e. its part of the
+                feature vector is set to zero.  This way the length of the feature vector stays constant).

            THREAD SAFETY
                Concurrent access to an instance of this object is not safe and should be protected
@@ -164,30 +174,48 @@ namespace dlib

        void add_detection_template (
            const rectangle& object_box,
-            const std::vector<rectangle>& feature_extraction_regions 
+            const std::vector<rectangle>& stationary_feature_extraction_regions,
+            const std::vector<rectangle>& movable_feature_extraction_regions
        );
        /*!
            requires
-                - center(object_box) == point(0,0),
+                - center(object_box) == point(0,0)
+                - for all valid i:
+                    - center(movable_feature_extraction_regions[i]) == point(0,0)
                - if (get_num_detection_templates() > 0) then
-                    - get_num_components_per_detection_template() == feature_extraction_regions.size()
+                    - get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() 
+                    - get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size() 
                      (i.e. if you already have detection templates in this object, then
                      any new detection template must declare a consistent number of 
                      feature extraction regions)
            ensures
                - Adds another detection template to this object.  In particular, object_box 
-                  defines the size and shape of a sliding window while feature_extraction_regions 
-                  defines the locations for feature extraction as discussed in the WHAT THIS 
-                  OBJECT REPRESENTS section above.  Note also that the locations of the feature 
-                  extraction regions are relative to the object_box.  
+                  defines the size and shape of a sliding window while stationary_feature_extraction_regions 
+                  and movable_feature_extraction_regions define the locations for feature extraction as 
+                  discussed in the WHAT THIS OBJECT REPRESENTS section above.  Note also that the locations of 
+                  the stationary feature extraction regions are relative to the object_box.  
                - #get_num_detection_templates() == get_num_detection_templates() + 1
-                - The order of rectangles in feature_extraction_regions matters.  Recall that
-                  each rectangle gets its own set of features.  So given two different templates, 
-                  their ith rectangles will both share the same part of the weight vector (w) 
-                  supplied to detect().  So there should be some reasonable correspondence 
+                - The order of rectangles in stationary_feature_extraction_regions and
+                  movable_feature_extraction_regions matters.  Recall that each rectangle
+                  gets its own set of features.  So given two different templates, their
+                  ith rectangles will both share the same part of the weight vector (i.e. the w
+                  supplied to detect()).  So there should be some reasonable correspondence
                  between the rectangle ordering in different detection templates.  For,
-                  example, different detection templates should place corresponding 
-                  feature extraction regions in roughly the same part of the object_box.
+                  example, different detection templates should place corresponding feature
+                  extraction regions in roughly the same part of the object_box.
+                - #get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() 
+                - #get_num_movable_components_per_detection_template()    == movable_feature_extraction_regions.size() 
+        !*/
+
+        void add_detection_template (
+            const rectangle& object_box,
+            const std::vector<rectangle>& stationary_feature_extraction_regions
+        );
+        /*!
+            ensures
+                - calls add_detection_template(object_box, stationary_feature_extraction_regions, empty_list)
+                  where empty_list is a vector of size 0.  I.e. this function is just a convenience
+                  routine for adding detection templates with no movable regions.
        !*/

        unsigned long get_num_detection_templates (
@@ -197,16 +225,40 @@ namespace dlib
                - returns the number of detection templates in this object
        !*/

+        unsigned long get_num_stationary_components_per_detection_template (
+        ) const;
+        /*!
+            requires
+                - get_num_detection_templates() > 0
+            ensures
+                - A detection template is a rectangle which defines the shape of a sliding
+                  window (the object_box), as well as a set of rectangles which define
+                  feature extraction zones.  This function returns the number of stationary
+                  feature extraction zones in the detection templates used by this object. 
+        !*/
+
+        unsigned long get_num_movable_components_per_detection_template (
+        ) const;
+        /*!
+            requires
+                - get_num_detection_templates() > 0
+            ensures
+                - A detection template is a rectangle which defines the shape of a sliding
+                  window (the object_box), as well as a set of rectangles which define
+                  feature extraction zones.  This function returns the number of movable 
+                  feature extraction zones in the detection templates used by this object. 
+        !*/
+
        unsigned long get_num_components_per_detection_template (
        ) const;
        /*!
            requires
                - get_num_detection_templates() > 0
            ensures
-                - A detection template is a rectangle which defines the shape of a 
-                  sliding window (the object_box), as well as a set of rectangles which
-                  envelop it.  This function returns the number of enveloping rectangles
-                  in the detection templates used by this object.
+                - returns the total number of feature extraction zones in the detection
+                  templates used by this object.  That is, returns the following:
+                    - get_num_movable_components_per_detection_template() + 
+                      get_num_stationary_components_per_detection_template()
        !*/

        long get_num_dimensions (
@@ -217,7 +269,8 @@ namespace dlib
            ensures
                - returns the number of dimensions in the feature vector for a sliding window
                  location.  This value is the dimensionality of the underlying feature vectors 
-                  produced by Feature_extractor_type times get_num_components_per_detection_template().
+                  produced by Feature_extractor_type times (get_num_stationary_components_per_detection_template() + 
+                  get_num_movable_components_per_detection_template()).
        !*/

        unsigned long get_max_pyramid_levels (
@@ -339,21 +392,45 @@ namespace dlib
        !*/

        void get_feature_vector (
+            const full_object_detection& obj,
+            feature_vector_type& psi
+        ) const;
+        /*!
+            requires
+                - obj.movable_parts.size() == get_num_movable_components_per_detection_template()
+                - is_loaded_with_image() == true
+                - get_num_detection_templates() > 0
+                - psi.size() >= get_num_dimensions()
+                  (i.e. psi must have preallocated its memory before this function is called)
+            ensures
+                - This function allows you to determine the feature vector used for a sliding window location.
+                  Note that this vector is added to psi.
+                - Since scan_image_pyramid is a sliding window classifier system, not all possible rectangles can 
+                  be output by detect().  So in the case where obj.rect could not arise from a call to detect(), this 
+                  function will map obj.rect to the nearest possible object box and then add the feature vector for 
+                  the mapped rectangle into #psi.
+                - get_best_matching_rect(obj.rect) == the rectangle obj.rect gets mapped to for feature extraction.
+        !*/
+
+        full_object_detection get_feature_vector (
            const rectangle& rect,
+            const feature_vector_type& w,
            feature_vector_type& psi
        ) const;
        /*!
            requires
+                - w.size() >= get_num_dimensions()
                - is_loaded_with_image() == true
                - get_num_detection_templates() > 0
                - psi.size() >= get_num_dimensions()
+                  (i.e. psi must have preallocated its memory before this function is called)
            ensures
                - This function allows you to determine the feature vector used for a sliding window location.
                  Note that this vector is added to psi.
                - if (rect was produced by a call to detect(), i.e. rect contains an element of dets) then
                    - #psi == psi + the feature vector corresponding to the sliding window location indicated 
                      by rect.
-                    - Let w denote the w vector given to detect(), then if we assigned psi to 0 before calling
+                    - If w is the w vector given to detect(), then if we assigned 0 to psi before calling
                      get_feature_vector() then we have:
                        - dot(w,#psi) == the score produced by detect() for rect.
                    - get_best_matching_rect(rect) == rect
@@ -363,6 +440,12 @@ namespace dlib
                      function will map rect to the nearest possible object box and then add the feature vector for 
                      the mapped rectangle into #psi.
                    - get_best_matching_rect(rect) == the rectangle rect gets mapped to for feature extraction.
+                - returns a full_object_detection OBJ such that calling get_feature_vector(OBJ,psi)
+                  and get_feature_vector(OBJ.rect,w,psi) on a psi of 0 would both result in the same psi vector being output.
+                  This means that:
+                    - OBJ.rect == rect
+                    - OBJ.movable_parts.size() == get_num_movable_components_per_detection_template()
+                    - OBJ.movable_parts == the locations of the movable parts inside this detection.
        !*/

    };

--- a/dlib/svm/structural_object_detection_trainer.h
+++ b/dlib/svm/structural_object_detection_trainer.h
@@ -9,6 +9,7 @@
 #include "structural_svm_object_detection_problem.h"
 #include "../image_processing/object_detector.h"
 #include "../image_processing/box_overlap_testing.h"
+#include "../image_processing/full_object_detection.h"


 namespace dlib
@@ -54,6 +55,12 @@ namespace dlib
            auto_overlap_tester = is_same_type<overlap_tester_type,test_box_overlap>::value;
        }

+        const image_scanner_type& get_scanner (
+        ) const
+        {
+            return scanner;
+        }
+
        bool auto_set_overlap_tester (
        ) const 
        { 
@@ -239,29 +246,45 @@ namespace dlib
            >
        const trained_function_type train (
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections
        ) const
        {
+#ifdef ENABLE_ASSERTS
            // make sure requires clause is not broken
-            DLIB_ASSERT(is_learning_problem(images,truth_rects) == true,
-                "\t trained_function_type structural_object_detection_trainer::train(x,y)"
+            DLIB_ASSERT(is_learning_problem(images,truth_object_detections) == true,
+                "\t trained_function_type structural_object_detection_trainer::train()"
                << "\n\t invalid inputs were given to this function"
                << "\n\t images.size():      " << images.size()
-                << "\n\t truth_rects.size(): " << truth_rects.size()
-                << "\n\t is_learning_problem(images,truth_rects): " << is_learning_problem(images,truth_rects)
+                << "\n\t truth_object_detections.size(): " << truth_object_detections.size()
+                << "\n\t is_learning_problem(images,truth_object_detections): " << is_learning_problem(images,truth_object_detections)
                );
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
+            {
+                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
+                {
+                    DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template(),
+                        "\t trained_function_type structural_object_detection_trainer::train()"
+                        << "\n\t invalid inputs were given to this function"
+                        << "\n\t truth_object_detections["<<i<<"]["<<j<<"].movable_parts.size():                " << 
+                            truth_object_detections[i][j].movable_parts.size()
+                        << "\n\t get_scanner().get_num_movable_components_per_detection_template(): " << 
+                            get_scanner().get_num_movable_components_per_detection_template()
+                    );
+                }
+            }
+#endif

            overlap_tester_type local_overlap_tester;

            if (auto_overlap_tester)
            {
-                std::vector<std::vector<rectangle> > mapped_rects(truth_rects.size());
-                for (unsigned long i = 0; i < truth_rects.size(); ++i)
+                std::vector<std::vector<rectangle> > mapped_rects(truth_object_detections.size());
+                for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
                {
-                    mapped_rects[i].resize(truth_rects[i].size());
-                    for (unsigned long j = 0; j < truth_rects[i].size(); ++j)
+                    mapped_rects[i].resize(truth_object_detections[i].size());
+                    for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
                    {
-                        mapped_rects[i][j] = scanner.get_best_matching_rect(truth_rects[i][j]);
+                        mapped_rects[i][j] = scanner.get_best_matching_rect(truth_object_detections[i][j].rect);
                    }
                }

@@ -273,7 +296,7 @@ namespace dlib
            }

            structural_svm_object_detection_problem<image_scanner_type,overlap_tester_type,image_array_type > 
-                svm_prob(scanner, local_overlap_tester, images, truth_rects, num_threads);
+                svm_prob(scanner, local_overlap_tester, images, truth_object_detections, num_threads);

            if (verbose)
                svm_prob.be_verbose();
@@ -293,6 +316,25 @@ namespace dlib
            return object_detector<image_scanner_type,overlap_tester_type>(scanner, local_overlap_tester, w);
        }

+        template <
+            typename image_array_type
+            >
+        const trained_function_type train (
+            const image_array_type& images,
+            const std::vector<std::vector<rectangle> >& truth_object_detections
+        ) const
+        {
+            std::vector<std::vector<full_object_detection> > truth_dets(truth_object_detections.size());
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
+            {
+                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
+                {
+                    truth_dets[i].push_back(full_object_detection(truth_object_detections[i][j]));
+                }
+            }
+
+            return train(images, truth_dets);
+        }

    private:


--- a/dlib/svm/structural_object_detection_trainer_abstract.h
+++ b/dlib/svm/structural_object_detection_trainer_abstract.h
@@ -6,6 +6,7 @@
 #include "structural_svm_object_detection_problem_abstract.h"
 #include "../image_processing/object_detector_abstract.h"
 #include "../image_processing/box_overlap_testing_abstract.h"
+#include "../image_processing/full_object_detection_abstract.h"


 namespace dlib
@@ -60,12 +61,22 @@ namespace dlib
                - #get_loss_per_false_alarm() == 1
                - This object will attempt to learn a model for the given
                  scanner object when train() is called.
+                - #get_scanner() == scanner
+                  (note that only the "configuration" of scanner is copied.
+                  I.e. the copy is done using copy_configuration())
                - if (overlap_tester_type == test_box_overlap) then
                    - #auto_set_overlap_tester() == true
                - else
                    - #auto_set_overlap_tester() == false
        !*/

+        const image_scanner_type& get_scanner (
+        ) const;
+        /*!
+            ensures
+                - returns the image scanner used by this object.  
+        !*/
+
        bool auto_set_overlap_tester (
        ) const;
        /*!
@@ -74,7 +85,7 @@ namespace dlib
                  state for the overlap tester used for non-max suppression.) then
                    - returns true
                    - In this case, it is determined using the find_tight_overlap_tester() 
-                      routine based on the truth_rects given to the 
+                      routine based on the truth_object_detections given to the 
                      structural_object_detection_trainer::train() method.  
                - else
                    - returns false
@@ -276,20 +287,43 @@ namespace dlib
            >
        const trained_function_type train (
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections
        ) const;
        /*!
            requires
-                - is_learning_problem(images, truth_rects) == true
+                - is_learning_problem(images, truth_object_detections) == true
                - it must be valid to pass images[0] into the image_scanner_type::load() method.
                  (also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
+                - for all valid i, j:
+                    - truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template() 
            ensures
                - Uses the structural_svm_object_detection_problem to train an object_detector 
-                  on the given images and truth_rects.  
+                  on the given images and truth_object_detections.  
                - returns a function F with the following properties:
                    - F(new_image) == A prediction of what objects are present in new_image.  This
                      is a set of rectangles indicating their positions.
        !*/
+
+        template <
+            typename image_array_type
+            >
+        const trained_function_type train (
+            const image_array_type& images,
+            const std::vector<std::vector<rectangle> >& truth_object_detections
+        ) const;
+        /*!
+            requires
+                - is_learning_problem(images, truth_object_detections) == true
+                - it must be valid to pass images[0] into the image_scanner_type::load() method.
+                  (also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
+                - get_scanner().get_num_movable_components_per_detection_template() == 0
+            ensures
+                - This function is identical to the above train(), except that it converts 
+                  each element of truth_object_detections into a full_object_detection by 
+                  passing it to full_object_detection's constructor taking only a rectangle.
+                  Therefore, this version of train() is a convenience function for the 
+                  case where you don't have any movable components of the detection templates.
+        !*/
    }; 

 // ----------------------------------------------------------------------------------------

--- a/dlib/svm/structural_svm_object_detection_problem.h
+++ b/dlib/svm/structural_svm_object_detection_problem.h
@@ -9,6 +9,7 @@
 #include <sstream>
 #include "../string.h"
 #include "../array.h"
+#include "../image_processing/full_object_detection.h"

 namespace dlib
 {
@@ -37,35 +38,51 @@ namespace dlib
            const image_scanner_type& scanner,
            const overlap_tester_type& overlap_tester,
            const image_array_type& images_,
-            const std::vector<std::vector<rectangle> >& truth_rects_,
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections_,
            unsigned long num_threads = 2
        ) :
            structural_svm_problem_threaded<matrix<double,0,1> >(num_threads),
            boxes_overlap(overlap_tester),
            images(images_),
-            truth_rects(truth_rects_),
+            truth_object_detections(truth_object_detections_),
            match_eps(0.5),
            loss_per_false_alarm(1),
            loss_per_missed_target(1)
        {
+#ifdef ENABLE_ASSERTS
            // make sure requires clause is not broken
-            DLIB_ASSERT(is_learning_problem(images_, truth_rects_) && 
+            DLIB_ASSERT(is_learning_problem(images_, truth_object_detections_) && 
                         scanner.get_num_detection_templates() > 0,
                "\t structural_svm_object_detection_problem::structural_svm_object_detection_problem()"
                << "\n\t Invalid inputs were given to this function "
                << "\n\t scanner.get_num_detection_templates(): " << scanner.get_num_detection_templates()
-                << "\n\t is_learning_problem(images_,truth_rects_): " << is_learning_problem(images_,truth_rects_)
+                << "\n\t is_learning_problem(images_,truth_object_detections_): " << is_learning_problem(images_,truth_object_detections_)
                << "\n\t this: " << this
                );
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
+            {
+                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
+                {
+                    DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == scanner.get_num_movable_components_per_detection_template(),
+                        "\t trained_function_type structural_object_detection_trainer::train()"
+                        << "\n\t invalid inputs were given to this function"
+                        << "\n\t truth_object_detections["<<i<<"]["<<j<<"].movable_parts.size():          " << 
+                            truth_object_detections[i][j].movable_parts.size()
+                        << "\n\t scanner.get_num_movable_components_per_detection_template(): " << 
+                            scanner.get_num_movable_components_per_detection_template()
+                    );
+                }
+            }
+#endif

            scanners.set_max_size(images.size());
            scanners.set_size(images.size());

            max_num_dets = 0;
-            for (unsigned long i = 0; i < truth_rects.size(); ++i)
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
            {
-                if (truth_rects[i].size() > max_num_dets)
-                    max_num_dets = truth_rects[i].size();
+                if (truth_object_detections[i].size() > max_num_dets)
+                    max_num_dets = truth_object_detections[i].size();

                scanners[i].copy_configuration(scanner);
            }
@@ -160,12 +177,12 @@ namespace dlib
            std::vector<rectangle> mapped_rects;

            psi = 0;
-            for (unsigned long i = 0; i < truth_rects[idx].size(); ++i)
+            for (unsigned long i = 0; i < truth_object_detections[idx].size(); ++i)
            {
-                mapped_rects.push_back(scanner.get_best_matching_rect(truth_rects[idx][i]));
-                scanner.get_feature_vector(truth_rects[idx][i], psi);
+                mapped_rects.push_back(scanner.get_best_matching_rect(truth_object_detections[idx][i].rect));
+                scanner.get_feature_vector(truth_object_detections[idx][i], psi);
            }
-            psi(scanner.get_num_dimensions()) = -1.0*truth_rects[idx].size();
+            psi(scanner.get_num_dimensions()) = -1.0*truth_object_detections[idx].size();

            // check if any of the boxes overlap.  If they do then it is impossible for
            // us to learn to correctly classify this sample
@@ -207,8 +224,8 @@ namespace dlib
            // truth rectangles.
            for (unsigned long i = 0; i < mapped_rects.size(); ++i)
            {
-                const double area = (truth_rects[idx][i].intersect(mapped_rects[i])).area();
-                const double total_area = (truth_rects[idx][i] + mapped_rects[i]).area();
+                const double area = (truth_object_detections[idx][i].rect.intersect(mapped_rects[i])).area();
+                const double total_area = (truth_object_detections[idx][i].rect + mapped_rects[i]).area();
                if (area/total_area <= match_eps)
                {
                    using namespace std;
@@ -231,9 +248,9 @@ namespace dlib
                    sout << "image index              "<< idx << endl;
                    sout << "match_eps:               "<< match_eps << endl;
                    sout << "best possible match:     "<< area/total_area << endl;
-                    sout << "truth rect:              "<< truth_rects[idx][i] << endl;
-                    sout << "truth rect width/height: "<< truth_rects[idx][i].width()/(double)truth_rects[idx][i].height() << endl;
-                    sout << "truth rect area:         "<< truth_rects[idx][i].area() << endl;
+                    sout << "truth rect:              "<< truth_object_detections[idx][i].rect << endl;
+                    sout << "truth rect width/height: "<< truth_object_detections[idx][i].rect.width()/(double)truth_object_detections[idx][i].rect.height() << endl;
+                    sout << "truth rect area:         "<< truth_object_detections[idx][i].rect.area() << endl;
                    sout << "nearest detection template rect:              "<< mapped_rects[i] << endl;
                    sout << "nearest detection template rect width/height: "<< mapped_rects[i].width()/(double)mapped_rects[i].height() << endl;
                    sout << "nearest detection template rect area:         "<< mapped_rects[i].area() << endl;
@@ -262,13 +279,13 @@ namespace dlib
            // The loss will measure the number of incorrect detections.  A detection is
            // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection
            // on a truth rectangle.
-            loss = truth_rects[idx].size()*loss_per_missed_target;
+            loss = truth_object_detections[idx].size()*loss_per_missed_target;

            // Measure the loss augmented score for the detections which hit a truth rect.
-            std::vector<double> truth_score_hits(truth_rects[idx].size(), 0);
+            std::vector<double> truth_score_hits(truth_object_detections[idx].size(), 0);

            // keep track of which truth boxes we have hit so far.
-            std::vector<bool> hit_truth_table(truth_rects[idx].size(), false);
+            std::vector<bool> hit_truth_table(truth_object_detections[idx].size(), false);

            std::vector<rectangle> final_dets;
            // The point of this loop is to fill out the truth_score_hits array. 
@@ -277,7 +294,7 @@ namespace dlib
                if (overlaps_any_box(final_dets, dets[i].second))
                    continue;

-                const std::pair<double,unsigned int> truth = find_best_match(truth_rects[idx], dets[i].second);
+                const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);

                final_dets.push_back(dets[i].second);

@@ -285,7 +302,7 @@ namespace dlib
                // if hit truth rect
                if (truth_match > match_eps)
                {
-                    // if this is the first time we have seen a detect which hit truth_rects[truth.second]
+                    // if this is the first time we have seen a detect which hit truth_object_detections[truth.second]
                    const double score = dets[i].first - thresh;
                    if (hit_truth_table[truth.second] == false)
                    {
@@ -311,7 +328,7 @@ namespace dlib
                if (overlaps_any_box(final_dets, dets[i].second))
                    continue;

-                const std::pair<double,unsigned int> truth = find_best_match(truth_rects[idx], dets[i].second);
+                const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);

                const double truth_match = truth.first;
                if (truth_match > match_eps)
@@ -342,27 +359,27 @@ namespace dlib
            psi.set_size(get_num_dimensions());
            psi = 0;
            for (unsigned long i = 0; i < final_dets.size(); ++i)
-                scanner.get_feature_vector(final_dets[i], psi);
+                scanner.get_feature_vector(final_dets[i], current_solution, psi);

            psi(scanner.get_num_dimensions()) = -1.0*final_dets.size();
        }


        bool overlaps_any_box (
-            const std::vector<rectangle>& truth_rects,
+            const std::vector<rectangle>& truth_object_detections,
            const dlib::rectangle& rect
        ) const
        {
-            for (unsigned long i = 0; i < truth_rects.size(); ++i)
+            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
            {
-                if (boxes_overlap(truth_rects[i], rect))
+                if (boxes_overlap(truth_object_detections[i], rect))
                    return true;
            }
            return false;
        }

        std::pair<double,unsigned int> find_best_match(
-            const std::vector<rectangle>& boxes,
+            const std::vector<full_object_detection>& boxes,
            const rectangle rect
        ) const
        /*!
@@ -381,10 +398,10 @@ namespace dlib
            for (unsigned long i = 0; i < boxes.size(); ++i)
            {

-                const unsigned long area = rect.intersect(boxes[i]).area();
+                const unsigned long area = rect.intersect(boxes[i].rect).area();
                if (area != 0)
                {
-                    const double new_match = area / static_cast<double>((rect + boxes[i]).area());
+                    const double new_match = area / static_cast<double>((rect + boxes[i].rect).area());
                    if (new_match > match)
                    {
                        match = new_match;
@@ -411,7 +428,7 @@ namespace dlib
        mutable array<image_scanner_type> scanners;

        const image_array_type& images;
-        const std::vector<std::vector<rectangle> >& truth_rects;
+        const std::vector<std::vector<full_object_detection> >& truth_object_detections;

        unsigned long max_num_dets;
        double match_eps;

--- a/dlib/svm/structural_svm_object_detection_problem_abstract.h
+++ b/dlib/svm/structural_svm_object_detection_problem_abstract.h
@@ -6,6 +6,7 @@
 #include "../matrix.h"
 #include "structural_svm_problem_threaded_abstract.h"
 #include <sstream>
+#include "../image_processing/full_object_detection_abstract.h"

 namespace dlib
 {
@@ -81,23 +82,25 @@ namespace dlib
            const image_scanner_type& scanner,
            const overlap_tester_type& overlap_tester,
            const image_array_type& images,
-            const std::vector<std::vector<rectangle> >& truth_rects,
+            const std::vector<std::vector<full_object_detection> >& truth_object_detections,
            unsigned long num_threads = 2
        );
        /*!
            requires
-                - is_learning_problem(images, truth_rects)
+                - is_learning_problem(images, truth_object_detections)
                - scanner.get_num_detection_templates() > 0
                - scanner.load(images[0]) must be a valid expression.
+                - for all valid i, j:
+                    - truth_object_detections[i][j].movable_parts.size() == scanner.get_num_movable_components_per_detection_template() 
            ensures
                - This object attempts to learn a mapping from the given images to the 
-                  object locations given in truth_rects.  In particular, it attempts to 
-                  learn to predict truth_rects[i] based on images[i].
+                  object locations given in truth_object_detections.  In particular, it attempts to 
+                  learn to predict truth_object_detections[i] based on images[i].
                  Or in other words, this object can be used to learn a parameter vector, w, such that 
                  an object_detector declared as:
                    object_detector<image_scanner_type,overlap_tester_type> detector(scanner,overlap_tester,w)
                  results in a detector object which attempts to compute the following mapping:
-                    truth_rects[i] == detector(images[i])
+                    truth_object_detections[i].rect == detector(images[i])
                - #get_match_eps() == 0.5
                - This object will use num_threads threads during the optimization 
                  procedure.  You should set this parameter equal to the number of 

--- a/dlib/test/object_detector.cpp
+++ b/dlib/test/object_detector.cpp
@@ -57,6 +57,7 @@ namespace
            detector(images[i], dets2);

            matrix<double,0,1> psi(detector.get_w().size());
+            matrix<double,0,1> psi2(detector.get_w().size());
            const double thresh = detector.get_w()(detector.get_w().size()-1);

            DLIB_TEST(dets.size() == dets2.size());
@@ -65,10 +66,19 @@ namespace
                DLIB_TEST(dets[j] == dets2[j].second);

                psi = 0;
-                detector.get_scanner().get_feature_vector(dets[j], psi);
+                const full_object_detection fdet = detector.get_scanner().get_feature_vector(dets[j], detector.get_w(), psi);

-                const double check_score = dot(psi,detector.get_w()) - thresh;
+                double check_score = dot(psi,detector.get_w()) - thresh;
                DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
+
+
+                // Make sure fdet works the way it is supposed to with get_feature_vector().
+                psi2 = 0;
+                detector.get_scanner().get_feature_vector(fdet, psi2);
+
+                check_score = dot(psi2,detector.get_w()) - thresh;
+                DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
+                DLIB_TEST(max(abs(psi-psi2)) < 1e-10);
            }

        }