Commit 954612b6 authored by Davis King's avatar Davis King

Cleaned up the code a bit. Still more cleaning to do.

parent b191400a
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include "../matrix.h" #include "../matrix.h"
#include "../geometry.h" #include "../geometry.h"
#include "../pixel.h" #include "../pixel.h"
#include "../console_progress_indicator.h"
namespace dlib namespace dlib
{ {
...@@ -392,34 +393,131 @@ namespace dlib ...@@ -392,34 +393,131 @@ namespace dlib
!*/ !*/
public: public:
shape_predictor_trainer (
)
{
_cascade_depth = 10;
_tree_depth = 2;
_num_trees_per_cascade_level = 500;
_nu = 0.1;
_oversampling_amount = 20;
_feature_pool_size = 400;
_lambda = 0.1;
_num_test_splits = 20;
_feature_pool_region_padding = 0;
_verbose = false;
}
// Returns the number of cascade levels; train() builds one forest per level.
unsigned long get_cascade_depth () const
{
    return _cascade_depth;
}
unsigned long cascade_depth ( void set_cascade_depth (
) const { return 10; } unsigned long depth
)
{
DLIB_CASSERT(depth > 0, "");
_cascade_depth = depth;
}
unsigned long tree_depth ( unsigned long get_tree_depth (
) const { return 2; } ) const { return _tree_depth; }
unsigned long num_trees_per_cascade_level ( void set_tree_depth (
) const { return 500; } unsigned long depth
)
{
DLIB_CASSERT(depth > 0, "");
_tree_depth = depth;
}
// Returns how many regression trees train() fits at each cascade level.
unsigned long get_num_trees_per_cascade_level () const
{
    return _num_trees_per_cascade_level;
}
// Sets how many regression trees train() fits at each cascade level.
// requires: num > 0 (checked with DLIB_CASSERT).
void set_num_trees_per_cascade_level (
    unsigned long num
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(num > 0,
        "shape_predictor_trainer::set_num_trees_per_cascade_level(): num must be greater than 0.");
    _num_trees_per_cascade_level = num;
}
double get_nu ( double get_nu (
) const { return 0.1; } // the regularizer ) const { return _nu; }
// Sets the value reported by get_nu().
// requires: nu > 0 (checked with DLIB_CASSERT).
void set_nu (
    double nu
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(nu > 0,
        "shape_predictor_trainer::set_nu(): nu must be greater than 0.");
    _nu = nu;
}
std::string random_seed ( std::string get_random_seed (
) const { return "dlib rules"; } ) const { return rnd.get_seed(); }
// Forwards the given seed to the internal random number generator.
void set_random_seed (const std::string& seed)
{
    rnd.set_seed(seed);
}
unsigned long oversampling_amount ( unsigned long get_oversampling_amount (
) const { return 20; } ) const { return _oversampling_amount; }
// Sets how many training samples are generated from each annotated object.
// requires: amount > 0 (checked with DLIB_CASSERT).  The training code also
// uses this value as a modulus, so zero must never be stored.
void set_oversampling_amount (
    unsigned long amount
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(amount > 0,
        "shape_predictor_trainer::set_oversampling_amount(): amount must be greater than 0.");
    _oversampling_amount = amount;
}
// Returns the number of pixel locations sampled into each cascade level's
// feature pool.
unsigned long get_feature_pool_size () const
{
    return _feature_pool_size;
}
// Sets the number of pixel locations sampled into each cascade level's
// feature pool.
// requires: size > 1 (checked with DLIB_CASSERT).
void set_feature_pool_size (
    unsigned long size
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(size > 1,
        "shape_predictor_trainer::set_feature_pool_size(): size must be greater than 1.");
    _feature_pool_size = size;
}
// feature sampling parameters
unsigned long feature_pool_size (
) const { return 400; }// this must be > 1
double get_lambda ( double get_lambda (
) const { return 0.1; } ) const { return _lambda; }
// Sets the value reported by get_lambda().
// requires: lambda > 0 (checked with DLIB_CASSERT).
void set_lambda (
    double lambda
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(lambda > 0,
        "shape_predictor_trainer::set_lambda(): lambda must be greater than 0.");
    _lambda = lambda;
}
unsigned long get_num_test_splits ( unsigned long get_num_test_splits (
) const { return 20; } ) const { return _num_test_splits; }
// Sets the value reported by get_num_test_splits().
// requires: num > 0 (checked with DLIB_CASSERT).
void set_num_test_splits (
    unsigned long num
)
{
    // Name the function and the violated precondition so that a failed check
    // is self-explanatory instead of carrying an empty message.
    DLIB_CASSERT(num > 0,
        "shape_predictor_trainer::set_num_test_splits(): num must be greater than 0.");
    _num_test_splits = num;
}
double get_feature_pool_region_padding ( double get_feature_pool_region_padding (
) const { return 0; } ) const { return _feature_pool_region_padding; }
// Stores the padding value reported by get_feature_pool_region_padding().
// No precondition is checked on the argument.
void set_feature_pool_region_padding (double padding)
{
    _feature_pool_region_padding = padding;
}
// Turns on the progress output that train() writes to std::cout.
void be_verbose ()
{
    _verbose = true;
}
// Turns off the progress output that train() writes to std::cout.
void be_quiet ()
{
    _verbose = false;
}
template <typename image_array> template <typename image_array>
shape_predictor train ( shape_predictor train (
...@@ -430,17 +528,21 @@ namespace dlib ...@@ -430,17 +528,21 @@ namespace dlib
using namespace impl; using namespace impl;
DLIB_CASSERT(images.size() == objects.size() && images.size() > 0, ""); DLIB_CASSERT(images.size() == objects.size() && images.size() > 0, "");
rnd.set_seed(get_random_seed());
std::vector<training_sample> samples; std::vector<training_sample> samples;
const matrix<float,0,1> initial_shape = populate_training_sample_shapes(objects, samples); const matrix<float,0,1> initial_shape = populate_training_sample_shapes(objects, samples);
const std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape); const std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape);
rnd.set_seed(random_seed()); unsigned long trees_fit_so_far = 0;
console_progress_indicator pbar(get_cascade_depth()*get_num_trees_per_cascade_level());
if (_verbose)
std::cout << "Fitting trees..." << std::endl;
std::vector<std::vector<impl::regression_tree> > forests(cascade_depth()); std::vector<std::vector<impl::regression_tree> > forests(get_cascade_depth());
// Now start doing the actual training by filling in the forests // Now start doing the actual training by filling in the forests
for (unsigned long cascade = 0; cascade < cascade_depth(); ++cascade) for (unsigned long cascade = 0; cascade < get_cascade_depth(); ++cascade)
{ {
// TODO, add some verbose option that prints here
// Each cascade uses a different set of pixels for its features. We compute // Each cascade uses a different set of pixels for its features. We compute
// their representations relative to the initial shape first. // their representations relative to the initial shape first.
std::vector<unsigned long> anchor_idx; std::vector<unsigned long> anchor_idx;
...@@ -457,12 +559,21 @@ namespace dlib ...@@ -457,12 +559,21 @@ namespace dlib
} }
// Now start building the trees at this cascade level. // Now start building the trees at this cascade level.
for (unsigned long i = 0; i < num_trees_per_cascade_level(); ++i) for (unsigned long i = 0; i < get_num_trees_per_cascade_level(); ++i)
{ {
forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade])); forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade]));
if (_verbose)
{
++trees_fit_so_far;
pbar.print_status(trees_fit_so_far);
}
} }
} }
if (_verbose)
std::cout << "Training complete " << std::endl;
return shape_predictor(initial_shape, forests, pixel_coordinates); return shape_predictor(initial_shape, forests, pixel_coordinates);
} }
...@@ -488,7 +599,7 @@ namespace dlib ...@@ -488,7 +599,7 @@ namespace dlib
/*! /*!
CONVENTION CONVENTION
- feature_pixel_values.size() == feature_pool_size() - feature_pixel_values.size() == get_feature_pool_size()
- feature_pixel_values[j] == the value of the j-th feature pool - feature_pixel_values[j] == the value of the j-th feature pool
pixel when you look it up relative to the shape in current_shape. pixel when you look it up relative to the shape in current_shape.
...@@ -527,7 +638,7 @@ namespace dlib ...@@ -527,7 +638,7 @@ namespace dlib
impl::regression_tree tree; impl::regression_tree tree;
// walk the tree in breadth first order // walk the tree in breadth first order
const unsigned long num_split_nodes = static_cast<unsigned long>(std::pow(2.0, (double)tree_depth())-1); const unsigned long num_split_nodes = static_cast<unsigned long>(std::pow(2.0, (double)get_tree_depth())-1);
std::vector<matrix<float,0,1> > sums(num_split_nodes*2+1); std::vector<matrix<float,0,1> > sums(num_split_nodes*2+1);
for (unsigned long i = 0; i < samples.size(); ++i) for (unsigned long i = 0; i < samples.size(); ++i)
sums[0] += samples[i].target_shape - samples[i].current_shape; sums[0] += samples[i].target_shape - samples[i].current_shape;
...@@ -574,8 +685,8 @@ namespace dlib ...@@ -574,8 +685,8 @@ namespace dlib
double accept_prob; double accept_prob;
do do
{ {
feat.idx1 = rnd.get_random_32bit_number()%feature_pool_size(); feat.idx1 = rnd.get_random_32bit_number()%get_feature_pool_size();
feat.idx2 = rnd.get_random_32bit_number()%feature_pool_size(); feat.idx2 = rnd.get_random_32bit_number()%get_feature_pool_size();
const double dist = length(pixel_coordinates[feat.idx1]-pixel_coordinates[feat.idx2]); const double dist = length(pixel_coordinates[feat.idx1]-pixel_coordinates[feat.idx2]);
accept_prob = std::exp(-dist/lambda); accept_prob = std::exp(-dist/lambda);
} }
...@@ -699,7 +810,7 @@ namespace dlib ...@@ -699,7 +810,7 @@ namespace dlib
sample.image_idx = i; sample.image_idx = i;
sample.rect = objects[i][j].get_rect(); sample.rect = objects[i][j].get_rect();
sample.target_shape = object_to_shape(objects[i][j]); sample.target_shape = object_to_shape(objects[i][j]);
for (unsigned long itr = 0; itr < oversampling_amount(); ++itr) for (unsigned long itr = 0; itr < get_oversampling_amount(); ++itr)
samples.push_back(sample); samples.push_back(sample);
mean_shape += sample.target_shape; mean_shape += sample.target_shape;
++count; ++count;
...@@ -711,7 +822,7 @@ namespace dlib ...@@ -711,7 +822,7 @@ namespace dlib
// now go pick random initial shapes // now go pick random initial shapes
for (unsigned long i = 0; i < samples.size(); ++i) for (unsigned long i = 0; i < samples.size(); ++i)
{ {
if ((i%oversampling_amount()) == 0) if ((i%get_oversampling_amount()) == 0)
{ {
// The mean shape is what we really use as an initial shape so always // The mean shape is what we really use as an initial shape so always
// include it in the training set as an example starting shape. // include it in the training set as an example starting shape.
...@@ -742,13 +853,13 @@ namespace dlib ...@@ -742,13 +853,13 @@ namespace dlib
) const ) const
/*! /*!
ensures ensures
- #pixel_coordinates.size() == feature_pool_size() - #pixel_coordinates.size() == get_feature_pool_size()
- for all valid i: - for all valid i:
- pixel_coordinates[i] == a point in the box defined by the min/max x/y arguments. - pixel_coordinates[i] == a point in the box defined by the min/max x/y arguments.
!*/ !*/
{ {
pixel_coordinates.resize(feature_pool_size()); pixel_coordinates.resize(get_feature_pool_size());
for (unsigned long i = 0; i < feature_pool_size(); ++i) for (unsigned long i = 0; i < get_feature_pool_size(); ++i)
{ {
pixel_coordinates[i].x() = rnd.get_random_double()*(max_x-min_x) + min_x; pixel_coordinates[i].x() = rnd.get_random_double()*(max_x-min_x) + min_x;
pixel_coordinates[i].y() = rnd.get_random_double()*(max_y-min_y) + min_y; pixel_coordinates[i].y() = rnd.get_random_double()*(max_y-min_y) + min_y;
...@@ -769,15 +880,26 @@ namespace dlib ...@@ -769,15 +880,26 @@ namespace dlib
const double max_y = max(colm(temp,1))+padding; const double max_y = max(colm(temp,1))+padding;
std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates; std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates;
pixel_coordinates.resize(cascade_depth()); pixel_coordinates.resize(get_cascade_depth());
for (unsigned long i = 0; i < cascade_depth(); ++i) for (unsigned long i = 0; i < get_cascade_depth(); ++i)
randomly_sample_pixel_coordinates(pixel_coordinates[i], min_x, min_y, max_x, max_y); randomly_sample_pixel_coordinates(pixel_coordinates[i], min_x, min_y, max_x, max_y);
return pixel_coordinates; return pixel_coordinates;
} }
mutable dlib::rand rnd; mutable dlib::rand rnd;
// Training parameters.  Each one is exposed through the matching
// get_*()/set_*() member function.

// Number of cascade levels (one forest of trees per level).
unsigned long _cascade_depth;
// Depth of each regression tree.
unsigned long _tree_depth;
// Number of trees fit at each cascade level.
unsigned long _num_trees_per_cascade_level;
double _nu;
// Number of training samples generated per annotated object.
unsigned long _oversampling_amount;
// Number of pixel locations sampled per cascade level.
unsigned long _feature_pool_size;
double _lambda;
unsigned long _num_test_splits;
// Stored as double because both the getter and the setter for this value
// work in double; an integral member would silently truncate or wrap
// fractional and negative paddings.
double _feature_pool_region_padding;
// True when train() should print progress to std::cout.
bool _verbose;
}; };
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -16,6 +16,18 @@ namespace dlib ...@@ -16,6 +16,18 @@ namespace dlib
class shape_predictor class shape_predictor
{ {
/*! /*!
WHAT THIS OBJECT REPRESENTS
This object is a tool that takes in an image region containing some object
and outputs a "shape" or set of point locations that define the pose of the
object. The classic example of this is human face pose prediction, where
you take an image of a human face as input and are expected to identify the
locations of important facial landmarks such as the corners of the mouth
and eyes, tip of the nose, and so forth.
To create useful instantiations of this object you need to use the
shape_predictor_trainer object defined below to train a shape_predictor
using a set of training images, each annotated with shapes you want to
predict.
!*/ !*/
public: public:
...@@ -23,13 +35,15 @@ namespace dlib ...@@ -23,13 +35,15 @@ namespace dlib
shape_predictor ( shape_predictor (
); );
/*! /*!
ensures
- #num_parts() == 0
!*/ !*/
unsigned long num_parts ( unsigned long num_parts (
) const; ) const;
/*! /*!
ensures ensures
- returns the number of points in the shape - returns the number of parts in the shapes predicted by this object.
!*/ !*/
template <typename image_type> template <typename image_type>
...@@ -42,10 +56,18 @@ namespace dlib ...@@ -42,10 +56,18 @@ namespace dlib
- image_type == an image object that implements the interface defined in - image_type == an image object that implements the interface defined in
dlib/image_processing/generic_image.h dlib/image_processing/generic_image.h
ensures ensures
- runs the tree regressor on the detection rect inside img and returns a - Runs the shape prediction algorithm on the part of the image contained in
full_object_detection DET such that: the given bounding rectangle. So it will try and fit the shape model to
the contents of the given rectangle in the image. For example, if there
is a human face inside the rectangle and you use a face landmarking shape
model then this function will return the locations of the face landmarks
as the parts. So the return value is a full_object_detection DET such
that:
- DET.get_rect() == rect - DET.get_rect() == rect
- DET.num_parts() == num_parts() - DET.num_parts() == num_parts()
- for all valid i:
- DET.part(i) == the location in img for the i-th part of the shape
predicted by this object.
!*/ !*/
}; };
...@@ -61,46 +83,208 @@ namespace dlib ...@@ -61,46 +83,208 @@ namespace dlib
class shape_predictor_trainer class shape_predictor_trainer
{ {
/*! /*!
This thing really only works with unsigned char or rgb_pixel images (since we assume the threshold WHAT THIS OBJECT REPRESENTS
should be in the range [-128,128]). This object is a tool for training shape_predictors based on annotated training
images. Its implementation uses the algorithm described in:
One Millisecond Face Alignment with an Ensemble of Regression Trees
by Vahid Kazemi and Josephine Sullivan, CVPR 2014
!*/ !*/
public: public:
unsigned long cascade_depth ( shape_predictor_trainer (
) const { return 10; } )
{
_cascade_depth = 10;
_tree_depth = 2;
_num_trees_per_cascade_level = 500;
_nu = 0.1;
_oversampling_amount = 20;
_feature_pool_size = 400;
_lambda = 0.1;
_num_test_splits = 20;
_feature_pool_region_padding = 0;
_verbose = false;
}
unsigned long get_cascade_depth (
) const;
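/*!
ensures
- returns the number of cascade levels that train() builds.  train() fits
get_num_trees_per_cascade_level() regression trees at each level.
!*/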
unsigned long tree_depth ( void set_cascade_depth (
) const { return 2; } unsigned long depth
);
/*!
requires
- depth > 0
ensures
- #get_cascade_depth() == depth
!*/
unsigned long num_trees_per_cascade_level ( unsigned long get_tree_depth (
) const { return 500; } ) const;
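/*!
ensures
- returns the depth of each regression tree built by train().  A tree of
depth D contains pow(2,D)-1 split nodes.
!*/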
void set_tree_depth (
unsigned long depth
);
/*!
requires
- depth > 0
ensures
- #get_tree_depth() == depth
!*/
unsigned long get_num_trees_per_cascade_level (
) const;
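/*!
ensures
- returns the number of regression trees that train() fits at each level of
the cascade.  The total number of trees in a trained model is therefore
get_cascade_depth()*get_num_trees_per_cascade_level().
!*/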
void set_num_trees_per_cascade_level (
unsigned long num
);
/*!
requires
- num > 0
ensures
- #get_num_trees_per_cascade_level() == num
!*/
double get_nu ( double get_nu (
) const { return 0.1; } // the regularizer ) const;
/*!
!*/
std::string random_seed ( void set_nu (
) const { return "dlib rules"; } double nu
);
/*!
requires
- nu > 0
ensures
- #get_nu() == nu
!*/
unsigned long oversampling_amount ( std::string get_random_seed (
) const { return 20; } ) const;
/*!
ensures
- returns the seed of the random number generator this object uses during
training.
!*/
void set_random_seed (
const std::string& seed
);
/*!
ensures
- #get_random_seed() == seed
!*/
unsigned long get_oversampling_amount (
) const;
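/*!
ensures
- returns the number of training samples train() creates from each annotated
object in the training data.  The first of these samples starts from the
mean shape and the remaining ones start from randomly picked initial
shapes.
!*/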
void set_oversampling_amount (
unsigned long amount
);
/*!
requires
- amount > 0
ensures
- #get_oversampling_amount() == amount
!*/
unsigned long get_feature_pool_size (
) const;
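/*!
ensures
- returns the number of pixel locations that are randomly sampled at each
cascade level.  The pixel pairs used as features are drawn from this pool.
!*/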
void set_feature_pool_size (
unsigned long size
);
/*!
requires
- size > 1
ensures
- #get_feature_pool_size() == size
!*/
// feature sampling parameters
unsigned long feature_pool_size (
) const { return 400; }// this must be > 1
double get_lambda ( double get_lambda (
) const { return 0.1; } ) const;
/*!
!*/
void set_lambda (
double lambda
);
/*!
requires
- lambda > 0
ensures
- #get_lambda() == lambda
!*/
unsigned long get_num_test_splits ( unsigned long get_num_test_splits (
) const { return 20; } ) const;
/*!
!*/
void set_num_test_splits (
unsigned long num
);
/*!
requires
- num > 0
ensures
- #get_num_test_splits() == num
!*/
double get_feature_pool_region_padding ( double get_feature_pool_region_padding (
) const { return 0; } ) const;
/*!
!*/
void set_feature_pool_region_padding (
double padding
);
/*!
ensures
- #get_feature_pool_region_padding() == padding
!*/
void be_verbose (
);
/*!
ensures
- This object will print status messages to standard out so that a
user can observe the progress of the algorithm.
!*/
void be_quiet (
);
/*!
ensures
- this object will not print anything to standard out
!*/
template <typename image_array> template <typename image_array>
shape_predictor train ( shape_predictor train (
const image_array& images, const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects const std::vector<std::vector<full_object_detection> >& objects
) const; ) const;
/*!
requires
- images.size() == objects.size()
- images.size() > 0
ensures
- This object will try to learn to predict the locations of an object's parts
based on the object bounding box (i.e. full_object_detection::get_rect())
and the image pixels in that box. That is, we will try to learn a
shape_predictor, SP, such that:
SP(images[i], objects[i][j].get_rect()) == objects[i][j]
This learned SP object is then returned.
!*/
}; };
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -133,17 +317,17 @@ namespace dlib ...@@ -133,17 +317,17 @@ namespace dlib
checking how well it recovers the part positions. In particular, for all checking how well it recovers the part positions. In particular, for all
valid i and j we perform: valid i and j we perform:
sp(images[i], objects[i][j].get_rect()) sp(images[i], objects[i][j].get_rect())
and compare the result with the truth part positions in objects[i][j]. We and compare the result with the truth part positions in objects[i][j]. We
then return the average distance between a predicted part location and its then return the average distance (measured in pixels) between a predicted
true position. This value is then returned. part location and its true position.
- if (scales.size() != 0) then - if (scales.size() != 0) then
- Each time we compute the distance between a predicted part location and - Each time we compute the distance between a predicted part location and
its true location in objects[i][j] we divide the distance by its true location in objects[i][j] we divide the distance by
scales[i][j]. Therefore, if you want the reported error to be the scales[i][j]. Therefore, if you want the reported error to be the
average pixel distance then give an empty scales vector, but if you want average pixel distance then give an empty scales vector, but if you want
the returned value to be something else like the average distance the returned value to be something else like the average distance
normalized by some feature of the objects (e.g. the interocular distance) normalized by some feature of each object (e.g. the interocular distance)
then you an supply those normalizing values via scales. then you can supply those normalizing values via scales.
!*/ !*/
template < template <
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment