Commit 15240203 authored by Davis King

Added image shape prediction code. It works well, just need to fill out the
spec and add asserts.

parent 5de87932
@@ -14,6 +14,7 @@
#include "image_processing/scan_image_custom.h"
#include "image_processing/remove_unobtainable_rectangles.h"
#include "image_processing/scan_fhog_pyramid.h"
#include "image_processing/shape_predictor.h"
#endif // DLIB_IMAGE_PROCESSInG_H_h_
// Copyright (C) 2014 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_RENDER_FACE_DeTECTIONS_H_
#define DLIB_RENDER_FACE_DeTECTIONS_H_
#include "full_object_detection.h"
#include "../gui_widgets.h"
#include "render_face_detections_abstract.h"
#include <vector>
namespace dlib
{
inline std::vector<image_window::overlay_line> render_face_detections (
const std::vector<full_object_detection>& dets,
const rgb_pixel color = rgb_pixel(0,255,0)
)
{
std::vector<image_window::overlay_line> lines;
for (unsigned long i = 0; i < dets.size(); ++i)
{
const full_object_detection& d = dets[i];
// Around the chin, ear to ear
for (unsigned long j = 1; j <= 16; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
// Line on top of the nose
for (unsigned long j = 28; j <= 30; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
// Left eyebrow
for (unsigned long j = 18; j <= 21; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
// Right eyebrow
for (unsigned long j = 23; j <= 26; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
// Bottom part of the nose
for (unsigned long j = 31; j <= 35; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
// Line from the nose to the bottom part above
lines.push_back(image_window::overlay_line(d.part(30), d.part(35), color));
// Left eye
for (unsigned long j = 37; j <= 41; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
lines.push_back(image_window::overlay_line(d.part(36), d.part(41), color));
// Right eye
for (unsigned long j = 43; j <= 47; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
lines.push_back(image_window::overlay_line(d.part(42), d.part(47), color));
// Lips outer part
for (unsigned long j = 49; j <= 59; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
lines.push_back(image_window::overlay_line(d.part(48), d.part(59), color));
// Lips inner part
for (unsigned long j = 61; j <= 67; ++j)
lines.push_back(image_window::overlay_line(d.part(j), d.part(j-1), color));
lines.push_back(image_window::overlay_line(d.part(60), d.part(67), color));
}
return lines;
}
}
#endif // DLIB_RENDER_FACE_DeTECTIONS_H_
// Copyright (C) 2014 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_RENDER_FACE_DeTECTIONS_ABSTRACT_H_
#ifdef DLIB_RENDER_FACE_DeTECTIONS_ABSTRACT_H_
#include "full_object_detection_abstract.h"
#include "../gui_widgets.h"
namespace dlib
{
inline std::vector<image_window::overlay_line> render_face_detections (
const std::vector<full_object_detection>& dets,
const rgb_pixel color = rgb_pixel(0,255,0)
);
/*!
requires
- for all valid i:
- dets[i].num_parts() == 68
ensures
- Interprets the given objects as face detections with parts annotated using
the 68-point iBUG face landmark scheme and returns a set of overlay lines
that, when drawn on the screen, trace the outlines of the facial features
defined by those part locations.
- returns a single vector containing the lines needed to render all the
detections in dets.
!*/
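// A minimal usage sketch (illustrative only, not part of the spec above). It
// assumes an already loaded image "img", a face detector "detector" that
// returns rectangles, and a trained shape_predictor "sp"; all of these names
// are hypothetical.
//
//     image_window win(img);
//     std::vector<full_object_detection> dets;
//     const std::vector<rectangle> faces = detector(img);
//     for (unsigned long i = 0; i < faces.size(); ++i)
//         dets.push_back(sp(img, faces[i]));
//     win.add_overlay(render_face_detections(dets));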
}
#endif // DLIB_RENDER_FACE_DeTECTIONS_ABSTRACT_H_
// Copyright (C) 2014 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_SHAPE_PREDICToR_H_
#define DLIB_SHAPE_PREDICToR_H_
#include "shape_predictor_abstract.h"
#include "full_object_detection.h"
#include "../algs.h"
#include "../matrix.h"
#include "../geometry.h"
#include "../pixel.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace impl
{
struct split_feature
{
unsigned long idx1;
unsigned long idx2;
float thresh;
friend inline void serialize (const split_feature& item, std::ostream& out)
{
dlib::serialize(item.idx1, out);
dlib::serialize(item.idx2, out);
dlib::serialize(item.thresh, out);
}
friend inline void deserialize (split_feature& item, std::istream& in)
{
dlib::deserialize(item.idx1, in);
dlib::deserialize(item.idx2, in);
dlib::deserialize(item.thresh, in);
}
};
// A tree's split nodes are stored in a std::vector<impl::split_feature> laid out as
// an implicit, fully filled binary tree. These functions navigate between its nodes.
inline unsigned long left_child (unsigned long idx) { return 2*idx + 1; }
/*!
ensures
- returns the index of the left child of the binary tree node idx
!*/
inline unsigned long right_child (unsigned long idx) { return 2*idx + 2; }
/*!
ensures
- returns the index of the right child of the binary tree node idx
!*/
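// For example, with two levels of splits a tree stores 3 split nodes at
// indices 0,1,2 and has 4 leaves. Starting at node 0, left_child(0) == 1,
// right_child(0) == 2, left_child(1) == 3, and so on. A traversal that walks
// past the end of the splits vector stops at some index i >= splits.size(),
// and the matching leaf is leaf_values[i - splits.size()] (see
// regression_tree::operator() below).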
struct regression_tree
{
std::vector<split_feature> splits;
std::vector<matrix<float,0,1> > leaf_values;
inline const matrix<float,0,1>& operator()(
const std::vector<float>& feature_pixel_values
) const
/*!
requires
- All the index values in splits are less than feature_pixel_values.size()
- leaf_values.size() is a power of 2.
(i.e. we require a tree with all the levels fully filled out.)
- leaf_values.size() == splits.size()+1
(i.e. there needs to be the right number of leaves given the number of splits in the tree)
ensures
- runs through the tree and returns the vector at the leaf we end up in.
!*/
{
unsigned long i = 0;
while (i < splits.size())
{
if (feature_pixel_values[splits[i].idx1] - feature_pixel_values[splits[i].idx2] > splits[i].thresh)
i = left_child(i);
else
i = right_child(i);
}
return leaf_values[i - splits.size()];
}
friend void serialize (const regression_tree& item, std::ostream& out)
{
dlib::serialize(item.splits, out);
dlib::serialize(item.leaf_values, out);
}
friend void deserialize (regression_tree& item, std::istream& in)
{
dlib::deserialize(item.splits, in);
dlib::deserialize(item.leaf_values, in);
}
};
// ------------------------------------------------------------------------------------
inline vector<float,2> location (
const matrix<float,0,1>& shape,
unsigned long idx
)
/*!
requires
- idx < shape.size()/2
- shape.size()%2 == 0
ensures
- returns the idx-th point from the shape vector.
!*/
{
return vector<float,2>(shape(idx*2), shape(idx*2+1));
}
// ------------------------------------------------------------------------------------
inline unsigned long nearest_shape_point (
const matrix<float,0,1>& shape,
const dlib::vector<float,2>& pt
)
{
// find the nearest part of the shape to this pixel
float best_dist = std::numeric_limits<float>::infinity();
const unsigned long num_shape_parts = shape.size()/2;
unsigned long best_idx = 0;
for (unsigned long j = 0; j < num_shape_parts; ++j)
{
const float dist = length_squared(location(shape,j)-pt);
if (dist < best_dist)
{
best_dist = dist;
best_idx = j;
}
}
return best_idx;
}
// ------------------------------------------------------------------------------------
inline void create_shape_relative_encoding (
const matrix<float,0,1>& shape,
const std::vector<dlib::vector<float,2> >& pixel_coordinates,
std::vector<unsigned long>& anchor_idx,
std::vector<dlib::vector<float,2> >& deltas
)
/*!
requires
- shape.size()%2 == 0
- shape.size() > 0
ensures
- #anchor_idx.size() == pixel_coordinates.size()
- #deltas.size() == pixel_coordinates.size()
- for all valid i:
- pixel_coordinates[i] == location(shape,#anchor_idx[i]) + #deltas[i]
!*/
{
anchor_idx.resize(pixel_coordinates.size());
deltas.resize(pixel_coordinates.size());
for (unsigned long i = 0; i < pixel_coordinates.size(); ++i)
{
anchor_idx[i] = nearest_shape_point(shape, pixel_coordinates[i]);
deltas[i] = pixel_coordinates[i] - location(shape,anchor_idx[i]);
}
}
// ------------------------------------------------------------------------------------
inline point_transform_affine find_tform_between_shapes (
const matrix<float,0,1>& from_shape,
const matrix<float,0,1>& to_shape
)
{
DLIB_CASSERT(from_shape.size() == to_shape.size() && (from_shape.size()%2) == 0 && from_shape.size() > 0,"");
std::vector<vector<float,2> > from_points, to_points;
const unsigned long num = from_shape.size()/2;
from_points.reserve(num);
to_points.reserve(num);
if (num == 1)
{
// Just use an identity transform if there is only one landmark.
return point_transform_affine();
}
for (unsigned long i = 0; i < num; ++i)
{
from_points.push_back(location(from_shape,i));
to_points.push_back(location(to_shape,i));
}
return find_similarity_transform(from_points, to_points);
}
// ------------------------------------------------------------------------------------
inline point_transform_affine normalizing_tform (
const rectangle& rect
)
/*!
ensures
- returns a transform that maps rect.tl_corner() to (0,0) and rect.br_corner()
to (1,1).
!*/
{
std::vector<vector<float,2> > from_points, to_points;
from_points.push_back(rect.tl_corner()); to_points.push_back(point(0,0));
from_points.push_back(rect.tr_corner()); to_points.push_back(point(1,0));
from_points.push_back(rect.br_corner()); to_points.push_back(point(1,1));
return find_similarity_transform(from_points, to_points);
}
// ------------------------------------------------------------------------------------
inline point_transform_affine unnormalizing_tform (
const rectangle& rect
)
/*!
ensures
- returns a transform that maps (0,0) to rect.tl_corner() and (1,1) to
rect.br_corner().
!*/
{
std::vector<vector<float,2> > from_points, to_points;
to_points.push_back(rect.tl_corner()); from_points.push_back(point(0,0));
to_points.push_back(rect.tr_corner()); from_points.push_back(point(1,0));
to_points.push_back(rect.br_corner()); from_points.push_back(point(1,1));
return find_similarity_transform(from_points, to_points);
}
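// For example, for a square detection box such as rectangle(10,20,110,120)
// the normalizing transform maps (10,20) to (0,0) and (110,120) to (1,1),
// and unnormalizing_tform(rect) maps those unit-square coordinates back into
// the image (for non-square boxes the similarity fit is a least squares
// approximation). All the shape vectors used below live in this normalized
// coordinate system and are only mapped back to image coordinates at the
// very end. The particular rectangle here is just an illustration.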
// ------------------------------------------------------------------------------------
template <typename image_type>
void extract_feature_pixel_values (
const image_type& img_,
const rectangle& rect,
const matrix<float,0,1>& current_shape,
const matrix<float,0,1>& reference_shape,
const std::vector<unsigned long>& reference_pixel_anchor_idx,
const std::vector<dlib::vector<float,2> >& reference_pixel_deltas,
std::vector<float>& feature_pixel_values
)
/*!
requires
- image_type == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
- reference_pixel_anchor_idx.size() == reference_pixel_deltas.size()
- current_shape.size() == reference_shape.size()
- reference_shape.size()%2 == 0
- max(mat(reference_pixel_anchor_idx)) < reference_shape.size()/2
ensures
- #feature_pixel_values.size() == reference_pixel_deltas.size()
- for all valid i:
- #feature_pixel_values[i] == the value of the pixel in img_ that
corresponds to the pixel identified by reference_pixel_anchor_idx[i]
and reference_pixel_deltas[i] when the pixel is located relative to
current_shape rather than reference_shape.
!*/
{
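// Note that only the 2x2 linear part (get_m()) of the similarity transform
// from reference_shape to current_shape is used below. The deltas are offsets
// from their anchor landmarks, so they only need to be rotated and scaled;
// the translation comes from the anchor point's location in current_shape.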
const matrix<float,2,2> tform = matrix_cast<float>(find_tform_between_shapes(reference_shape, current_shape).get_m());
const point_transform_affine tform_to_img = unnormalizing_tform(rect);
const rectangle area = get_rect(img_);
const_image_view<image_type> img(img_);
feature_pixel_values.resize(reference_pixel_deltas.size());
for (unsigned long i = 0; i < feature_pixel_values.size(); ++i)
{
// Compute the point in the current shape corresponding to the i-th pixel and
// then map it from the normalized shape space into pixel space.
point p = tform_to_img(tform*reference_pixel_deltas[i] + location(current_shape, reference_pixel_anchor_idx[i]));
if (area.contains(p))
feature_pixel_values[i] = get_pixel_intensity(img[p.y()][p.x()]);
else
feature_pixel_values[i] = 0;
}
}
} // end namespace impl
// ----------------------------------------------------------------------------------------
class shape_predictor
{
public:
shape_predictor (
)
{}
shape_predictor (
const matrix<float,0,1>& initial_shape_,
const std::vector<std::vector<impl::regression_tree> >& forests_,
const std::vector<std::vector<dlib::vector<float,2> > >& pixel_coordinates
) : initial_shape(initial_shape_), forests(forests_)
/*!
requires
- initial_shape.size()%2 == 0
- forests.size() == pixel_coordinates.size() == the number of cascades
- for all valid i:
- all the index values in forests[i] are less than pixel_coordinates[i].size()
- for all valid i and j:
- forests[i][j].leaf_values.size() is a power of 2.
(i.e. we require a tree with all the levels fully filled out.)
- forests[i][j].leaf_values.size() == forests[i][j].splits.size()+1
(i.e. there needs to be the right number of leaves given the number of splits in the tree)
!*/
{
anchor_idx.resize(pixel_coordinates.size());
deltas.resize(pixel_coordinates.size());
// Each cascade uses a different set of pixels for its features. We compute
// their representations relative to the initial shape now and save them.
for (unsigned long i = 0; i < pixel_coordinates.size(); ++i)
impl::create_shape_relative_encoding(initial_shape, pixel_coordinates[i], anchor_idx[i], deltas[i]);
}
unsigned long num_parts (
) const
/*!
ensures
- returns the number of points in the shape
!*/
{
return initial_shape.size()/2;
}
template <typename image_type>
full_object_detection operator()(
const image_type& img,
const rectangle& rect
) const
/*!
ensures
- runs the tree regressor on the detection rect inside img and returns a
full_object_detection DET such that:
- DET.get_rect() == rect
- DET.num_parts() == num_parts()
!*/
{
using namespace impl;
matrix<float,0,1> current_shape = initial_shape;
std::vector<float> feature_pixel_values;
for (unsigned long iter = 0; iter < forests.size(); ++iter)
{
extract_feature_pixel_values(img, rect, current_shape, initial_shape, anchor_idx[iter], deltas[iter], feature_pixel_values);
// evaluate all the trees at this level of the cascade.
for (unsigned long i = 0; i < forests[iter].size(); ++i)
current_shape += forests[iter][i](feature_pixel_values);
}
// convert the current_shape into a full_object_detection
const point_transform_affine tform_to_img = unnormalizing_tform(rect);
std::vector<point> parts(current_shape.size()/2);
for (unsigned long i = 0; i < parts.size(); ++i)
parts[i] = tform_to_img(location(current_shape, i));
return full_object_detection(rect, parts);
}
friend void serialize (const shape_predictor& item, std::ostream& out)
{
int version = 1;
dlib::serialize(version, out);
dlib::serialize(item.initial_shape, out);
dlib::serialize(item.forests, out);
dlib::serialize(item.anchor_idx, out);
dlib::serialize(item.deltas, out);
}
friend void deserialize (shape_predictor& item, std::istream& in)
{
int version = 0;
dlib::deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::shape_predictor.");
dlib::deserialize(item.initial_shape, in);
dlib::deserialize(item.forests, in);
dlib::deserialize(item.anchor_idx, in);
dlib::deserialize(item.deltas, in);
}
private:
matrix<float,0,1> initial_shape;
std::vector<std::vector<impl::regression_tree> > forests;
std::vector<std::vector<unsigned long> > anchor_idx;
std::vector<std::vector<dlib::vector<float,2> > > deltas;
};
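// A minimal usage sketch (illustrative only). It assumes a model has already
// been trained and serialized; the file name "shape_model.dat" and the
// variables img and det_rect are hypothetical.
//
//     shape_predictor sp;
//     std::ifstream fin("shape_model.dat", std::ios::binary);
//     deserialize(sp, fin);
//     full_object_detection shape = sp(img, det_rect);
//     for (unsigned long i = 0; i < shape.num_parts(); ++i)
//         std::cout << "part " << i << ": " << shape.part(i) << std::endl;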
// ----------------------------------------------------------------------------------------
class shape_predictor_trainer
{
/*!
This thing really only works with unsigned char or rgb_pixel images (since we assume the threshold
should be in the range [-128,128]).
!*/
public:
unsigned long cascade_depth (
) const { return 10; }
unsigned long tree_depth (
) const { return 2; }
unsigned long num_trees_per_cascade_level (
) const { return 500; }
double get_nu (
) const { return 0.1; } // the regularizer
std::string random_seed (
) const { return "dlib rules"; }
unsigned long oversampling_amount (
) const { return 20; }
// feature sampling parameters
unsigned long feature_pool_size (
) const { return 400; }// this must be > 1
double get_lambda (
) const { return 0.1; }
unsigned long get_num_test_splits (
) const { return 20; }
double get_feature_pool_region_padding (
) const { return 0; }
template <typename image_array>
shape_predictor train (
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects
) const
{
using namespace impl;
DLIB_CASSERT(images.size() == objects.size() && images.size() > 0, "");
std::vector<training_sample> samples;
const matrix<float,0,1> initial_shape = populate_training_sample_shapes(objects, samples);
const std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape);
rnd.set_seed(random_seed());
std::vector<std::vector<impl::regression_tree> > forests(cascade_depth());
// Now start doing the actual training by filling in the forests
for (unsigned long cascade = 0; cascade < cascade_depth(); ++cascade)
{
// TODO, add some verbose option that prints here
// Each cascade uses a different set of pixels for its features. We compute
// their representations relative to the initial shape first.
std::vector<unsigned long> anchor_idx;
std::vector<dlib::vector<float,2> > deltas;
create_shape_relative_encoding(initial_shape, pixel_coordinates[cascade], anchor_idx, deltas);
// First compute the feature_pixel_values for each training sample at this
// level of the cascade.
for (unsigned long i = 0; i < samples.size(); ++i)
{
extract_feature_pixel_values(images[samples[i].image_idx], samples[i].rect,
samples[i].current_shape, initial_shape, anchor_idx,
deltas, samples[i].feature_pixel_values);
}
// Now start building the trees at this cascade level.
for (unsigned long i = 0; i < num_trees_per_cascade_level(); ++i)
{
forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade]));
}
}
return shape_predictor(initial_shape, forests, pixel_coordinates);
}
private:
static matrix<float,0,1> object_to_shape (
const full_object_detection& obj
)
{
matrix<float,0,1> shape(obj.num_parts()*2);
const point_transform_affine tform_from_img = impl::normalizing_tform(obj.get_rect());
for (unsigned long i = 0; i < obj.num_parts(); ++i)
{
vector<float,2> p = tform_from_img(obj.part(i));
shape(2*i) = p.x();
shape(2*i+1) = p.y();
}
return shape;
}
struct training_sample
{
/*!
CONVENTION
- feature_pixel_values.size() == feature_pool_size()
- feature_pixel_values[j] == the value of the j-th feature pool
pixel when you look it up relative to the shape in current_shape.
- target_shape == The truth shape. Stays constant during the whole
training process.
- rect == the position of the object in the image_idx-th image. All shape
coordinates are coded relative to this rectangle.
!*/
unsigned long image_idx;
rectangle rect;
matrix<float,0,1> target_shape;
matrix<float,0,1> current_shape;
std::vector<float> feature_pixel_values;
void swap(training_sample& item)
{
std::swap(image_idx, item.image_idx);
std::swap(rect, item.rect);
target_shape.swap(item.target_shape);
current_shape.swap(item.current_shape);
feature_pixel_values.swap(item.feature_pixel_values);
}
};
impl::regression_tree make_regression_tree (
std::vector<training_sample>& samples,
const std::vector<dlib::vector<float,2> >& pixel_coordinates
) const
{
using namespace impl;
std::deque<std::pair<unsigned long, unsigned long> > parts;
parts.push_back(std::make_pair(0, samples.size()));
impl::regression_tree tree;
// walk the tree in breadth first order
const unsigned long num_split_nodes = static_cast<unsigned long>(std::pow(2.0, (double)tree_depth())-1);
std::vector<matrix<float,0,1> > sums(num_split_nodes*2+1);
for (unsigned long i = 0; i < samples.size(); ++i)
sums[0] += samples[i].target_shape - samples[i].current_shape;
for (unsigned long i = 0; i < num_split_nodes; ++i)
{
std::pair<unsigned long,unsigned long> range = parts.front();
parts.pop_front();
const impl::split_feature split = generate_split(samples, range.first,
range.second, pixel_coordinates, sums[i], sums[left_child(i)],
sums[right_child(i)]);
tree.splits.push_back(split);
const unsigned long mid = partition_samples(split, samples, range.first, range.second);
parts.push_back(std::make_pair(range.first, mid));
parts.push_back(std::make_pair(mid, range.second));
}
// Now all the parts contain the ranges for the leaves so we can use them to
// compute the average leaf values.
tree.leaf_values.resize(parts.size());
for (unsigned long i = 0; i < parts.size(); ++i)
{
if (parts[i].second != parts[i].first)
tree.leaf_values[i] = sums[num_split_nodes+i]*get_nu()/(parts[i].second - parts[i].first);
else
tree.leaf_values[i] = zeros_matrix(samples[0].target_shape);
// now adjust the current shape based on these predictions
for (unsigned long j = parts[i].first; j < parts[i].second; ++j)
samples[j].current_shape += tree.leaf_values[i];
}
return tree;
}
impl::split_feature randomly_generate_split_feature (
const std::vector<dlib::vector<float,2> >& pixel_coordinates
) const
{
const double lambda = get_lambda();
impl::split_feature feat;
double accept_prob;
do
{
feat.idx1 = rnd.get_random_32bit_number()%feature_pool_size();
feat.idx2 = rnd.get_random_32bit_number()%feature_pool_size();
const double dist = length(pixel_coordinates[feat.idx1]-pixel_coordinates[feat.idx2]);
accept_prob = std::exp(-dist/lambda);
}
while(feat.idx1 == feat.idx2 || !(accept_prob > rnd.get_random_double()));
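// The loop above is a rejection sampler that favors nearby pixel pairs: with
// the default get_lambda() of 0.1, a pair of feature pool pixels 0.1 apart
// (in the normalized shape coordinates) is kept with probability
// exp(-1) ~= 0.37, while widely separated pairs are almost always rejected.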
feat.thresh = (rnd.get_random_double()*256 - 128)/2.0;
return feat;
}
impl::split_feature generate_split (
const std::vector<training_sample>& samples,
unsigned long begin,
unsigned long end,
const std::vector<dlib::vector<float,2> >& pixel_coordinates,
const matrix<float,0,1>& sum,
matrix<float,0,1>& left_sum,
matrix<float,0,1>& right_sum
) const
{
// generate a bunch of random splits and test them and return the best one.
const unsigned long num_test_splits = get_num_test_splits();
// sample the random features we test in this function
std::vector<impl::split_feature> feats;
feats.reserve(num_test_splits);
for (unsigned long i = 0; i < num_test_splits; ++i)
feats.push_back(randomly_generate_split_feature(pixel_coordinates));
std::vector<matrix<float,0,1> > left_sums(num_test_splits);
std::vector<unsigned long> left_cnt(num_test_splits);
// now compute the sums of vectors that go left for each feature
matrix<float,0,1> temp;
for (unsigned long j = begin; j < end; ++j)
{
temp = samples[j].target_shape-samples[j].current_shape;
for (unsigned long i = 0; i < num_test_splits; ++i)
{
if (samples[j].feature_pixel_values[feats[i].idx1] - samples[j].feature_pixel_values[feats[i].idx2] > feats[i].thresh)
{
left_sums[i] += temp;
++left_cnt[i];
}
}
}
// now figure out which feature is the best
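// The score used below is the standard variance reduction criterion for
// regression trees: maximizing |left_sum|^2/left_cnt + |right_sum|^2/right_cnt
// over the candidate splits is equivalent to minimizing the sum of squared
// deviations of the residuals from their per-child means, because the total
// sum of squares at this node doesn't depend on the split.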
double best_score = -1;
unsigned long best_feat = 0;
for (unsigned long i = 0; i < num_test_splits; ++i)
{
// check how well the feature splits the space.
double score = 0;
unsigned long right_cnt = end-begin-left_cnt[i];
if (left_cnt[i] != 0 && right_cnt != 0)
{
temp = sum - left_sums[i];
score = dot(left_sums[i],left_sums[i])/left_cnt[i] + dot(temp,temp)/right_cnt;
if (score > best_score)
{
best_score = score;
best_feat = i;
}
}
}
left_sums[best_feat].swap(left_sum);
if (left_sum.size() != 0)
{
right_sum = sum - left_sum;
}
else
{
right_sum = sum;
left_sum = zeros_matrix(sum);
}
return feats[best_feat];
}
unsigned long partition_samples (
const impl::split_feature& split,
std::vector<training_sample>& samples,
unsigned long begin,
unsigned long end
) const
{
// splits samples based on split (sorta like in quick sort) and returns the mid
// point. make sure you return the mid in a way compatible with how we walk
// through the tree.
unsigned long i = begin;
for (unsigned long j = begin; j < end; ++j)
{
if (samples[j].feature_pixel_values[split.idx1] - samples[j].feature_pixel_values[split.idx2] > split.thresh)
{
samples[i].swap(samples[j]);
++i;
}
}
return i;
}
matrix<float,0,1> populate_training_sample_shapes(
const std::vector<std::vector<full_object_detection> >& objects,
std::vector<training_sample>& samples
) const
{
samples.clear();
matrix<float,0,1> mean_shape;
long count = 0;
// first fill out the target shapes
for (unsigned long i = 0; i < objects.size(); ++i)
{
for (unsigned long j = 0; j < objects[i].size(); ++j)
{
training_sample sample;
sample.image_idx = i;
sample.rect = objects[i][j].get_rect();
sample.target_shape = object_to_shape(objects[i][j]);
for (unsigned long itr = 0; itr < oversampling_amount(); ++itr)
samples.push_back(sample);
mean_shape += sample.target_shape;
++count;
}
}
mean_shape /= count;
// now go pick random initial shapes
for (unsigned long i = 0; i < samples.size(); ++i)
{
if ((i%oversampling_amount()) == 0)
{
// The mean shape is what we really use as an initial shape so always
// include it in the training set as an example starting shape.
samples[i].current_shape = mean_shape;
}
else
{
// Pick a random convex combination of two of the target shapes and use
// that as the initial shape for this sample.
const unsigned long rand_idx = rnd.get_random_32bit_number()%samples.size();
const unsigned long rand_idx2 = rnd.get_random_32bit_number()%samples.size();
const double alpha = rnd.get_random_double();
samples[i].current_shape = alpha*samples[rand_idx].target_shape + (1-alpha)*samples[rand_idx2].target_shape;
}
}
return mean_shape;
}
void randomly_sample_pixel_coordinates (
std::vector<dlib::vector<float,2> >& pixel_coordinates,
const double min_x,
const double min_y,
const double max_x,
const double max_y
) const
/*!
ensures
- #pixel_coordinates.size() == feature_pool_size()
- for all valid i:
- pixel_coordinates[i] == a point in the box defined by the min/max x/y arguments.
!*/
{
pixel_coordinates.resize(feature_pool_size());
for (unsigned long i = 0; i < feature_pool_size(); ++i)
{
pixel_coordinates[i].x() = rnd.get_random_double()*(max_x-min_x) + min_x;
pixel_coordinates[i].y() = rnd.get_random_double()*(max_y-min_y) + min_y;
}
}
std::vector<std::vector<dlib::vector<float,2> > > randomly_sample_pixel_coordinates (
const matrix<float,0,1>& initial_shape
) const
{
const double padding = get_feature_pool_region_padding();
// Figure out the bounds on the object shapes. We will sample uniformly
// from this box.
matrix<float> temp = reshape(initial_shape, initial_shape.size()/2, 2);
const double min_x = min(colm(temp,0))-padding;
const double min_y = min(colm(temp,1))-padding;
const double max_x = max(colm(temp,0))+padding;
const double max_y = max(colm(temp,1))+padding;
std::vector<std::vector<dlib::vector<float,2> > > pixel_coordinates;
pixel_coordinates.resize(cascade_depth());
for (unsigned long i = 0; i < cascade_depth(); ++i)
randomly_sample_pixel_coordinates(pixel_coordinates[i], min_x, min_y, max_x, max_y);
return pixel_coordinates;
}
mutable dlib::rand rnd;
};
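// A minimal training sketch (illustrative only; the container types and file
// name are assumptions). It supposes images is an array of images, e.g. a
// dlib::array<array2d<unsigned char> >, and objects[i] holds the annotated
// full_object_detections for images[i]:
//
//     shape_predictor_trainer trainer;
//     shape_predictor sp = trainer.train(images, objects);
//     std::ofstream fout("shape_model.dat", std::ios::binary);
//     serialize(sp, fout);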
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename image_array
>
double test_shape_predictor (
const shape_predictor& sp,
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects,
const std::vector<std::vector<double> >& scales
)
{
running_stats<double> rs;
for (unsigned long i = 0; i < objects.size(); ++i)
{
for (unsigned long j = 0; j < objects[i].size(); ++j)
{
// Just use a scale of 1 (i.e. no scale at all) if the caller didn't supply
// any scales.
const double scale = scales.size()==0 ? 1 : scales[i][j];
full_object_detection det = sp(images[i], objects[i][j].get_rect());
for (unsigned long k = 0; k < det.num_parts(); ++k)
{
double score = length(det.part(k) - objects[i][j].part(k))/scale;
rs.add(score);
}
}
}
return rs.mean();
}
// ----------------------------------------------------------------------------------------
template <
typename image_array
>
double test_shape_predictor (
const shape_predictor& sp,
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects
)
{
std::vector<std::vector<double> > no_scales;
return test_shape_predictor(sp, images, objects, no_scales);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SHAPE_PREDICToR_H_
// Copyright (C) 2014 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_SHAPE_PREDICToR_ABSTRACT_H_
#ifdef DLIB_SHAPE_PREDICToR_ABSTRACT_H_
#include "full_object_detection_abstract.h"
#include "../matrix.h"
#include "../geometry.h"
#include "../pixel.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class shape_predictor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is a tool that takes in an image region containing some object
and outputs a set of point locations that define the pose of the object.
The classic example is locating the landmark points on a human face inside
a face detection box.
!*/
public:
shape_predictor (
);
/*!
ensures
- #num_parts() == 0
(i.e. this object is default constructed into an empty, untrained state)
!*/
unsigned long num_parts (
) const;
/*!
ensures
- returns the number of points in the shape
!*/
template <typename image_type>
full_object_detection operator()(
const image_type& img,
const rectangle& rect
) const;
/*!
requires
- image_type == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
ensures
- runs the tree regressor on the detection rect inside img and returns a
full_object_detection DET such that:
- DET.get_rect() == rect
- DET.num_parts() == num_parts()
!*/
};
void serialize (const shape_predictor& item, std::ostream& out);
void deserialize (shape_predictor& item, std::istream& in);
/*!
provides serialization support
!*/
// ----------------------------------------------------------------------------------------
class shape_predictor_trainer
{
/*!
This thing really only works with unsigned char or rgb_pixel images (since we assume the threshold
should be in the range [-128,128]).
!*/
public:
unsigned long cascade_depth (
) const { return 10; }
unsigned long tree_depth (
) const { return 2; }
unsigned long num_trees_per_cascade_level (
) const { return 500; }
double get_nu (
) const { return 0.1; } // the regularizer
std::string random_seed (
) const { return "dlib rules"; }
unsigned long oversampling_amount (
) const { return 20; }
// feature sampling parameters
unsigned long feature_pool_size (
) const { return 400; }// this must be > 1
double get_lambda (
) const { return 0.1; }
unsigned long get_num_test_splits (
) const { return 20; }
double get_feature_pool_region_padding (
) const { return 0; }
template <typename image_array>
shape_predictor train (
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects
) const;
};
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename image_array
>
double test_shape_predictor (
const shape_predictor& sp,
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects,
const std::vector<std::vector<double> >& scales
);
/*!
requires
- images.size() == objects.size()
- for all valid i and j:
- objects[i][j].num_parts() == sp.num_parts()
- if (scales.size() != 0) then
- There must be a scale value for each full_object_detection in objects.
That is, it must be the case that:
- scales.size() == objects.size()
- for all valid i:
- scales[i].size() == objects[i].size()
ensures
- Tests the given shape_predictor by running it on each of the given objects and
checking how well it recovers the part positions. In particular, for all
valid i and j we perform:
sp(images[i], objects[i][j].get_rect())
and compare the result with the truth part positions in objects[i][j]. We
then return the average distance between a predicted part location and its
true position.
- if (scales.size() != 0) then
- Each time we compute the distance between a predicted part location and
its true location in objects[i][j] we divide the distance by
scales[i][j]. Therefore, if you want the reported error to be the
average pixel distance then give an empty scales vector, but if you want
the returned value to be something else like the average distance
normalized by some feature of the objects (e.g. the interocular distance)
then you can supply those normalizing values via scales.
!*/
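// For example, if the parts follow the 68-point iBUG face annotation scheme
// (an assumption about your data, not a requirement of this function), you
// could normalize the error by the interocular distance by filling scales
// like this:
//
//     std::vector<std::vector<double> > scales(objects.size());
//     for (unsigned long i = 0; i < objects.size(); ++i)
//         for (unsigned long j = 0; j < objects[i].size(); ++j)
//             scales[i].push_back(length(objects[i][j].part(36) - objects[i][j].part(45)));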
template <
typename image_array
>
double test_shape_predictor (
const shape_predictor& sp,
const image_array& images,
const std::vector<std::vector<full_object_detection> >& objects
);
/*!
requires
- images.size() == objects.size()
- for all valid i and j:
- objects[i][j].num_parts() == sp.num_parts()
ensures
- returns test_shape_predictor(sp, images, objects, no_scales) where no_scales
is an empty vector. That is, the reported error is the average distance, in
pixels, between the predicted and true part locations.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SHAPE_PREDICToR_ABSTRACT_H_