Commit 6401e693 authored by Davis King's avatar Davis King

Added scan_fhog_pyramid.

parent dc4cc092
......@@ -13,6 +13,7 @@
#include "image_processing/scan_image_boxes.h"
#include "image_processing/scan_image_custom.h"
#include "image_processing/remove_unobtainable_rectangles.h"
#include "image_processing/scan_fhog_pyramid.h"
#endif // DLIB_IMAGE_PROCESSInG_H___
......
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_SCAN_fHOG_PYRAMID_H__
#define DLIB_SCAN_fHOG_PYRAMID_H__
#include "scan_fhog_pyramid_abstract.h"
#include "../matrix.h"
#include "../image_transforms.h"
#include "../array.h"
#include "../array2d.h"
#include "object_detector.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
    typename Pyramid_type
    >
class scan_fhog_pyramid : noncopyable
{
    // Sliding window scanner that works on a pyramid of fHOG feature images.
    // load() builds the feature pyramid for an image, detect() runs a bank of 31
    // filters (one per fHOG plane) over every pyramid level, and
    // get_feature_vector() accumulates the features found under a window.  The
    // formal contract of every member is written out in
    // scan_fhog_pyramid_abstract.h.
public:

    typedef matrix<double,0,1> feature_vector_type;
    typedef Pyramid_type pyramid_type;

    // Sets every configuration value to its default (see the out-of-class definition).
    scan_fhog_pyramid (
    );

    // Builds the fHOG feature pyramid for img.
    template <
        typename image_type
        >
    void load (
        const image_type& img
    );

    // True once load() has produced at least one pyramid level.
    inline bool is_loaded_with_image (
    ) const;

    // Copies the configuration values (but not the loaded features) from item.
    inline void copy_configuration (
        const scan_fhog_pyramid& item
    );

    // Sets the size, in pixels, of the detection window.  Both dimensions must
    // be greater than 0.
    void set_detection_window_size (
        unsigned long width,
        unsigned long height
    )
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(width > 0 && height > 0,
            "\t void scan_fhog_pyramid::set_detection_window_size()"
            << "\n\t You can't have a zero sized detection window. "
            << "\n\t width: " << width
            << "\n\t height: " << height
            << "\n\t this: " << this
            );

        window_width = width;
        window_height = height;
    }

    inline unsigned long get_detection_window_width (
    ) const { return window_width; }

    inline unsigned long get_detection_window_height (
    ) const { return window_height; }

    inline unsigned long get_num_detection_templates (
    ) const;

    inline unsigned long get_num_movable_components_per_detection_template (
    ) const;

    // Sets how many fHOG cells the scanning window is grown by on each side
    // (see compute_fhog_window_size()).
    void set_padding (
        unsigned long new_padding
    )
    {
        padding = new_padding;
    }

    unsigned long get_padding (
    ) const { return padding; }

    // Sets the fHOG cell size.  new_cell_size must be greater than 0.
    void set_cell_size (
        unsigned long new_cell_size
    )
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(new_cell_size > 0 ,
            "\t void scan_fhog_pyramid::set_cell_size()"
            << "\n\t You can't have zero sized fHOG cells. "
            << "\n\t this: " << this
            );

        cell_size = new_cell_size;
    }

    unsigned long get_cell_size (
    ) const { return cell_size; }

    // Number of weights needed by the detector: fHOG window width*height*31.
    inline long get_num_dimensions (
    ) const;

    unsigned long get_max_pyramid_levels (
    ) const;

    void set_max_pyramid_levels (
        unsigned long max_levels
    );

    void set_min_pyramid_layer_size (
        unsigned long width,
        unsigned long height
    );

    inline unsigned long get_min_pyramid_layer_width (
    ) const;

    inline unsigned long get_min_pyramid_layer_height (
    ) const;

    // Convenience overload: unpacks w into a filterbank and runs the filterbank
    // version of detect().
    void detect (
        const feature_vector_type& w,
        std::vector<std::pair<double, rectangle> >& dets,
        const double thresh
    ) const
    {
        fhog_filterbank temp = build_fhog_filterbank(w);
        detect(temp, dets, thresh);
    }

    class fhog_filterbank
    {
        // Holds the full 2D filters along with the lists of (row, column) filter
        // pairs produced by build_fhog_filterbank().  Only scan_fhog_pyramid can
        // populate it.
        friend class scan_fhog_pyramid;
    public:

        // Total number of elements summed over all the 2D filters.
        inline unsigned long get_num_dimensions() const
        {
            unsigned long dims = 0;
            for (unsigned long i = 0; i < filters.size(); ++i)
            {
                dims += filters[i].size();
            }
            return dims;
        }

        const std::vector<matrix<float> >& get_filters() const { return filters;}

        // Total number of row/column filter pairs summed over all planes.
        unsigned long num_separable_filters() const
        {
            unsigned long num = 0;
            for (unsigned long i = 0; i < row_filters.size(); ++i)
            {
                num += row_filters[i].size();
            }
            return num;
        }

    private:

        std::vector<matrix<float> > filters;
        std::vector<std::vector<matrix<float,0,1> > > row_filters, col_filters;
    };

    // Unpacks the first get_num_dimensions() elements of weights into 31 filters,
    // each get_fhog_window_height() rows by get_fhog_window_width() columns, and
    // also factors each filter into row/column filter pairs.
    // Requires weights.size() >= get_num_dimensions().
    fhog_filterbank build_fhog_filterbank (
        const feature_vector_type& weights
    ) const
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(weights.size() >= get_num_dimensions(),
            "\t fhog_filterbank scan_fhog_pyramid::build_fhog_filterbank()"
            << "\n\t The number of weights isn't enough to fill out the filterbank. "
            << "\n\t weights.size(): " << weights.size()
            << "\n\t get_num_dimensions(): " << get_num_dimensions()
            << "\n\t this: " << this
            );

        fhog_filterbank temp;
        temp.filters.resize(31);
        temp.row_filters.resize(31);
        temp.col_filters.resize(31);

        // load filters from w
        unsigned long width, height;
        compute_fhog_window_size(width, height);
        const long size = width*height;
        for (unsigned long i = 0; i < temp.filters.size(); ++i)
        {
            matrix<double> u,v,w,f;
            // Filter i occupies elements [i*size, (i+1)*size-1] of weights.
            f = reshape(rowm(weights, range(i*size, (i+1)*size-1)), height, width);
            temp.filters[i] = matrix_cast<float>(f);

            // Factor the filter with svd3() and order the columns of u and v by
            // the values in w.  w2 is a copy so that each rsort_columns() call
            // gets its own unsorted set of values to reorder.
            svd3(f, u,w,v);
            matrix<double> w2 = w;
            rsort_columns(u,w);
            rsort_columns(v,w2);

            // Values in w that round_zeros() considers negligible relative to
            // thresh are set to exactly 0 and so get skipped below.
            double thresh = std::max(1e-3, max(w)*0.01);
            w = round_zeros(w, thresh);

            for (long j = 0; j < w.size(); ++j)
            {
                if (w(j) != 0)
                {
                    // Split the weight evenly between the column and row filters.
                    temp.col_filters[i].push_back(matrix_cast<float>(colm(u,j)*std::sqrt(w(j))));
                    temp.row_filters[i].push_back(matrix_cast<float>(colm(v,j)*std::sqrt(w(j))));
                }
            }
        }

        return temp;
    }

    // Scans the filterbank over every pyramid level and reports all windows
    // scoring at least thresh, sorted by descending score.
    void detect (
        const fhog_filterbank& w,
        std::vector<std::pair<double, rectangle> >& dets,
        const double thresh
    ) const;

    // Adds the feature vector for obj's (mapped) rectangle into psi.
    void get_feature_vector (
        const full_object_detection& obj,
        feature_vector_type& psi
    ) const;

    full_object_detection get_full_object_detection (
        const rectangle& rect,
        const feature_vector_type& w
    ) const;

    // Returns the rectangle that rect gets mapped to by this scanner.
    const rectangle get_best_matching_rect (
        const rectangle& rect
    ) const;

    double get_nuclear_norm_regularization_strength (
    ) const { return nuclear_norm_regularization_strength; }

    void set_nuclear_norm_regularization_strength (
        double strength
    )
    /*!
        requires
            - strength >= 0
        ensures
            - #get_nuclear_norm_regularization_strength() == strength
    !*/
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(strength >= 0,
            "\t void scan_fhog_pyramid::set_nuclear_norm_regularization_strength()"
            << "\n\t You can't have a negative regularization strength. "
            << "\n\t strength: " << strength
            << "\n\t this: " << this
            );

        nuclear_norm_regularization_strength = strength;
    }

    // Width, in fHOG cells, of the padded scanning window.
    unsigned long get_fhog_window_width (
    ) const
    {
        unsigned long width, height;
        compute_fhog_window_size(width, height);
        return width;
    }

    // Height, in fHOG cells, of the padded scanning window.
    unsigned long get_fhog_window_height (
    ) const
    {
        unsigned long width, height;
        compute_fhog_window_size(width, height);
        return height;
    }

    template <typename T>
    friend void serialize (
        const scan_fhog_pyramid<T>& item,
        std::ostream& out
    );

    template <typename T>
    friend void deserialize (
        scan_fhog_pyramid<T>& item,
        std::istream& in
    );

private:

    // Maps the pixel sized detection window into fHOG cell coordinates and then
    // grows the result by padding cells; the outputs are that rectangle's size.
    inline void compute_fhog_window_size(
        unsigned long& width,
        unsigned long& height
    ) const
    {
        const rectangle temp = grow_rect(image_to_fhog(centered_rect(point(0,0),window_width,window_height), cell_size), padding);
        width = temp.width();
        height = temp.height();
    }

    // Orders detections by ascending score.  detect() applies it to reverse
    // iterators, which gives a descending sort.
    static bool compare_pair_rect (
        const std::pair<double, rectangle>& a,
        const std::pair<double, rectangle>& b
    )
    {
        return a.first < b.first;
    }

    // Finds the pyramid level (among the first number_pyramid_levels) whose
    // detection window best matches rect, and reports the corresponding
    // rectangles in image space (mapped_rect) and fHOG space (fhog_rect).
    void get_mapped_rect_and_metadata (
        const unsigned long number_pyramid_levels,
        const rectangle& rect,
        rectangle& mapped_rect,
        rectangle& fhog_rect,
        unsigned long& best_level
    ) const;

    // Ratio of the intersection area to the area of (r1 + r2), computed after
    // moving r1 so its top left corner coincides with r2's.
    double get_match_score (
        rectangle r1,
        rectangle r2
    ) const
    {
        // make the rectangles overlap as much as possible before computing the match score.
        r1 = move_rect(r1, r2.tl_corner());
        return (r1.intersect(r2).area())/static_cast<double>((r1 + r2).area());
    }

    typedef array<array2d<float> > fhog_image;

    // feats[l] is the fHOG image for pyramid level l.
    array<fhog_image> feats;
    // NOTE(review): stored as int although the accessors use unsigned long.
    // The type is part of what serialize() writes, so it is left unchanged.
    int cell_size;
    unsigned long padding;
    unsigned long window_width;
    unsigned long window_height;
    unsigned long max_pyramid_levels;
    unsigned long min_pyramid_layer_width;
    unsigned long min_pyramid_layer_height;
    double nuclear_norm_regularization_strength;
};
// ----------------------------------------------------------------------------------------
template <typename T>
void serialize (
    const scan_fhog_pyramid<T>& item,
    std::ostream& out
)
{
    // Stream layout: format version, the loaded feature pyramid, every
    // configuration field, and finally the feature dimensionality, which
    // deserialize() reads back as a consistency check.
    const int format_version = 1;
    serialize(format_version, out);

    serialize(item.feats, out);

    serialize(item.cell_size, out);
    serialize(item.padding, out);
    serialize(item.window_width, out);
    serialize(item.window_height, out);
    serialize(item.max_pyramid_levels, out);
    serialize(item.min_pyramid_layer_width, out);
    serialize(item.min_pyramid_layer_height, out);
    serialize(item.nuclear_norm_regularization_strength, out);

    const long num_dims = item.get_num_dimensions();
    serialize(num_dims, out);
}
// ----------------------------------------------------------------------------------------
template <typename T>
void deserialize (
    scan_fhog_pyramid<T>& item,
    std::istream& in
)
{
    // Reads the fields in exactly the order serialize() wrote them.  Throws
    // serialization_error for an unknown format version or when the stored
    // dimensionality disagrees with the one computed from the restored
    // configuration.
    int format_version = 0;
    deserialize(format_version, in);
    if (format_version != 1)
        throw serialization_error("Unsupported version found when deserializing a scan_fhog_pyramid object.");

    deserialize(item.feats, in);

    deserialize(item.cell_size, in);
    deserialize(item.padding, in);
    deserialize(item.window_width, in);
    deserialize(item.window_height, in);
    deserialize(item.max_pyramid_levels, in);
    deserialize(item.min_pyramid_layer_width, in);
    deserialize(item.min_pyramid_layer_height, in);
    deserialize(item.nuclear_norm_regularization_strength, in);

    // A feature extractor's dimensionality can change by accident during
    // development.  Comparing the stored value with the current one catches
    // attempts to load data written by such an incompatible version.
    long saved_dims;
    deserialize(saved_dims, in);
    if (item.get_num_dimensions() != saved_dims)
        throw serialization_error("Number of dimensions in serialized scan_fhog_pyramid doesn't match the expected number.");
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// scan_fhog_pyramid member functions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
scan_fhog_pyramid<Pyramid_type>::
scan_fhog_pyramid (
) :
    // Defaults: 8 pixel cells, 1 cell of padding, a 64x64 pixel detection
    // window, at most 1000 pyramid levels, pyramid layers no smaller than
    // 64x64 pixels, and nuclear norm regularization switched off.  The
    // initializers follow the member declaration order.
    cell_size(8),
    padding(1),
    window_width(64),
    window_height(64),
    max_pyramid_levels(1000),
    min_pyramid_layer_width(64),
    min_pyramid_layer_height(64),
    nuclear_norm_regularization_strength(0)
{
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
template <typename image_type>
void scan_fhog_pyramid<Pyramid_type>::
load (
    const image_type& img
)
{
    pyramid_type pyr;

    // Count the pyramid levels to use.  The original image always counts as one
    // level; more are added while the next downsampled rectangle is still at
    // least the minimum layer size and the level cap has not been reached.
    unsigned long levels = 0;
    rectangle rect = get_rect(img);
    for (;;)
    {
        rect = pyr.rect_down(rect);
        ++levels;

        const bool layer_big_enough = rect.width() >= min_pyramid_layer_width &&
                                      rect.height() >= min_pyramid_layer_height;
        if (!layer_big_enough || !(levels < max_pyramid_levels))
            break;
    }

    if (feats.max_size() < levels)
        feats.set_max_size(levels);
    feats.set_size(levels);

    unsigned long width, height;
    compute_fhog_window_size(width,height);

    // Level 0 comes straight from the input image.
    extract_fhog_features(img, feats[0], cell_size,height,width);
    if (feats.size() <= 1)
        return;

    // Each further level is extracted from a downsampled copy of the previous
    // one.  The two buffers are swapped after every step so the image just
    // produced becomes the input of the next downsampling.
    image_type scaled, previous;
    pyr(img, scaled);
    extract_fhog_features(scaled, feats[1], cell_size,height,width);
    swap(scaled,previous);

    unsigned long level = 2;
    while (level < feats.size())
    {
        pyr(previous, scaled);
        extract_fhog_features(scaled, feats[level], cell_size,height,width);
        swap(scaled,previous);
        ++level;
    }
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
bool scan_fhog_pyramid<Pyramid_type>::
is_loaded_with_image (
) const
{
    // load() always creates at least one pyramid level, so a non-empty feats
    // array means an image has been loaded.
    const bool has_levels = (feats.size() != 0);
    return has_levels;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
void scan_fhog_pyramid<Pyramid_type>::
copy_configuration (
    const scan_fhog_pyramid& item
)
{
    // Everything except the loaded feature pyramid (feats) is copied.

    // detection window geometry
    window_width  = item.window_width;
    window_height = item.window_height;
    cell_size     = item.cell_size;
    padding       = item.padding;

    // pyramid limits
    max_pyramid_levels       = item.max_pyramid_levels;
    min_pyramid_layer_width  = item.min_pyramid_layer_width;
    min_pyramid_layer_height = item.min_pyramid_layer_height;

    // training related setting
    nuclear_norm_regularization_strength = item.nuclear_norm_regularization_strength;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
unsigned long scan_fhog_pyramid<Pyramid_type>::
get_num_detection_templates (
) const
{
    // This scanner always reports exactly one detection template.
    const unsigned long num_templates = 1;
    return num_templates;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
unsigned long scan_fhog_pyramid<Pyramid_type>::
get_num_movable_components_per_detection_template (
) const
{
    // This scanner always reports zero movable components.
    const unsigned long num_movable_components = 0;
    return num_movable_components;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
long scan_fhog_pyramid<Pyramid_type>::
get_num_dimensions (
) const
{
    // One weight per cell of the fHOG scanning window for each of the 31
    // feature planes.
    unsigned long fhog_width, fhog_height;
    compute_fhog_window_size(fhog_width,fhog_height);

    const unsigned long cells_per_plane = fhog_width*fhog_height;
    return cells_per_plane*31;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
unsigned long scan_fhog_pyramid<Pyramid_type>::
get_max_pyramid_levels (
) const
{
    // Upper bound on the number of pyramid levels load() will build
    // (1000 unless changed through set_max_pyramid_levels()).
    return max_pyramid_levels;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
void scan_fhog_pyramid<Pyramid_type>::
set_max_pyramid_levels (
    unsigned long max_levels
)
{
    // A pyramid needs at least one level (the original image), so 0 violates
    // this function's requires clause.
    DLIB_ASSERT(max_levels > 0 ,
        "\t void scan_fhog_pyramid::set_max_pyramid_levels()"
        << "\n\t You can't have zero levels. "
        << "\n\t max_levels: " << max_levels
        << "\n\t this: " << this
        );

    // Takes effect on the next call to load().
    max_pyramid_levels = max_levels;
}
// ----------------------------------------------------------------------------------------
template <
    typename Pyramid_type
    >
void scan_fhog_pyramid<Pyramid_type>::
detect (
    const fhog_filterbank& w,
    std::vector<std::pair<double, rectangle> >& dets,
    const double thresh
) const
/*
    Runs the filterbank w over every level of the loaded feature pyramid.

    w      - filterbank built by build_fhog_filterbank(); its dimensionality must
             equal get_num_dimensions().
    dets   - cleared, then filled with (score, rectangle) pairs for every window
             whose score is >= thresh.  Rectangles are mapped back to the
             coordinates of the image given to load().  Sorted by descending
             score.  No non-max suppression is done.
    thresh - minimum score for a window to be reported.

    Requires is_loaded_with_image() == true.
*/
{
    // make sure requires clause is not broken
    DLIB_ASSERT(is_loaded_with_image() &&
                w.get_num_dimensions() == get_num_dimensions(),
        "\t void scan_fhog_pyramid::detect()"
        << "\n\t Invalid inputs were given to this function "
        << "\n\t is_loaded_with_image(): " << is_loaded_with_image()
        << "\n\t w.get_num_dimensions(): " << w.get_num_dimensions()
        << "\n\t get_num_dimensions(): " << get_num_dimensions()
        << "\n\t this: " << this
        );

    dets.clear();

    unsigned long width, height;
    compute_fhog_window_size(width,height);

    // Offset added to each saliency image location before building the
    // detection rectangle.  It is 1 along a dimension whose window size is even
    // and 0 when it is odd.
    const point anchor((width+1)%2,
                       (height+1)%2);

    array2d<float> saliency_image;
    pyramid_type pyr;

    const unsigned long num_separable_filters = w.num_separable_filters();

    // for all pyramid levels
    for (unsigned long l = 0; l < feats.size(); ++l)
    {
        rectangle area;

        // Use the full 2D filters when the bank needs more than 62 separable
        // filters (62 == 2*31; presumably the point past which the separable
        // route is no longer expected to be faster -- TODO confirm), otherwise
        // apply the separable filters.
        if (num_separable_filters > 62)
        {
            area = spatially_filter_image(feats[l][0], saliency_image, w.filters[0]);
            for (unsigned long i = 1; i < w.filters.size(); ++i)
            {
                // now we filter but the output adds to saliency_image rather than
                // overwriting it.
                spatially_filter_image(feats[l][i], saliency_image, w.filters[i], 1, false, true);
            }
        }
        else
        {
            saliency_image.clear();

            // find the first filter to apply
            unsigned long i = 0;
            while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
                ++i;

            for (; i < w.row_filters.size(); ++i)
            {
                for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
                {
                    // The very first filter applied initializes saliency_image
                    // (last argument false); every later one adds to it.
                    if (saliency_image.size() == 0)
                        area = spatially_filter_image_separable(feats[l][i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],1,false,false);
                    else
                        area = spatially_filter_image_separable(feats[l][i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],1,false,true);
                }
            }

            // No separable filters at all means every score is 0.
            // NOTE(review): area is left default constructed in this case, so
            // the search below is expected to visit nothing even when
            // thresh <= 0 -- confirm that this is intended.
            if (saliency_image.size() == 0)
            {
                saliency_image.set_size(feats[l][0].nr(), feats[l][0].nc());
                assign_all_pixels(saliency_image, 0);
            }
        }

        // now search the saliency image for any detections
        for (long r = area.top(); r <= area.bottom(); ++r)
        {
            for (long c = area.left(); c <= area.right(); ++c)
            {
                // if we found a detection
                if (saliency_image[r][c] >= thresh)
                {
                    // Strip the padding cells off the window, convert it to
                    // pixel coordinates at this level and then map it up to the
                    // original image.
                    rectangle rect = fhog_to_image(centered_rect(point(c,r)+anchor,width-2*padding,height-2*padding), cell_size, height,width);
                    rect = pyr.rect_up(rect, l);
                    dets.push_back(std::make_pair(saliency_image[r][c], rect));
                }
            }
        }
    }

    // compare_pair_rect orders by ascending score, so sorting through reverse
    // iterators leaves dets in descending order.
    std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
const rectangle scan_fhog_pyramid<Pyramid_type>::
get_best_matching_rect (
    const rectangle& rect
) const
{
    // Only the mapped rectangle is of interest here; the fHOG space rectangle
    // and the chosen level are discarded.  All max_pyramid_levels levels are
    // considered, whether or not an image has been loaded.
    unsigned long unused_level;
    rectangle unused_fhog_rect;
    rectangle best_rect;
    get_mapped_rect_and_metadata(max_pyramid_levels, rect, best_rect, unused_fhog_rect, unused_level);
    return best_rect;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
void scan_fhog_pyramid<Pyramid_type>::
get_mapped_rect_and_metadata (
    const unsigned long number_pyramid_levels,
    const rectangle& rect,
    rectangle& mapped_rect,
    rectangle& fhog_rect,
    unsigned long& best_level
) const
{
    pyramid_type pyr;

    unsigned long width, height;
    compute_fhog_window_size(width,height);

    // Try each pyramid level in turn and remember the one whose detection
    // window, mapped back into the original image, best matches rect.
    best_level = 0;
    double top_score = -1;
    for (unsigned long level = 0; level < number_pyramid_levels; ++level)
    {
        // rect expressed in the fHOG coordinates of this level
        const rectangle rect_at_level = pyr.rect_down(rect,level);
        const rectangle rect_in_fhog = image_to_fhog(rect_at_level, cell_size, height,width);
        const point fhog_center = center(rect_in_fhog);

        // the unpadded detection window centered at that spot, mapped back
        // into the original image
        const rectangle inner_window = centered_rect(fhog_center,width-2*padding,height-2*padding);
        const rectangle window_in_image = pyr.rect_up(fhog_to_image(inner_window, cell_size, height,width), level);

        const double score = get_match_score(window_in_image, rect);
        if (score > top_score)
        {
            top_score = score;
            best_level = level;
            fhog_rect = centered_rect(fhog_center, width, height);
        }

        // Stop once rect covers at most one fHOG cell at this level.
        if (rect_in_fhog.area() <= 1)
            break;
    }

    // Remove the padding from the winning window and map it up to the original
    // image.
    const rectangle unpadded = shrink_rect(fhog_rect,padding);
    mapped_rect = pyr.rect_up(fhog_to_image(unpadded, cell_size,height,width),best_level);
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
full_object_detection scan_fhog_pyramid<Pyramid_type>::
get_full_object_detection (
    const rectangle& rect,
    const feature_vector_type&
) const
{
    // The weight vector is not used; the detection is just rect wrapped in a
    // full_object_detection.
    full_object_detection detection(rect);
    return detection;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
void scan_fhog_pyramid<Pyramid_type>::
get_feature_vector (
    const full_object_detection& obj,
    feature_vector_type& psi
) const
{
    // make sure requires clause is not broken
    DLIB_ASSERT(is_loaded_with_image() &&
                psi.size() >= get_num_dimensions() &&
                obj.num_parts() == 0,
        "\t void scan_fhog_pyramid::get_feature_vector()"
        << "\n\t Invalid inputs were given to this function "
        << "\n\t is_loaded_with_image(): " << is_loaded_with_image()
        << "\n\t psi.size(): " << psi.size()
        << "\n\t get_num_dimensions(): " << get_num_dimensions()
        << "\n\t obj.num_parts(): " << obj.num_parts()
        << "\n\t this: " << this
        );

    // Find the pyramid level and the fHOG space window that obj's rectangle
    // maps to.
    unsigned long best_level;
    rectangle fhog_rect;
    rectangle mapped_rect;
    get_mapped_rect_and_metadata(feats.size(), obj.get_rect(), mapped_rect, fhog_rect, best_level);

    const fhog_image& planes = feats[best_level];

    // Walk the window plane by plane in row major order and add each feature
    // to psi.  Window cells that fall outside the feature image contribute
    // nothing but still consume an index in psi.
    long out_idx = 0;
    for (unsigned long plane = 0; plane < planes.size(); ++plane)
    {
        const rectangle bounds = get_rect(planes[0]);
        for (long r = fhog_rect.top(); r <= fhog_rect.bottom(); ++r)
        {
            for (long c = fhog_rect.left(); c <= fhog_rect.right(); ++c, ++out_idx)
            {
                if (bounds.contains(c,r))
                    psi(out_idx) += planes[plane][r][c];
            }
        }
    }
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
void scan_fhog_pyramid<Pyramid_type>::
set_min_pyramid_layer_size (
    unsigned long width,
    unsigned long height
)
{
    // Both dimensions must be non-zero according to the requires clause.
    DLIB_ASSERT(width > 0 && height > 0 ,
        "\t void scan_fhog_pyramid::set_min_pyramid_layer_size()"
        << "\n\t These sizes can't be zero. "
        << "\n\t width: " << width
        << "\n\t height: " << height
        << "\n\t this: " << this
        );

    // load() stops adding pyramid levels once the next layer would be smaller
    // than this in either dimension.
    min_pyramid_layer_height = height;
    min_pyramid_layer_width = width;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
unsigned long scan_fhog_pyramid<Pyramid_type>::
get_min_pyramid_layer_width (
) const
{
    // Minimum layer width (64 unless changed by set_min_pyramid_layer_size()).
    return min_pyramid_layer_width;
}
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
unsigned long scan_fhog_pyramid<Pyramid_type>::
get_min_pyramid_layer_height (
) const
{
    // Minimum layer height (64 unless changed by set_min_pyramid_layer_size()).
    return min_pyramid_layer_height;
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <typename Pyramid_type>
matrix<unsigned char> draw_fhog (
    const object_detector<scan_fhog_pyramid<Pyramid_type> >& detector,
    const long cell_draw_size = 15
)
{
    // Unpack the detector's weight vector into its fHOG filters and pass them
    // to the draw_fhog() overload that renders a set of filters.
    typedef typename scan_fhog_pyramid<Pyramid_type>::fhog_filterbank filterbank_type;
    filterbank_type bank = detector.get_scanner().build_fhog_filterbank(detector.get_w());
    return draw_fhog(bank.get_filters(),cell_draw_size);
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type
>
unsigned long num_separable_filters (
const object_detector<scan_fhog_pyramid<Pyramid_type> >& detector
)
{
typename scan_fhog_pyramid<Pyramid_type>::fhog_filterbank fb = detector.get_scanner().build_fhog_filterbank(detector.get_w());
return fb.num_separable_filters();
}
// ----------------------------------------------------------------------------------------
template <
    typename Pyramid_type,
    typename svm_struct_prob_type
    >
void configure_nuclear_norm_regularizer (
    const scan_fhog_pyramid<Pyramid_type>& scanner,
    svm_struct_prob_type& prob
)
{
    // A strength of 0 means the regularizer is disabled, so prob is left alone.
    const double strength = scanner.get_nuclear_norm_regularization_strength();
    if (strength == 0)
        return;

    const unsigned long width = scanner.get_fhog_window_width();
    const unsigned long height = scanner.get_fhog_window_height();

    // Register one regularizer per fHOG plane.  Plane i's filter begins at
    // offset i*width*height in the weight vector and is a height by width
    // matrix.
    int i = 0;
    while (i < 31)
    {
        prob.add_nuclear_norm_regularizer(i*width*height, height, width, strength);
        ++i;
    }
    prob.set_cache_based_epsilon(0.001);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SCAN_fHOG_PYRAMID_H__
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_H__
#ifdef DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_H__
#include <vector>
#include "../image_transforms/fhog_abstract.h"
#include "object_detector_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type
>
class scan_fhog_pyramid : noncopyable
{
/*!
REQUIREMENTS ON Pyramid_type
- Must be one of the pyramid_down objects defined in
dlib/image_transforms/image_pyramid_abstract.h or an object with a
compatible interface
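INITIAL VALUE
- get_cell_size() == 8
- get_padding() == 1
- get_detection_window_width() == 64
- get_detection_window_height() == 64
- get_max_pyramid_levels() == 1000
- get_min_pyramid_layer_width() == 64
- get_min_pyramid_layer_height() == 64
- get_nuclear_norm_regularization_strength() == 0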
WHAT THIS OBJECT REPRESENTS
This object is a tool for running a fixed sized sliding window classifier
over an image pyramid. In particular, it slides a linear classifier over
a HOG pyramid as discussed in the paper:
Histograms of Oriented Gradients for Human Detection by Navneet Dalal
and Bill Triggs, CVPR 2005
However, we augment the method slightly to use the version of HOG features
from:
Object Detection with Discriminatively Trained Part Based Models by
P. Felzenszwalb, R. Girshick, D. McAllester, D. Ramanan
IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 32, No. 9, Sep. 2010
We do this because these HOG features have been shown to give superior performance.
THREAD SAFETY
Concurrent access to an instance of this object is not safe and should be
protected by a mutex lock except for the case where you are copying the
configuration (via copy_configuration()) of a scan_fhog_pyramid object to
many other threads. In this case, it is safe to copy the configuration of
a shared object so long as no other operations are performed on it.
!*/
public:
typedef matrix<double,0,1> feature_vector_type;
typedef Pyramid_type pyramid_type;
scan_fhog_pyramid (
);
/*!
ensures
- this object is properly initialized
!*/
template <
typename image_type
>
void load (
const image_type& img
);
/*!
requires
- image_type == an implementation of array2d/array2d_kernel_abstract.h
- img contains some kind of pixel type.
(i.e. pixel_traits<typename image_type::type> is defined)
ensures
- #is_loaded_with_image() == true
- This object is ready to run a classifier over img to detect object
locations. Call detect() to do this.
!*/
bool is_loaded_with_image (
) const;
/*!
ensures
- returns true if this object has been loaded with an image to process and
false otherwise.
!*/
void copy_configuration (
const scan_fhog_pyramid& item
);
/*!
ensures
- Copies all the state information of item into *this, except for state
information populated by load(). More precisely, given two scan_fhog_pyramid
objects S1 and S2, the following sequence of instructions should always
result in both of them having the exact same state:
S2.copy_configuration(S1);
S1.load(img);
S2.load(img);
!*/
void set_detection_window_size (
unsigned long window_width,
unsigned long window_height
);
/*!
requires
- window_width > 0
- window_height > 0
ensures
- When detect() is called, this object scans a window that is of the given
width and height (in pixels) over each layer in an image pyramid. This
means that the rectangle detections which come out of detect() will have
a width to height ratio approximately equal to window_width/window_height
and will be approximately window_width*window_height pixels in area or
larger. Therefore, the smallest object that can be detected is roughly
window_width by window_height pixels in size.
- #get_detection_window_width() == window_width
- #get_detection_window_height() == window_height
- Since we use a HOG feature representation, the detection procedure works
as follows:
Step 1. Make an image pyramid.
Step 2. Convert each layer of the image pyramid into a 31 band HOG "image".
Step 3. Scan a linear classifier over each HOG image in the pyramid.
Moreover, the HOG features quantize the input image into a grid of cells,
each cell being get_cell_size() by get_cell_size() pixels in size. So
when we scan the object detector over the pyramid we are scanning an
appropriately sized window over these smaller quantized HOG features. In
particular, the size of the window we scan over the HOG feature pyramid
is #get_fhog_window_width() by #get_fhog_window_height() HOG cells in
size.
!*/
unsigned long get_detection_window_width (
) const;
/*!
ensures
- returns the width, in pixels, of the detection window that is scanned
over the image when detect() is called.
!*/
inline unsigned long get_detection_window_height (
) const;
/*!
ensures
- returns the height, in pixels, of the detection window that is scanned
over the image when detect() is called.
!*/
unsigned long get_fhog_window_width (
) const;
/*!
ensures
- Returns the width of the HOG scanning window in terms of HOG cell blocks.
Note that this is a function of get_detection_window_width(), get_cell_size(),
and get_padding() and is therefore not something you set directly.
- #get_fhog_window_width() is approximately equal to the number of HOG cells
that fit into get_detection_window_width() pixels plus 2*get_padding()
since we include additional padding around each window to add context.
!*/
unsigned long get_fhog_window_height (
) const;
/*!
ensures
- Returns the height of the HOG scanning window in terms of HOG cell blocks.
Note that this is a function of get_detection_window_height(), get_cell_size(),
and get_padding() and is therefore not something you set directly.
- #get_fhog_window_height() is approximately equal to the number of HOG cells
that fit into get_detection_window_height() pixels plus 2*get_padding()
since we include additional padding around each window to add context.
!*/
void set_padding (
unsigned long new_padding
);
/*!
ensures
- #get_padding() == new_padding
!*/
unsigned long get_padding (
) const;
/*!
ensures
- The HOG windows scanned over the HOG pyramid can include additional HOG
cells outside the detection window. This can help add context and
improve detection accuracy. This function returns the number of extra
HOG cells added onto the border of the HOG windows which are scanned by
detect().
!*/
unsigned long get_cell_size (
) const;
/*!
ensures
- Returns the size of the HOG cells. Each HOG cell is square and contains
get_cell_size()*get_cell_size() pixels.
!*/
void set_cell_size (
unsigned long new_cell_size
);
/*!
requires
- new_cell_size > 0
ensures
- #get_cell_size() == new_cell_size
!*/
inline long get_num_dimensions (
) const;
/*!
ensures
- returns get_fhog_window_width()*get_fhog_window_height()*31
(i.e. The number of features is equal to the size of the HOG window
times 31 since there are 31 channels in the HOG feature representation.)
!*/
inline unsigned long get_num_detection_templates (
) const { return 1; }
/*!
ensures
- returns 1. Note that this function is here only for compatibility with
the scan_image_pyramid object. Notionally, its return value indicates
that a scan_fhog_pyramid object is always ready to detect objects once
an image has been loaded.
!*/
inline unsigned long get_num_movable_components_per_detection_template (
) const { return 0; }
/*!
ensures
- returns 0. Note that this function is here only for compatibility with
the scan_image_pyramid object. Its return value means that this object
does not support using movable part models.
!*/
unsigned long get_max_pyramid_levels (
) const;
/*!
ensures
- returns the maximum number of image pyramid levels this object will use.
Note that #get_max_pyramid_levels() == 1 indicates that no image pyramid
will be used at all. That is, only the original image will be processed
and no lower scale versions will be created.
!*/
void set_max_pyramid_levels (
unsigned long max_levels
);
/*!
requires
- max_levels > 0
ensures
- #get_max_pyramid_levels() == max_levels
!*/
void set_min_pyramid_layer_size (
unsigned long width,
unsigned long height
);
/*!
requires
- width > 0
- height > 0
ensures
- #get_min_pyramid_layer_width() == width
- #get_min_pyramid_layer_height() == height
!*/
inline unsigned long get_min_pyramid_layer_width (
) const;
/*!
ensures
- returns the smallest allowable width of an image in the image pyramid.
All pyramids will always include the original input image, however, no
pyramid levels will be created which have a width smaller than the
value returned by this function.
!*/
inline unsigned long get_min_pyramid_layer_height (
) const;
/*!
ensures
- returns the smallest allowable height of an image in the image pyramid.
All pyramids will always include the original input image, however, no
pyramid levels will be created which have a height smaller than the
value returned by this function.
!*/
fhog_filterbank build_fhog_filterbank (
const feature_vector_type& weights
) const;
/*!
requires
- weights.size() >= get_num_dimensions()
ensures
- Creates and then returns a fhog_filterbank object FB such that:
- FB.get_num_dimensions() == get_num_dimensions()
- FB.get_filters() == the values in weights unpacked into 31 filters.
- FB.num_separable_filters() == the number of separable filters necessary to
represent all the filters in FB.get_filters().
!*/
class fhog_filterbank
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a HOG filter bank. That is, the classifier that
is slid over a HOG pyramid is a set of 31 linear filters, each
get_fhog_window_height() rows by get_fhog_window_width() columns in
size. This object contains that set of 31 filters.
!*/
public:
unsigned long get_num_dimensions(
) const;
/*!
ensures
- Returns the total number of values in the filters.
!*/
const std::vector<matrix<float> >& get_filters(
) const;
/*!
ensures
- returns the set of 31 HOG filters in this object.
!*/
unsigned long num_separable_filters(
) const;
/*!
ensures
- returns the number of separable filters necessary to represent all
the filters in get_filters().
!*/
};
void detect (
const fhog_filterbank& w,
std::vector<std::pair<double, rectangle> >& dets,
const double thresh
) const;
/*!
requires
- w.get_num_dimensions() == get_num_dimensions()
- is_loaded_with_image() == true
ensures
- Scans the HOG filter defined by w over the HOG pyramid that was populated
by the last call to load() and stores all object detections into #dets.
- for all valid i:
- #dets[i].second == The object box which produced this detection. This rectangle gives
the location of the detection. Note that the rectangle will have been converted back into
the original image input space. That is, if this detection was made at a low level in the
image pyramid then the object box will have been automatically mapped up the pyramid layers
to the original image space. Or in other words, if you plot #dets[i].second on top of the
image given to load() it will show up in the right place.
- #dets[i].first == The score for this detection. This value is equal to dot(w, feature vector
for this sliding window location).
- #dets[i].first >= thresh
- #dets will be sorted in descending order. (i.e. #dets[i].first >= #dets[j].first for all i, and j>i)
- Elements of w beyond index get_num_dimensions()-1 are ignored. I.e. only the first
get_num_dimensions() are used.
- Note that no form of non-max suppression is performed. If a window has a score >= thresh
then it is reported in #dets.
!*/
void detect (
const feature_vector_type& w,
std::vector<std::pair<double, rectangle> >& dets,
const double thresh
) const;
/*!
    requires
        - w.size() >= get_num_dimensions()
          (w is a feature_vector_type, i.e. a matrix<double,0,1> column vector,
          so its length is given by size(), just like psi in
          get_feature_vector())
        - is_loaded_with_image() == true
    ensures
        - performs: detect(build_fhog_filterbank(w), dets, thresh)
          (see the overload of detect() taking a fhog_filterbank for the
          description of what is stored into #dets)
!*/
void get_feature_vector (
const full_object_detection& obj,
feature_vector_type& psi
) const;
/*!
    requires
        - obj.num_parts() == 0
        - is_loaded_with_image() == true
        - psi.size() >= get_num_dimensions()
          (i.e. psi must have preallocated its memory before this function is called)
    ensures
        - This function allows you to determine the feature vector used for an
          object detection output from detect().  Note that this vector is
          added to psi (psi is accumulated into, not overwritten).  Note also
          that you must use get_full_object_detection() to convert a rectangle
          from detect() into the needed full_object_detection.
        - The dimensionality of the vector added to psi is get_num_dimensions().
          This means that elements of psi after psi(get_num_dimensions()-1) are
          not modified.
        - Since scan_fhog_pyramid only searches a limited set of object locations,
          not all possible rectangles can be output by detect().  So in the case
          where obj.get_rect() could not arise from a call to detect(), this
          function will map obj.get_rect() to the nearest possible rectangle and
          then add the feature vector for the mapped rectangle into #psi.
        - get_best_matching_rect(obj.get_rect()) == the rectangle obj.get_rect()
          gets mapped to for feature extraction.
!*/
full_object_detection get_full_object_detection (
const rectangle& rect,
const feature_vector_type& w
) const;
/*!
    ensures
        - returns full_object_detection(rect)
          (This function is here only for compatibility with the scan_image_pyramid
          object)
        - This is the conversion get_feature_vector() asks you to apply to a
          rectangle output by detect().
        - NOTE(review): w does not appear in the stated return value, so it is
          presumably accepted only to match the scan_image_pyramid interface --
          confirm against the implementation.
!*/
const rectangle get_best_matching_rect (
const rectangle& rect
) const;
/*!
    ensures
        - Since scan_fhog_pyramid only searches a limited set of object locations,
          not all possible rectangles can be represented.  Therefore, this function
          allows you to supply a rectangle and obtain the nearest possible
          candidate object location rectangle.
        - returns the candidate object location rectangle nearest to rect.  This
          is the rectangle get_feature_vector() maps obj.get_rect() to when it
          extracts features.
!*/
double get_nuclear_norm_regularization_strength (
) const;
/*!
    ensures
        - If the number of separable filters in a fhog_filterbank is small then the
          filter bank can be scanned over an image much faster than a normal set of
          31 filters.  Therefore, this object provides the option to encourage
          machine learning methods that learn a HOG filter bank (i.e.
          structural_object_detection_trainer) to select filter banks that have
          this beneficial property.  In particular, the value returned by
          get_nuclear_norm_regularization_strength() is a multiplier on a nuclear
          norm regularizer which will encourage the selection of filters that use a
          small number of separable components.  Larger values tend to give a
          smaller number of separable filters.
        - if (get_nuclear_norm_regularization_strength() == 0) then
            - This feature is disabled
        - else
            - A nuclear norm regularizer will be added when
              structural_object_detection_trainer is used to learn a HOG filter
              bank.  Note that this can make the training process take
              significantly longer (but can result in faster object detectors).
!*/
void set_nuclear_norm_regularization_strength (
double strength
);
/*!
    requires
        - strength >= 0
    ensures
        - #get_nuclear_norm_regularization_strength() == strength
          (a strength of 0 disables the nuclear norm regularizer; see
          get_nuclear_norm_regularization_strength() for what larger values do)
!*/
};
// ----------------------------------------------------------------------------------------
template <typename T>
void serialize (
const scan_fhog_pyramid<T>& item,
std::ostream& out
);
/*!
    provides serialization support
    (item is the scanner to save and out is the stream it is saved to; the
    matching deserialize() reads it back)
    NOTE(review): exactly which parts of item's state are saved is not stated
    in this specification -- confirm against the implementation.
!*/
// ----------------------------------------------------------------------------------------
template <typename T>
void deserialize (
scan_fhog_pyramid<T>& item,
std::istream& in
);
/*!
    provides deserialization support
    (in is the stream to read from and item is the scanner that receives the
    state previously saved by the matching serialize())
    NOTE(review): the behavior on malformed input is not stated in this
    specification -- confirm against the implementation.
!*/
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type
>
matrix<unsigned char> draw_fhog (
const object_detector<scan_fhog_pyramid<Pyramid_type> >& detector,
const long cell_draw_size = 15
);
/*!
    requires
        - detector.get_w().size() >= detector.get_scanner().get_num_dimensions()
          (i.e. the detector must have been populated with a HOG filter)
    ensures
        - Converts the HOG filters in the given detector into an image suitable for
          display on the screen.  In particular, we draw all the HOG cells into a
          grayscale image in a way that shows the magnitude and orientation of the
          gradient energy in each cell.  The resulting image is then returned.
        - NOTE(review): cell_draw_size is not described by this specification.
          It presumably sets the size, in pixels, at which each HOG cell is drawn
          in the output image (default 15) and presumably must be > 0 -- confirm
          against the implementation.
!*/
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type
>
unsigned long num_separable_filters (
const object_detector<scan_fhog_pyramid<Pyramid_type> >& detector
);
/*!
    requires
        - detector.get_w().size() >= detector.get_scanner().get_num_dimensions()
          (i.e. the detector must have been populated with a HOG filter)
    ensures
        - Returns the number of separable filters necessary to represent the HOG
          filters in the given detector.
        - This is the detector-level counterpart of
          fhog_filterbank::num_separable_filters().  A smaller value means the
          filters can be scanned over an image faster (see
          get_nuclear_norm_regularization_strength()).
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_H__
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment