Commit 7b006f37 authored by Davis King

Added an option to do bounding box regression to the loss_mmod layer.

parent 16c96bc5
@@ -702,6 +702,8 @@ namespace dlib
double truth_match_iou_threshold = 0.5;
test_box_overlap overlaps_nms = test_box_overlap(0.4);
test_box_overlap overlaps_ignore;
bool use_bounding_box_regression = false;
double bbr_lambda = 100;
use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
@@ -937,7 +939,7 @@ namespace dlib
inline void serialize(const mmod_options& item, std::ostream& out)
{
int version = 3;
int version = 4;
serialize(version, out);
serialize(item.detector_windows, out);
@@ -947,13 +949,15 @@ namespace dlib
serialize(item.overlaps_nms, out);
serialize(item.overlaps_ignore, out);
serialize(static_cast<uint8_t>(item.assume_image_pyramid), out);
serialize(item.use_bounding_box_regression, out);
serialize(item.bbr_lambda, out);
}
inline void deserialize(mmod_options& item, std::istream& in)
{
int version = 0;
deserialize(version, in);
if (version != 3 && version != 2 && version != 1)
if (!(1 <= version && version <= 4))
throw serialization_error("Unexpected version found while deserializing dlib::mmod_options");
if (version == 1)
{
@@ -979,6 +983,13 @@ namespace dlib
deserialize(assume_image_pyramid, in);
item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid);
}
item.use_bounding_box_regression = mmod_options().use_bounding_box_regression; // use default value since this wasn't provided
item.bbr_lambda = mmod_options().bbr_lambda; // use default value since this wasn't provided
if (version >= 4)
{
deserialize(item.use_bounding_box_regression, in);
deserialize(item.bbr_lambda, in);
}
}
// ----------------------------------------------------------------------------------------
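As an aside (not part of the commit), the versioned serialization above means options written by this version round-trip the new fields, while streams written by versions 1 through 3 simply fall back to the defaults. A minimal sketch of that round trip, using only the serialize/deserialize overloads shown here:

    #include <dlib/dnn.h>
    #include <sstream>

    int main()
    {
        dlib::mmod_options opts;                 // default constructed just for illustration
        opts.use_bounding_box_regression = true; // new field added in this commit
        opts.bbr_lambda = 50;

        std::ostringstream sout;
        dlib::serialize(opts, sout);             // writes version 4, including the two new fields

        dlib::mmod_options opts2;
        std::istringstream sin(sout.str());
        dlib::deserialize(opts2, sin);           // opts2 now matches opts; older streams would
                                                 // leave the BBR fields at their default values
        return 0;
    }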
@@ -991,20 +1002,31 @@ namespace dlib
intermediate_detection(
rectangle rect_
) : rect(rect_) {}
) : rect(rect_), rect_bbr(rect_) {}
intermediate_detection(
rectangle rect_,
double detection_confidence_,
size_t tensor_offset_,
long channel
) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
) : rect(rect_), rect_bbr(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
// rect is the rectangle you get without any bounding box regression. So it's
// the basic sliding window box (aka, the "anchor box").
rectangle rect;
double detection_confidence = 0;
size_t tensor_offset = 0;
long tensor_channel = 0;
// rect_bbr = rect + bounding box regression, so it's more accurate. Or if bbr is
// off then this is just rect. The important thing about rect_bbr is that it's the
// rectangle we use for doing NMS.
drectangle rect_bbr;
size_t tensor_offset_dx = 0;
size_t tensor_offset_dy = 0;
size_t tensor_offset_dw = 0;
size_t tensor_offset_dh = 0;
bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; }
};
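The tensor_offset fields above are flat indices into one sample of the output tensor. For detector window k, the detection confidence lives in channel k, and when bounding box regression is enabled its four regression outputs live in channels detector_windows.size() + 4*k through detector_windows.size() + 4*k + 3. A hypothetical helper (not part of the commit) mirroring the (k*nr + r)*nc + c indexing used later:

    #include <cstddef>

    // Illustrative only: flat offset of channel k at row r, column c within one sample
    // of an output tensor with nr rows and nc columns per channel.
    inline std::size_t tensor_offset(long k, long r, long c, long nr, long nc)
    {
        return (static_cast<std::size_t>(k)*nr + r)*nc + c;
    }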
@@ -1032,7 +1054,14 @@ namespace dlib
) const
{
const tensor& output_tensor = sub.get_output();
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor());
@@ -1046,10 +1075,10 @@ namespace dlib
final_dets.clear();
for (unsigned long i = 0; i < dets_accum.size(); ++i)
{
if (overlaps_any_box_nms(final_dets, dets_accum[i].rect))
if (overlaps_any_box_nms(final_dets, dets_accum[i].rect_bbr))
continue;
final_dets.push_back(mmod_rect(dets_accum[i].rect,
final_dets.push_back(mmod_rect(dets_accum[i].rect_bbr,
dets_accum[i].detection_confidence,
options.detector_windows[dets_accum[i].tensor_channel].label));
}
@@ -1075,13 +1104,19 @@ namespace dlib
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
double det_thresh_speed_adjust = 0;
// we will scale the loss so that it doesn't get really huge
const double scale = 1.0/output_tensor.size();
const double scale = 1.0/(output_tensor.nr()*output_tensor.nc()*output_tensor.num_samples()*options.detector_windows.size());
double loss = 0;
float* g = grad.host_write_only();
@@ -1230,6 +1265,59 @@ namespace dlib
hit_truth_table[hittruth.second] = true;
final_dets.push_back(dets[i]);
loss -= options.loss_per_missed_target;
// Now account for BBR loss and gradient if appropriate.
if (options.use_bounding_box_regression)
{
double dx = out_data[dets[i].tensor_offset_dx];
double dy = out_data[dets[i].tensor_offset_dy];
double dw = out_data[dets[i].tensor_offset_dw];
double dh = out_data[dets[i].tensor_offset_dh];
dpoint p = dcenter(dets[i].rect_bbr);
double w = dets[i].rect_bbr.width()-1;
double h = dets[i].rect_bbr.height()-1;
drectangle truth_box = (*truth)[hittruth.second].rect;
dpoint p_truth = dcenter(truth_box);
DLIB_CASSERT(w > 0);
DLIB_CASSERT(h > 0);
double target_dx = (p_truth.x() - p.x())/w;
double target_dy = (p_truth.y() - p.y())/h;
double target_dw = std::log((truth_box.width()-1)/w);
double target_dh = std::log((truth_box.height()-1)/h);
// compute smoothed L1 loss on BBR outputs. This loss
// is just the MSE loss when the error is small and the
// L1 loss when it is large.
dx = dx-target_dx;
dy = dy-target_dy;
dw = dw-target_dw;
dh = dh-target_dh;
// use smoothed L1
double ldx = std::abs(dx)<1 ? 0.5*dx*dx : std::abs(dx)-0.5;
double ldy = std::abs(dy)<1 ? 0.5*dy*dy : std::abs(dy)-0.5;
double ldw = std::abs(dw)<1 ? 0.5*dw*dw : std::abs(dw)-0.5;
double ldh = std::abs(dh)<1 ? 0.5*dh*dh : std::abs(dh)-0.5;
loss += options.bbr_lambda*(ldx + ldy + ldw + ldh);
// now compute the derivatives of the smoothed L1 loss
ldx = put_in_range(-1,1, dx);
ldy = put_in_range(-1,1, dy);
ldw = put_in_range(-1,1, dw);
ldh = put_in_range(-1,1, dh);
// add the smoothed L1 gradient to the gradient output
g[dets[i].tensor_offset_dx] += scale*options.bbr_lambda*ldx;
g[dets[i].tensor_offset_dy] += scale*options.bbr_lambda*ldy;
g[dets[i].tensor_offset_dw] += scale*options.bbr_lambda*ldw;
g[dets[i].tensor_offset_dh] += scale*options.bbr_lambda*ldh;
}
}
else
{
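For reference (not part of the commit), the per-component loss used in the BBR branch above is the smoothed L1 loss, and its gradient is the residual clamped to [-1,1], which is exactly what the put_in_range(-1,1, ...) calls compute before the scale*bbr_lambda weighting. A standalone sketch with made-up function names:

    #include <cmath>

    // Smoothed L1 loss on a single residual x = prediction - target:
    // quadratic for |x| < 1, linear otherwise.
    double smooth_l1(double x)
    {
        const double ax = std::abs(x);
        return ax < 1 ? 0.5*x*x : ax - 0.5;
    }

    // Its derivative is just x clamped to [-1,1].
    double smooth_l1_gradient(double x)
    {
        if (x < -1) return -1;
        if (x >  1) return  1;
        return x;
    }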
@@ -1299,6 +1387,9 @@ namespace dlib
out << ", loss per FA:" << opts.loss_per_false_alarm;
out << ", loss per miss:" << opts.loss_per_missed_target;
out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold;
out << ", use_bounding_box_regression:" << opts.use_bounding_box_regression;
if (opts.use_bounding_box_regression)
out << ", bbr_lambda:" << opts.bbr_lambda;
out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")";
out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")";
@@ -1325,11 +1416,19 @@ namespace dlib
) const
{
DLIB_CASSERT(net.sample_expansion_factor() == 1,net.sample_expansion_factor());
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i;
// scan the final layer and output the positive scoring locations
dets_accum.clear();
for (long k = 0; k < output_tensor.k(); ++k)
for (long k = 0; k < (long)options.detector_windows.size(); ++k)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
@@ -1343,6 +1442,28 @@ namespace dlib
rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect);
dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k));
if (options.use_bounding_box_regression)
{
const auto offset = options.detector_windows.size() + k*4;
dets_accum.back().tensor_offset_dx = ((offset+0)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dy = ((offset+1)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dw = ((offset+2)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dh = ((offset+3)*output_tensor.nr() + r)*output_tensor.nc() + c;
// apply BBR to dets_accum.back()
double dx = out_data[dets_accum.back().tensor_offset_dx];
double dy = out_data[dets_accum.back().tensor_offset_dy];
double dw = out_data[dets_accum.back().tensor_offset_dw];
double dh = out_data[dets_accum.back().tensor_offset_dh];
dw = std::exp(dw);
dh = std::exp(dh);
double w = rect.width()-1;
double h = rect.height()-1;
rect = translate_rect(rect, dpoint(dx*w,dy*h));
rect = centered_drect(rect, w*dw+1, h*dh+1);
dets_accum.back().rect_bbr = rect;
}
}
}
}
......
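To summarize the regression parameterization used in the detection code above (the inverse of the target encoding in the loss): dx and dy shift the anchor box center by a fraction of its width and height, while dw and dh rescale the box by exp(dw) and exp(dh). A minimal sketch (not part of the commit; the helper name is made up, the dlib geometry calls are the ones used above):

    #include <dlib/geometry.h>
    #include <cmath>

    // Illustrative only: apply the four regression outputs to an anchor rectangle
    // the same way the detection code above does.
    dlib::drectangle apply_bbr(dlib::drectangle rect, double dx, double dy, double dw, double dh)
    {
        const double w = rect.width()-1;
        const double h = rect.height()-1;
        rect = dlib::translate_rect(rect, dlib::dpoint(dx*w, dy*h));
        return dlib::centered_drect(rect, w*std::exp(dw)+1, h*std::exp(dh)+1);
    }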
@@ -652,6 +652,21 @@ namespace dlib
// However, sometimes scale-invariance may not be desired.
use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
// By default, the mmod loss doesn't train any bounding box regression model. But
// if you set use_bounding_box_regression == true then it expects the network to
// output a tensor with detector_windows.size()*5 channels rather than just
// detector_windows.size() channels. The 4 extra channels per window are trained
// to give a bounding box regression output that improves the positioning of the
// output detection box.
bool use_bounding_box_regression = false;
// When using bounding box regression, bbr_lambda determines how much you care
// about getting the bounding box shape correct vs just getting the detector to
// find objects. That is, the objective function being optimized is
// basic_mmod_loss + bbr_lambda*bounding_box_regression_loss. So setting
// bbr_lambda to a larger value will cause the overall loss to care more about
// getting the bounding box shape correct.
double bbr_lambda = 100;
mmod_options (
const std::vector<std::vector<mmod_rect>>& boxes,
const unsigned long target_size,
......
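A minimal usage sketch (not part of the commit) showing how the new options might be enabled when setting up training. The member names come from this commit; the single placeholder box and the 40x40 target window are made up, and the rest of the network and trainer setup is omitted:

    #include <dlib/dnn.h>
    #include <vector>

    int main()
    {
        // Placeholder training data; a real program would load this from a dataset.
        std::vector<std::vector<dlib::mmod_rect>> training_boxes(1);
        training_boxes[0].push_back(dlib::mmod_rect(dlib::rectangle(0,0,49,49)));

        // Build the options as usual, then turn on bounding box regression.
        dlib::mmod_options options(training_boxes, 40, 40);
        options.use_bounding_box_regression = true;  // the net must now output detector_windows.size()*5 channels
        options.bbr_lambda = 100;                    // relative weight of the box regression term in the loss
        return 0;
    }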