Commit a88f1bd8 authored by Davis King

Made the converter add zero padding layers when needed by Eltwise to replicate the behavior of dlib's add_prev layers.
parent 984b6949
......@@ -21,6 +21,8 @@ struct layer
string type; // comp, loss, or input
int idx;
matrix<long,4,1> output_tensor_shape; // (N,K,NR,NC)
string detail_name; // The name of the tag inside the layer tag. e.g. fc, con, max_pool, input_rgb_image.
std::map<string,double> attributes;
matrix<double> params;
......@@ -49,30 +51,32 @@ struct layer
// ----------------------------------------------------------------------------------------
// Forward declaration; the definition appears later in this file.
// Returns the list of layers read from the dlib network XML file xml_filename.
// input_tensor_shape is the (N,K,NR,NC) shape of the tensor fed to the network; the
// definition hands it to compute_output_tensor_shapes() so that each returned layer
// has its output_tensor_shape filled in.
// Throws dlib::error if the network in the XML file has no input layer.
std::vector<layer> parse_dlib_xml(
const matrix<long,4,1>& input_tensor_shape,
const string& xml_filename
);
// ----------------------------------------------------------------------------------------
template <typename iterator>
string find_layer_caffe_name (
const layer& find_layer (
iterator i,
long tag_id
)
/*!
requires
- i is an iterator pointing to a layer in the list of layers produced by parse_dlib_xml().
- i is a reverse iterator pointing to a layer in the list of layers produced by parse_dlib_xml().
- i is not an input layer.
ensures
- if (tag_id == -1) then
- returns the caffe string name for the previous layer to layer i.
- returns the previous layer (i.e. closer to the input) to layer i.
- else
- returns the caffe string name for the previous layer to layer i with the given tag_id.
- returns the previous layer (i.e. closer to the input) to layer i with the
given tag_id.
!*/
{
if (tag_id == -1)
{
return (i-1)->caffe_layer_name();
return *(i-1);
}
else
{
......@@ -81,7 +85,7 @@ string find_layer_caffe_name (
i--;
// if we hit the end of the network before we found what we were looking for
if (i->tag_id == tag_id)
return i->caffe_layer_name();
return *i;
if (i->type == "input")
throw dlib::error("Network definition is bad, a layer wanted to skip back to a non-existing layer.");
}
......@@ -89,7 +93,19 @@ string find_layer_caffe_name (
}
template <typename iterator>
string find_input_layer_caffe_name (iterator i) { return find_layer_caffe_name(i, i->skip_id); }
const layer& find_input_layer (iterator i) { return find_layer(i, i->skip_id); }
// Convenience wrapper around find_layer(): locates the same layer that
// find_layer(i, tag_id) would return and reports its caffe layer name.
template <typename iterator>
string find_layer_caffe_name (
    iterator i,
    long tag_id
)
{
    const layer& located = find_layer(i, tag_id);
    return located.caffe_layer_name();
}
// Convenience wrapper around find_input_layer(): reports the caffe layer name of
// the layer that feeds layer i.
template <typename iterator>
string find_input_layer_caffe_name (iterator i)
{
    const layer& feeding_layer = find_input_layer(i);
    return feeding_layer.caffe_layer_name();
}
// ----------------------------------------------------------------------------------------
......@@ -116,7 +132,8 @@ void convert_dlib_xml_to_caffe_python_code(
cout << "Writing model to " << out_filename << endl;
ofstream fout(out_filename);
fout.precision(9);
const auto layers = parse_dlib_xml(xml_filename);
const auto layers = parse_dlib_xml({N,K,NR,NC}, xml_filename);
fout << "#\n";
fout << "# !!! This file was automatically generated by dlib's tools/convert_dlib_nets_to_caffe utility. !!!\n";
......@@ -301,10 +318,59 @@ void convert_dlib_xml_to_caffe_python_code(
}
else if (i->detail_name == "add_prev")
{
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
auto in_shape1 = find_input_layer(i).output_tensor_shape;
auto in_shape2 = find_layer(i,i->attribute("tag")).output_tensor_shape;
if (in_shape1 != in_shape2)
{
// if only the number of channels differs then we will use a dummy layer to
// pad with zeros. But otherwise we will throw an error.
if (in_shape1(0) == in_shape2(0) &&
in_shape1(2) == in_shape2(2) &&
in_shape1(3) == in_shape2(3))
{
fout << " n." << i->caffe_layer_name() << "_zeropad = L.DummyData(num=" << in_shape1(0);
fout << ", channels="<<std::abs(in_shape1(1)-in_shape2(1));
fout << ", height="<<in_shape1(2);
fout << ", width="<<in_shape1(3);
fout << ");\n";
string smaller_layer = find_input_layer_caffe_name(i);
string bigger_layer = find_layer_caffe_name(i, i->attribute("tag"));
if (in_shape1(1) > in_shape2(1))
swap(smaller_layer, bigger_layer);
fout << " n." << i->caffe_layer_name() << "_concat = L.Concat(n." << smaller_layer;
fout << ", n." << i->caffe_layer_name() << "_zeropad";
fout << ");\n";
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << i->caffe_layer_name() << "_concat";
fout << ", n." << bigger_layer;
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
}
else
{
std::ostringstream sout;
sout << "The dlib network contained an add_prev layer (layer idx " << i->idx << ") that adds two previous ";
sout << "layers with different output tensor dimensions. Caffe's equivalent layer, Eltwise, doesn't support ";
sout << "adding layers together with different dimensions. In the special case where the only difference is ";
sout << "in the number of channels, this converter program will add a dummy layer that outputs a tensor full of zeros ";
sout << "and concat it appropriately so this will work. However, this network you are converting has tensor dimensions ";
sout << "different in values other than the number of channels. In particular, here are the two tensor shapes (batch size, channels, rows, cols): ";
std::ostringstream sout2;
sout2 << wrap_string(sout.str()) << endl;
sout2 << trans(in_shape1);
sout2 << trans(in_shape2);
throw dlib::error(sout2.str());
}
}
else
{
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
}
}
else
{
......@@ -549,7 +615,68 @@ public:
// ----------------------------------------------------------------------------------------
void compute_output_tensor_shapes(const matrix<long,4,1>& input_tensor_shape, std::vector<layer>& layers)
/*!
    requires
        - layers is not empty and layers.back() is the network's input layer
          (checked with DLIB_CASSERT).
    ensures
        - layers.back().output_tensor_shape == input_tensor_shape
        - Every other layer's output_tensor_shape is set to the (N,K,NR,NC) shape of
          the tensor it outputs:
            - fc, fc_no_bias:     (N, num_outputs, 1, 1)
            - con:                (N, num_filters, NR', NC')
            - max_pool, avg_pool: (N, K, NR', NC')
            - add_prev:           element-wise max of the two input shapes
            - anything else:      same shape as its input layer
          where NR' = 1+(NR + 2*padding_y - window_nr)/stride_y and similarly for NC'.
        - For max_pool/avg_pool a window size of 0 is taken to mean "the whole input
          dimension", matching dlib's max_pool_everything/avg_pool_everything
          convention, rather than a literal 0-sized window.
!*/
{
    DLIB_CASSERT(layers.back().type == "input");
    layers.back().output_tensor_shape = input_tensor_shape;

    // Size of one spatial output dimension of a sliding window (con/pool) layer.
    // Integer division intentionally truncates, as in the original formula.
    const auto windowed_size = [](long in_size, long window, long stride, long padding)
    {
        return 1 + (in_size + 2*padding - window)/stride;
    };

    // Walk from the layer right after the input toward layers.front().  Since
    // find_input_layer()/find_layer() only look at layers closer to the input, the
    // shapes they read have always been computed already.
    for (auto i = ++layers.rbegin(); i != layers.rend(); ++i)
    {
        const auto input_shape = find_input_layer(i).output_tensor_shape;
        if (i->type == "comp")
        {
            if (i->detail_name == "fc" || i->detail_name == "fc_no_bias")
            {
                long num_outputs = i->attribute("num_outputs");
                i->output_tensor_shape = {input_shape(0), num_outputs, 1, 1};
            }
            else if (i->detail_name == "con")
            {
                long num_filters = i->attribute("num_filters");
                long filter_nc = i->attribute("nc");
                long filter_nr = i->attribute("nr");
                long stride_x = i->attribute("stride_x");
                long stride_y = i->attribute("stride_y");
                long padding_x = i->attribute("padding_x");
                long padding_y = i->attribute("padding_y");
                long nr = windowed_size(input_shape(2), filter_nr, stride_y, padding_y);
                long nc = windowed_size(input_shape(3), filter_nc, stride_x, padding_x);
                i->output_tensor_shape = {input_shape(0), num_filters, nr, nc};
            }
            else if (i->detail_name == "max_pool" || i->detail_name == "avg_pool")
            {
                long filter_nc = i->attribute("nc");
                long filter_nr = i->attribute("nr");
                long stride_x = i->attribute("stride_x");
                long stride_y = i->attribute("stride_y");
                long padding_x = i->attribute("padding_x");
                long padding_y = i->attribute("padding_y");
                // dlib pooling layers use a window size of 0 to mean "pool over the
                // entire input dimension".  Substitute the real window size so the
                // output size is computed correctly (e.g. 1 rather than NR+1).
                if (filter_nr == 0)
                    filter_nr = input_shape(2);
                if (filter_nc == 0)
                    filter_nc = input_shape(3);
                long nr = windowed_size(input_shape(2), filter_nr, stride_y, padding_y);
                long nc = windowed_size(input_shape(3), filter_nc, stride_x, padding_x);
                i->output_tensor_shape = {input_shape(0), input_shape(1), nr, nc};
            }
            else if (i->detail_name == "add_prev")
            {
                // The output is as large as the larger input along every dimension.
                auto aux_shape = find_layer(i, i->attribute("tag")).output_tensor_shape;
                for (long j = 0; j < input_shape.size(); ++j)
                    i->output_tensor_shape(j) = std::max(input_shape(j), aux_shape(j));
            }
            else
            {
                // All other computational layers leave the tensor shape unchanged.
                i->output_tensor_shape = input_shape;
            }
        }
        else
        {
            // Non-"comp" layers pass their input shape through.
            i->output_tensor_shape = input_shape;
        }
    }
}
// ----------------------------------------------------------------------------------------
std::vector<layer> parse_dlib_xml(
const matrix<long,4,1>& input_tensor_shape,
const string& xml_filename
)
{
......@@ -561,6 +688,8 @@ std::vector<layer> parse_dlib_xml(
if (dh.layers.back().type != "input")
throw dlib::error("The network in the XML file is missing an input layer!");
compute_output_tensor_shapes(input_tensor_shape, dh.layers);
return dh.layers;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment