Commit 44387e39 authored by Davis King

merged

parents 97ff8cb2 a88f1bd8
@@ -21,6 +21,8 @@ struct layer
string type; // comp, loss, or input
int idx;
matrix<long,4,1> output_tensor_shape; // (N,K,NR,NC)
string detail_name; // The name of the tag inside the layer tag. e.g. fc, con, max_pool, input_rgb_image.
std::map<string,double> attributes;
matrix<double> params;
@@ -49,30 +51,32 @@ struct layer
// ----------------------------------------------------------------------------------------
std::vector<layer> parse_dlib_xml(
const matrix<long,4,1>& input_tensor_shape,
const string& xml_filename
);
// ----------------------------------------------------------------------------------------
template <typename iterator>
string find_layer_caffe_name (
const layer& find_layer (
iterator i,
long tag_id
)
/*!
requires
- i is an iterator pointing to a layer in the list of layers produced by parse_dlib_xml().
- i is a reverse iterator pointing to a layer in the list of layers produced by parse_dlib_xml().
- i is not an input layer.
ensures
- if (tag_id == -1) then
- returns the caffe string name for the previous layer to layer i.
- returns the previous layer (i.e. closer to the input) to layer i.
- else
- returns the caffe string name for the previous layer to layer i with the given tag_id.
- returns the previous layer (i.e. closer to the input) to layer i with the
given tag_id.
!*/
{
if (tag_id == -1)
{
return (i-1)->caffe_layer_name();
return *(i-1);
}
else
{
@@ -81,7 +85,7 @@ string find_layer_caffe_name (
i--;
if (i->tag_id == tag_id)
return i->caffe_layer_name();
return *i;
// if we hit the end of the network before we found what we were looking for
if (i->type == "input")
throw dlib::error("Network definition is bad, a layer wanted to skip back to a non-existing layer.");
}
@@ -89,7 +93,19 @@ string find_layer_caffe_name (
}
template <typename iterator>
string find_input_layer_caffe_name (iterator i) { return find_layer_caffe_name(i, i->skip_id); }
const layer& find_input_layer (iterator i) { return find_layer(i, i->skip_id); }
template <typename iterator>
string find_layer_caffe_name (
iterator i,
long tag_id
)
{
return find_layer(i,tag_id).caffe_layer_name();
}
template <typename iterator>
string find_input_layer_caffe_name (iterator i) { return find_input_layer(i).caffe_layer_name(); }
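// A minimal usage sketch (call site assumed for illustration): in the
// conversion loop below, i is a reverse iterator over the layers, so
// const layer& prev = find_input_layer(i); // the layer feeding layer i
// string name = find_input_layer_caffe_name(i); // that layer's caffe name
// where the second call is just shorthand for
// find_layer(i, i->skip_id).caffe_layer_name().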
// ----------------------------------------------------------------------------------------
@@ -105,14 +121,19 @@ void print_as_np_array(std::ostream& out, const matrix_exp<EXP>& m)
// ----------------------------------------------------------------------------------------
void convert_dlib_xml_to_caffe_python_code(
const string& xml_filename
const string& xml_filename,
const long N,
const long K,
const long NR,
const long NC
)
{
const string out_filename = left_substr(xml_filename,".") + "_dlib_to_caffe_model.py";
cout << "Writing model to " << out_filename << endl;
ofstream fout(out_filename);
fout.precision(9);
const auto layers = parse_dlib_xml(xml_filename);
const auto layers = parse_dlib_xml({N,K,NR,NC}, xml_filename);
fout << "#\n";
fout << "# !!! This file was automatically generated by dlib's tools/convert_dlib_nets_to_caffe utility. !!!\n";
@@ -124,26 +145,34 @@ void convert_dlib_xml_to_caffe_python_code(
// dlib nets don't commit to a batch size, so just use 1 as the default
fout << "\n# Input tensor dimensions" << endl;
fout << "batch_size = 1;" << endl;
fout << "input_batch_size = " << N << ";" << endl;
if (layers.back().detail_name == "input_rgb_image")
{
cout << "WARNING: The source dlib network didn't commit to a specific input tensor size, we are using a default size of 28x28x1 which is appropriate for MNIST input. But if you are using different inputs you will need to edit the auto-generated python script to tell it your input size." << endl;
fout << "input_nr = 28; #WARNING, the source dlib network didn't commit to a specific input size, so we put 28 here as a default. It might not be the right value." << endl;
fout << "input_nc = 28; #WARNING, the source dlib network didn't commit to a specific input size, so we put 28 here as a default. It might not be the right value." << endl;
fout << "input_k = 3;" << endl;
fout << "input_num_channels = 3;" << endl;
fout << "input_num_rows = "<<NR<<";" << endl;
fout << "input_num_cols = "<<NC<<";" << endl;
if (K != 3)
throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==3, but the dtoc command line specified NUM_CHANNELS=="+to_string(K));
}
else if (layers.back().detail_name == "input_rgb_image_sized")
{
fout << "input_nr = " << layers.back().attribute("nr") << ";" << endl;
fout << "input_nc = " << layers.back().attribute("nc") << ";" << endl;
fout << "input_k = 3;" << endl;
fout << "input_num_channels = 3;" << endl;
fout << "input_num_rows = " << layers.back().attribute("nr") << ";" << endl;
fout << "input_num_cols = " << layers.back().attribute("nc") << ";" << endl;
if (NR != layers.back().attribute("nr"))
throw dlib::error("The dlib model requires input tensors with NUM_ROWS=="+to_string((long)layers.back().attribute("nr"))+", but the dtoc command line specified NUM_ROWS=="+to_string(NR));
if (NC != layers.back().attribute("nc"))
throw dlib::error("The dlib model requires input tensors with NUM_COLUMNS=="+to_string((long)layers.back().attribute("nc"))+", but the dtoc command line specified NUM_COLUMNS=="+to_string(NC));
if (K != 3)
throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==3, but the dtoc command line specified NUM_CHANNELS=="+to_string(K));
}
else if (layers.back().detail_name == "input")
{
cout << "WARNING: The source dlib network didn't commit to a specific input tensor size, we are using a default size of 28x28x1 which is appropriate for MNIST input. But if you are using different inputs you will need to edit the auto-generated python script to tell it your input size." << endl;
fout << "input_nr = 28; #WARNING, the source dlib network didn't commit to a specific input size, so we put 28 here as a default. It might not be the right value." << endl;
fout << "input_nc = 28; #WARNING, the source dlib network didn't commit to a specific input size, so we put 28 here as a default. It might not be the right value." << endl;
fout << "input_k = 1;" << endl;
fout << "input_num_channels = 1;" << endl;
fout << "input_num_rows = "<<NR<<";" << endl;
fout << "input_num_cols = "<<NC<<";" << endl;
if (K != 1)
throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==1, but the dtoc command line specified NUM_CHANNELS=="+to_string(K));
}
else
{
@@ -173,7 +202,7 @@ void convert_dlib_xml_to_caffe_python_code(
fout << " # For reference, the only \"documentation\" about caffe layer parameters seems to be this page:\n";
fout << " # https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto\n" << endl;
fout << " n = caffe.NetSpec(); " << endl;
fout << " n.data,n.label = L.MemoryData(batch_size=batch_size, channels=input_k, height=input_nr, width=input_nc, ntop=2)" << endl;
fout << " n.data,n.label = L.MemoryData(batch_size=input_batch_size, channels=input_num_channels, height=input_num_rows, width=input_num_cols, ntop=2)" << endl;
// iterate the layers starting with the input layer
for (auto i = layers.rbegin(); i != layers.rend(); ++i)
{
@@ -289,10 +318,59 @@ void convert_dlib_xml_to_caffe_python_code(
}
else if (i->detail_name == "add_prev")
{
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
auto in_shape1 = find_input_layer(i).output_tensor_shape;
auto in_shape2 = find_layer(i,i->attribute("tag")).output_tensor_shape;
if (in_shape1 != in_shape2)
{
// if only the number of channels differs then we will use a dummy layer to
// pad with zeros. But otherwise we will throw an error.
if (in_shape1(0) == in_shape2(0) &&
in_shape1(2) == in_shape2(2) &&
in_shape1(3) == in_shape2(3))
{
fout << " n." << i->caffe_layer_name() << "_zeropad = L.DummyData(num=" << in_shape1(0);
fout << ", channels="<<std::abs(in_shape1(1)-in_shape2(1));
fout << ", height="<<in_shape1(2);
fout << ", width="<<in_shape1(3);
fout << ");\n";
string smaller_layer = find_input_layer_caffe_name(i);
string bigger_layer = find_layer_caffe_name(i, i->attribute("tag"));
if (in_shape1(1) > in_shape2(1))
swap(smaller_layer, bigger_layer);
fout << " n." << i->caffe_layer_name() << "_concat = L.Concat(n." << smaller_layer;
fout << ", n." << i->caffe_layer_name() << "_zeropad";
fout << ");\n";
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << i->caffe_layer_name() << "_concat";
fout << ", n." << bigger_layer;
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
}
else
{
std::ostringstream sout;
sout << "The dlib network contained an add_prev layer (layer idx " << i->idx << ") that adds two previous ";
sout << "layers with different output tensor dimensions. Caffe's equivalent layer, Eltwise, doesn't support ";
sout << "adding layers together with different dimensions. In the special case where the only difference is ";
sout << "in the number of channels, this converter program will add a dummy layer that outputs a tensor full of zeros ";
sout << "and concat it appropriately so this will work. However, this network you are converting has tensor dimensions ";
sout << "different in values other than the number of channels. In particular, here are the two tensor shapes (batch size, channels, rows, cols): ";
std::ostringstream sout2;
sout2 << wrap_string(sout.str()) << endl;
sout2 << trans(in_shape1);
sout2 << trans(in_shape2);
throw dlib::error(sout2.str());
}
}
else
{
fout << " n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
fout << ", operation=P.Eltwise.SUM";
fout << ");\n";
}
}
else
{
@@ -389,15 +467,24 @@ void convert_dlib_xml_to_caffe_python_code(
int main(int argc, char** argv) try
{
if (argc == 1)
if (argc != 6)
{
cout << "Give this program an xml file generated by dlib::net_to_xml() and it will" << endl;
cout << "convert it into a python file that outputs a caffe model containing the dlib model." << endl;
cout << "To use this program, give it an xml file generated by dlib::net_to_xml() " << endl;
cout << "and then 4 numbers that indicate the input tensor size. It will convert " << endl;
cout << "the xml file into a python file that outputs a caffe model containing the dlib model." << endl;
cout << "For example, you might run this program like this: " << endl;
cout << " ./dtoc lenet.xml 1 1 28 28" << endl;
cout << "would convert the lenet.xml model into a caffe model with an input tensor of shape(1,1,28,28)" << endl;
cout << "where the shape values are (num samples in batch, num channels, num rows, num columns)." << endl;
return 0;
}
for (int i = 1; i < argc; ++i)
convert_dlib_xml_to_caffe_python_code(argv[i]);
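// dlib's sa is a global string_assign object: assigning argv[k] to it and
// then to a long converts the text argument to a number, throwing an error
// if the argument isn't numeric.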
const long N = sa = argv[2];
const long K = sa = argv[3];
const long NR = sa = argv[4];
const long NC = sa = argv[5];
convert_dlib_xml_to_caffe_python_code(argv[1], N, K, NR, NC);
return 0;
}
@@ -528,7 +615,68 @@ public:
// ----------------------------------------------------------------------------------------
void compute_output_tensor_shapes(const matrix<long,4,1>& input_tensor_shape, std::vector<layer>& layers)
{
DLIB_CASSERT(layers.back().type == "input");
layers.back().output_tensor_shape = input_tensor_shape;
for (auto i = ++layers.rbegin(); i != layers.rend(); ++i)
{
const auto input_shape = find_input_layer(i).output_tensor_shape;
if (i->type == "comp")
{
if (i->detail_name == "fc" || i->detail_name == "fc_no_bias")
{
long num_outputs = i->attribute("num_outputs");
i->output_tensor_shape = {input_shape(0), num_outputs, 1, 1};
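// An fc layer fully connects to every element of its input tensor, so the
// output shape is always (batch, num_outputs, 1, 1) no matter what K, NR,
// and NC the input had.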
}
else if (i->detail_name == "con")
{
long num_filters = i->attribute("num_filters");
long filter_nc = i->attribute("nc");
long filter_nr = i->attribute("nr");
long stride_x = i->attribute("stride_x");
long stride_y = i->attribute("stride_y");
long padding_x = i->attribute("padding_x");
long padding_y = i->attribute("padding_y");
long nr = 1+(input_shape(2) + 2*padding_y - filter_nr)/stride_y;
long nc = 1+(input_shape(3) + 2*padding_x - filter_nc)/stride_x;
i->output_tensor_shape = {input_shape(0), num_filters, nr, nc};
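// A worked example (numbers assumed purely for illustration): a 28x28 input
// with a 5x5 filter, stride 1, and no padding gives
// nr = nc = 1 + (28 + 2*0 - 5)/1 = 24
// so a con layer with 16 filters outputs a (N,16,24,24) tensor.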
}
else if (i->detail_name == "max_pool" || i->detail_name == "avg_pool")
{
long filter_nc = i->attribute("nc");
long filter_nr = i->attribute("nr");
long stride_x = i->attribute("stride_x");
long stride_y = i->attribute("stride_y");
long padding_x = i->attribute("padding_x");
long padding_y = i->attribute("padding_y");
long nr = 1+(input_shape(2) + 2*padding_y - filter_nr)/stride_y;
long nc = 1+(input_shape(3) + 2*padding_x - filter_nc)/stride_x;
i->output_tensor_shape = {input_shape(0), input_shape(1), nr, nc};
}
else if (i->detail_name == "add_prev")
{
auto aux_shape = find_layer(i, i->attribute("tag")).output_tensor_shape;
for (long j = 0; j < input_shape.size(); ++j)
i->output_tensor_shape(j) = std::max(input_shape(j), aux_shape(j));
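// Taking the element-wise max of the two shapes mirrors the zero padding
// trick used when emitting the add_prev layer above: when the inputs differ
// only in channel count, the sum has the larger number of channels.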
}
else
{
i->output_tensor_shape = input_shape;
}
}
else
{
i->output_tensor_shape = input_shape;
}
}
}
// ----------------------------------------------------------------------------------------
std::vector<layer> parse_dlib_xml(
const matrix<long,4,1>& input_tensor_shape,
const string& xml_filename
)
{
@@ -540,6 +688,8 @@ std::vector<layer> parse_dlib_xml(
if (dh.layers.back().type != "input")
throw dlib::error("The network in the XML file is missing an input layer!");
compute_output_tensor_shapes(input_tensor_shape, dh.layers);
return dh.layers;
}
......
@@ -10,8 +10,10 @@ import numpy as np
# dlib lenet model. Then you need to convert that model into a "dlib to caffe
# model" python script. You can do this using the command line program
# included with dlib: tools/convert_dlib_nets_to_caffe. That program will
# output a lenet_dlib_to_caffe_model.py file. This line here imports that
# file.
# output a lenet_dlib_to_caffe_model.py file. You run that program like this:
# ./dtoc lenet.xml 1 1 28 28
# and it will create the lenet_dlib_to_caffe_model.py file, which we import
# with the next line:
import lenet_dlib_to_caffe_model as dlib_model
# lenet_dlib_to_caffe_model defines a function, save_as_caffe_model() that does
@@ -54,12 +56,12 @@ data = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,121,254,254,219,40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,121,254,207,18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float32');
data.shape = (dlib_model.batch_size, dlib_model.input_k, dlib_model.input_nr, dlib_model.input_nc);
data.shape = (dlib_model.input_batch_size, dlib_model.input_num_channels, dlib_model.input_num_rows, dlib_model.input_num_cols);
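# The flat array above holds the 28*28 = 784 pixels of an MNIST digit (a 7);
# reshaping it with the generated dimensions gives (1,1,28,28) for this lenet
# model, which is the (batch, channels, rows, cols) layout caffe expects.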
# labels isn't logically needed but there doesn't seem to be a way to use
# caffe's Net interface without providing a superfluous input array. So we do
# that here.
labels = np.ones((dlib_model.batch_size), dtype='float32')
labels = np.ones((dlib_model.input_batch_size), dtype='float32')
# Give the image to caffe
net.set_input_arrays(data/256, labels)
# Run the data through the network and get the results.
@@ -67,7 +69,7 @@ out = net.forward()
# Print outputs, looping over minibatch. You should see that the network
# correctly classifies the image (it's the number 7).
for i in xrange(dlib_model.batch_size):
for i in xrange(dlib_model.input_batch_size):
print i, 'net final layer = ', out['fc1'][i]
print i, 'predicted number =', np.argmax(out['fc1'][i])
......