Added face recognition example

fe1a15f3 · Davis King · bcf6bd9a · fe1a15f3 · fe1a15f3 · fe1a15f3
Commit fe1a15f3 authored Feb 11, 2017 by Davis King
4 changed files
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -45,6 +45,7 @@ ENDMACRO()
 if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
   add_example(dnn_metric_learning_on_images_ex)
   add_example(dnn_metric_learning_ex)
+   add_example(dnn_face_recognition_ex)
   add_example(dnn_introduction_ex)
   add_example(dnn_introduction2_ex)
   add_example(dnn_inception_ex)

--- a/examples/dnn_face_recognition_ex.cpp
+++ b/examples/dnn_face_recognition_ex.cpp
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+    This is an example illustrating the use of the deep learning tools from the dlib C++
+    Library.  In it, we will show how to do face recognition.  This example uses the
+    pretrained dlib_face_recognition_resnet_model_v1 model which is freely available from
+    the dlib web site.  This model has a 99.38% accuracy on the standard LFW face
+    recognition benchmark, which is comparable to other state-of-the-art methods for face
+    recognition as of February 2017. 
+    
+    In this example, we will use dlib to do face clustering.  Included in the examples
+    folder is an image, bald_guys.jpg, which contains a bunch of photos of action movie
+    stars Vin Diesel, The Rock, Jason Statham, and Bruce Willis.   We will use dlib to
+    automatically find their faces in the image and then to automatically determine how
+    many people there are (4 in this case) as well as which faces belong to each person.
+    
+    Finally, this example uses a network with the loss_metric loss.  Therefore, if you want
+    to learn how to train your own models, or to get a general introduction to this loss
+    layer, you should read the dnn_metric_learning_ex.cpp and
+    dnn_metric_learning_on_images_ex.cpp examples.
+*/
+
+#include <dlib/gui_widgets.h>
+#include <dlib/clustering.h>
+#include <dlib/string.h>
+#include <dlib/dnn.h>
+#include <dlib/image_io.h>
+#include <dlib/image_processing/frontal_face_detector.h>
+
+using namespace dlib;
+using namespace std;
+
+// ----------------------------------------------------------------------------------------
+
+// The next bit of code defines a ResNet network.  It's basically copied
+// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
+// layer with loss_metric and make the network somewhat smaller.  Go read the introductory
+// dlib DNN examples to learn what all this stuff means.
+//
+// Also, the dnn_metric_learning_on_images_ex.cpp example shows how to train this network.
+// The dlib_face_recognition_resnet_model_v1 model used by this example was trained using
+// essentially the code shown in dnn_metric_learning_on_images_ex.cpp except the
+// mini-batches were made larger (35x15 instead of 5x5), the iterations-without-progress
+// threshold was set to 10000, the jittering you can see below in jitter_image() was used
+// during training, and the training dataset consisted of about 3 million images instead of 55.
+// residual: runs `block` (stride 1) on the input, which is marked with tag1, and then uses
+// add_prev1 to add the tag1-marked input back onto the block's output.
+template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
+using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
+
+// residual_down: same idea, but `block` is run with stride 2.  The block output is marked
+// with tag2, skip1 jumps back to the tag1-marked input, that input is average pooled with
+// a 2x2 window and stride 2, and add_prev2 adds the tag2-marked block output to it.
+template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
+using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
+
+// block: two 3x3 convolutions with N filters each.  The first one uses the given stride,
+// the second one uses stride 1.  Each convolution is followed by a BN layer and there is
+// a relu between the two.
+template <int N, template <typename> class BN, int stride, typename SUBNET> 
+using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
+
+// ares / ares_down: the residual units above instantiated with `affine` as the BN layer
+// and followed by a relu.
+template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
+template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;
+
+// The levels of the network.  alevel4 is applied first (closest to the input) and
+// alevel0 last.  The filter counts are 32, 64, 128, 256, and 256 respectively.
+template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
+template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
+template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
+template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
+template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;
+
+// The complete network, read from the bottom up: a 150x150 RGB input image, a 7x7
+// convolution with 32 filters and stride 2, affine, relu, a 3x3 max pool with stride 2,
+// the five levels defined above, a pooling layer over everything that is left, and a
+// bias-free fully connected layer with 128 outputs under the loss_metric loss.  Those 128
+// outputs are the face descriptor.  This definition has to match the network stored in
+// the model file that main() deserializes, so don't modify it.
+using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
+                            alevel0<
+                            alevel1<
+                            alevel2<
+                            alevel3<
+                            alevel4<
+                            max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
+                            input_rgb_image_sized<150>
+                            >>>>>>>>>>>>;
+
+// ----------------------------------------------------------------------------------------
+
+// Returns a set of randomly jittered copies of img.  The definition is at the bottom of
+// this file; it is declared here so that main() can call it.
+std::vector<matrix<rgb_pixel>> jitter_image(
+    const matrix<rgb_pixel>& img
+);
+
+// ----------------------------------------------------------------------------------------
+
+// Program entry point.  Expects exactly one argument: the path to the image to process.
+// It detects the faces in the image, computes a 128D descriptor for each of them, clusters
+// the descriptors, and shows one window per cluster.
+//
+// Returns 0 on success and 1 if the arguments are wrong, no face was found, or an
+// exception was thrown (e.g. because a model file or the image could not be loaded).
+int main(int argc, char** argv) try
+{
+    if (argc != 2)
+    {
+        cout << "Run this example by invoking it like this: " << endl;
+        cout << "   ./dnn_face_recognition_ex faces/bald_guys.jpg" << endl;
+        cout << endl;
+        cout << "You will also need to get the face landmarking model file as well as " << endl;
+        cout << "the face recognition model file.  Download and then decompress these files from: " << endl;
+        cout << "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2" << endl;
+        cout << "http://dlib.net/files/dlib_face_recognition_resnet_model_v1.dat.bz2" << endl;
+        cout << endl;
+        return 1;
+    }
+
+    // The first thing we are going to do is load all our models.  First, since we need to
+    // find faces in the image we will need a face detector:
+    frontal_face_detector detector = get_frontal_face_detector();
+    // We will also use a face landmarking model to align faces to a standard pose (see
+    // face_landmark_detection_ex.cpp for an introduction):
+    shape_predictor sp;
+    deserialize("shape_predictor_68_face_landmarks.dat") >> sp;
+    // And finally we load the DNN responsible for face recognition.
+    anet_type net;
+    deserialize("dlib_face_recognition_resnet_model_v1.dat") >> net;
+
+    matrix<rgb_pixel> img;
+    load_image(img, argv[1]);
+    // Display the raw image on the screen
+    image_window win(img);
+
+    // Run the face detector on the image of our action heroes, and for each face extract a
+    // copy that has been normalized to 150x150 pixels in size and appropriately rotated
+    // and centered.
+    std::vector<matrix<rgb_pixel>> faces;
+    for (auto face : detector(img))
+    {
+        auto shape = sp(img, face);
+        matrix<rgb_pixel> face_chip;
+        extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip);
+        faces.push_back(move(face_chip));
+        // Also put some boxes on the faces so we can see that the detector is finding
+        // them.
+        win.add_overlay(face);
+    }
+
+    if (faces.empty())
+    {
+        cout << "No faces found in image!" << endl;
+        return 1;
+    }
+
+    // This call asks the DNN to convert each face image in faces into a 128D vector.
+    // In this 128D vector space, images from the same person will be close to each other
+    // but vectors from different people will be far apart.  So we can use these vectors to
+    // identify if a pair of images are from the same person or from different people.
+    std::vector<matrix<float,0,1>> face_descriptors = net(faces);
+
+
+    // In particular, one simple thing we can do is face clustering.  This next bit of code
+    // creates a graph of connected faces and then uses the Chinese whispers graph clustering
+    // algorithm to identify how many people there are and which faces belong to whom.
+    std::vector<sample_pair> edges;
+    for (size_t i = 0; i < face_descriptors.size(); ++i)
+    {
+        // Note that j starts at i rather than i+1, so every face gets an edge to itself.
+        // chinese_whispers() sizes labels from the largest index that appears in edges,
+        // so without the self edges a face that isn't close to any other face could end
+        // up without a label, and an image containing a single face would produce no
+        // clusters at all.
+        for (size_t j = i; j < face_descriptors.size(); ++j)
+        {
+            // Faces are connected in the graph if they are close enough.  Here we check if
+            // the distance between two face descriptors is less than 0.6, which is the
+            // decision threshold the network was trained to use.  Although you can
+            // certainly use any other threshold you find useful.
+            if (length(face_descriptors[i]-face_descriptors[j]) < 0.6)
+                edges.push_back(sample_pair(i,j));
+        }
+    }
+    std::vector<unsigned long> labels;
+    const auto num_clusters = chinese_whispers(edges, labels);
+    // This will correctly indicate that there are 4 people in the image.
+    cout << "number of people found in the image: "<< num_clusters << endl;
+
+
+    // Now let's display the face clustering results on the screen.  You will see that it
+    // correctly grouped all the faces.
+    std::vector<image_window> win_clusters(num_clusters);
+    for (size_t cluster_id = 0; cluster_id < num_clusters; ++cluster_id)
+    {
+        // Collect all the faces that were assigned to this cluster.
+        std::vector<matrix<rgb_pixel>> temp;
+        for (size_t j = 0; j < labels.size(); ++j)
+        {
+            if (cluster_id == labels[j])
+                temp.push_back(faces[j]);
+        }
+        win_clusters[cluster_id].set_title("face cluster " + cast_to_string(cluster_id));
+        win_clusters[cluster_id].set_image(tile_images(temp));
+    }
+
+
+
+
+    // Finally, let's print one of the face descriptors to the screen.
+    cout << "face descriptor for one face: " << trans(face_descriptors[0]) << endl;
+
+    // It should also be noted that face recognition accuracy can be improved if jittering
+    // is used when creating face descriptors.  In particular, to get 99.38% on the LFW
+    // benchmark you need to use the jitter_image() routine to compute the descriptors,
+    // like so:
+    matrix<float,0,1> face_descriptor = mean(mat(net(jitter_image(faces[0]))));
+    cout << "jittered face descriptor for one face: " << trans(face_descriptor) << endl;
+    // If you use the model without jittering, as we did when clustering the bald guys, it
+    // gets an accuracy of 99.13% on the LFW benchmark.  So jittering makes the whole
+    // procedure a little more accurate but makes face descriptor calculation slower.
+
+
+    cout << "hit enter to terminate" << endl;
+    cin.get();
+}
+catch (const std::exception& e)
+{
+    // Report errors (e.g. a missing model file or an unreadable image) instead of letting
+    // the exception escape from main().
+    cout << e.what() << endl;
+    return 1;
+}
+
+// ----------------------------------------------------------------------------------------
+
+std::vector<matrix<rgb_pixel>> jitter_image(
+    const matrix<rgb_pixel>& img
+)
+{
+    // Produce 100 randomly perturbed versions of img.  Each one is 150x150 and differs
+    // from the others by a small random zoom, rotation, translation, and possibly a
+    // left/right flip.
+    const int num_jitters = 100;
+
+    // One cropper per thread.  It is (re)configured on every call.
+    thread_local random_cropper cropper;
+    cropper.set_chip_dims(150,150);
+    cropper.set_randomly_flip(true);
+    cropper.set_max_object_height(0.99999);
+    cropper.set_background_crops_fraction(0);
+    cropper.set_min_object_height(0.97);
+    cropper.set_translate_amount(0.02);
+    cropper.set_max_rotation_degrees(3);
+
+    // The single "object" handed to the cropper is the image rectangle shrunk by 3.
+    std::vector<mmod_rect> boxes(1);
+    boxes[0] = shrink_rect(get_rect(img),3);
+    // The cropper also outputs the boxes inside each crop, but we have no use for them.
+    std::vector<mmod_rect> unused_boxes;
+
+    std::vector<matrix<rgb_pixel>> jittered;
+    matrix<rgb_pixel> chip;
+    int remaining = num_jitters;
+    while (remaining-- > 0)
+    {
+        cropper(img, boxes, chip, unused_boxes);
+        jittered.push_back(move(chip));
+    }
+    return jittered;
+}
+
+// ----------------------------------------------------------------------------------------
+
--- a/examples/dnn_metric_learning_on_images_ex.cpp
+++ b/examples/dnn_metric_learning_on_images_ex.cpp
@@ -212,13 +212,13 @@ int main(int argc, char** argv)

    net_type net;

-    dnn_trainer<net_type> trainer(net, sgd(0.0005, 0.9));
+    dnn_trainer<net_type> trainer(net, sgd(0.0001, 0.9));
    trainer.set_learning_rate(0.1);
    trainer.be_verbose();
    trainer.set_synchronization_file("face_metric_sync", std::chrono::minutes(5));
    // I've set this to something really small to make the example terminate
    // sooner.  But when you really want to train a good model you should set
-    // this to something like 8000 so training doesn't terminate too early.
+    // this to something like 10000 so training doesn't terminate too early.
    trainer.set_iterations_without_progress_threshold(300);

    // If you have a lot of data then it might not be reasonable to load it all

--- a/examples/faces/bald_guys.jpg
+++ b/examples/faces/bald_guys.jpg