Commit 373cbb57 authored by Davis King

Added a tt::inv() for computing matrix inversions on the GPU.

parent b3dc8169
......@@ -568,14 +568,24 @@ if (NOT TARGET dlib)
endif()
endif()
endif()
endif()
# Find where cuSOLVER is since the FindCUDA cmake package doesn't
# bother to look for it.
get_filename_component(cuda_blas_path ${CUDA_CUBLAS_LIBRARIES} DIRECTORY)
find_library(cusolver cusolver HINTS ${cuda_blas_path})
# Also find OpenMP since cuSOLVER needs it.
find_package(OpenMP)
if (NOT OPENMP_FOUND)
message(STATUS "*** Didn't find OpenMP, which is required to use CUDA. ***")
endif()
endif()
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked)
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked AND OPENMP_FOUND)
set(source_files ${source_files}
dnn/cuda_dlib.cu
dnn/cudnn_dlibapi.cpp
dnn/cublas_dlibapi.cpp
dnn/cusolver_dlibapi.cu
dnn/curand_dlibapi.cpp
dnn/cuda_data_ptr.cpp
dnn/gpu_data.cpp
......@@ -584,6 +594,7 @@ if (NOT TARGET dlib)
${CUDA_CUBLAS_LIBRARIES}
${cudnn}
${CUDA_curand_LIBRARY}
${cusolver}
)
include_directories(${cudnn_include})
else()
......@@ -648,6 +659,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib PRIVATE ${dlib_needed_libraries})
if (OPENMP_FOUND)
# Enable OpenMP
target_compile_options(dlib PUBLIC ${OpenMP_CXX_FLAGS})
target_link_libraries(dlib PUBLIC ${OpenMP_CXX_FLAGS})
endif()
if (UNIX AND NOT DLIB_IN_PROJECT_BUILD)
if (DLIB_USE_CUDA)
cuda_add_library(dlib_shared SHARED ${source_files} )
......@@ -662,6 +678,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib_shared PRIVATE ${dlib_needed_libraries})
if (OPENMP_FOUND)
# Enable OpenMP
target_compile_options(dlib_shared PUBLIC ${OpenMP_CXX_FLAGS})
target_link_libraries(dlib_shared PUBLIC ${OpenMP_CXX_FLAGS})
endif()
endif()
endif () ##### end of if NOT DLIB_ISO_CPP_ONLY ##########################################################
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_
#ifdef DLIB_USE_CUDA
#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"
// ----------------------------------------------------------------------------------------
// Convert a cuSOLVER status code into a human readable message.  The original
// version only mapped two of the documented cusolverStatus_t values; this maps
// the remaining documented codes so error reports are more specific.  Any
// unrecognized code falls through to a generic message.
static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "An invalid value or parameter was passed to cuSolver.";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "The device does not support the requested cuSolver feature.";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "A cuSolver GPU kernel failed to execute.";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "An internal cuSolver operation failed.";
        default:
            return "A call to cuSolver failed";
    }
}
// Check the return value of a call to the cuSolver runtime for an error condition.
// On failure this throws dlib::cusolver_error carrying the failing expression,
// the file:line location, the numeric status code, and a readable description
// from cusolver_get_error_string().  The do{...}while(false) wrapper makes the
// macro behave like a single statement so it composes safely with if/else.
#define CHECK_CUSOLVER(call) \
do{ \
const cusolverStatus_t error = call; \
if (error != CUSOLVER_STATUS_SUCCESS) \
{ \
std::ostringstream sout; \
sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
throw dlib::cusolver_error(sout.str()); \
} \
}while(false)
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
// Holds one cuSolver handle per CUDA device, creating each handle lazily the
// first time that device is used.  Handles are destroyed when this object is
// destroyed.
class cusolver_context
{
public:
    // not copyable
    cusolver_context(const cusolver_context&) = delete;
    cusolver_context& operator=(const cusolver_context&) = delete;

    // Start with room for 16 devices; get_handle() grows the table as needed.
    cusolver_context() : handles(16) {}

    ~cusolver_context()
    {
        for (auto& handle : handles)
        {
            if (handle)
                cusolverDnDestroy(handle);
        }
    }

    // Return the cuSolver handle for the currently selected CUDA device,
    // creating it on first use.
    cusolverDnHandle_t get_handle (
    )
    {
        int device_id;
        CHECK_CUDA(cudaGetDevice(&device_id));
        // grow the table if this device index is beyond what we have room for
        if ((long)handles.size() <= device_id)
            handles.resize(device_id+16);
        // lazily create the handle for this device
        if (!handles[device_id])
            CHECK_CUSOLVER(cusolverDnCreate(&handles[device_id]));
        return handles[device_id];
    }

private:
    std::vector<cusolverDnHandle_t> handles;
};
// Returns the cuSolver handle for the currently selected CUDA device.  The
// cusolver_context is thread_local, so each thread owns its own handles and
// no locking is needed around cuSolver calls.
static cusolverDnHandle_t context()
{
thread_local cusolver_context c;
return c.get_handle();
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// Device kernel: writes the nr x nr identity matrix into the nr*nr contiguous
// floats pointed to by m.  A flat index lies on the diagonal exactly when it
// is a multiple of nr+1.  Uses a grid-stride loop, so any launch
// configuration covers all nr*nr elements.
__global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
{
    for (auto idx : grid_stride_range(0, nr*nr))
        m[idx] = (idx%(nr+1) == 0) ? 1 : 0;
}
// Fill the tensor m with the identity matrix.
// requires
//     - m.size() == m.num_samples()*m.num_samples()
//       (i.e. m must hold a square num_samples() x num_samples() matrix)
void set_to_identity_matrix (
tensor& m
)
{
DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
}
// ------------------------------------------------------------------------------------
// Destructor: wait for any GPU work this object launched to finish before
// its GPU buffers are released along with the object.
inv::~inv()
{
sync_if_needed();
}
// ------------------------------------------------------------------------------------
// Compute out = inv(mat(m_)) on the GPU by LU factorizing a copy of m_ with
// cuSolver (getrf) and then solving A*X = I (getrs).  GPU scratch buffers are
// cached across calls to avoid repeated allocation.
void inv::
operator() (
const tensor& m_,
resizable_tensor& out
)
{
DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
// Work on a copy, since the LU factorization overwrites its input.
m = m_;
// The inverse is obtained by solving against the identity matrix.
out.copy_size(m);
set_to_identity_matrix(out);
const int nc = m.num_samples();
// Ask cuSolver how much workspace the factorization needs for this size.
int Lwork;
CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork));
// Grow the cached workspace if needed.  Synchronize first so we never free
// memory that previously launched asynchronous work might still be using.
if (Lwork > (int)workspace.size())
{
sync_if_needed();
workspace = cuda_data_ptr<float>(Lwork);
}
// Same for the pivot index buffer.
if (nc > (int)Ipiv.size())
{
sync_if_needed();
Ipiv = cuda_data_ptr<int>(nc);
}
// Single int status flag written by cuSolver; read back by get_last_status().
if (info.size() != 1)
{
info = cuda_data_ptr<int>(1);
}
// NOTE(review): cuSolver assumes column-major storage while dlib tensors are
// row-major, so these calls effectively factor trans(m).  Since
// inv(trans(A)) == trans(inv(A)), reading the column-major result back as
// row-major still yields inv(m) — confirm this reasoning for the CUBLAS_OP_N
// setting used here.
CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));
// Record that asynchronous GPU work is now in flight.
did_work_lately = true;
}
// ------------------------------------------------------------------------------------
// Copy the cuSolver info flag back from the GPU and return it.  Returns 0 if
// the last inversion succeeded (or if no inversion has been run yet, in which
// case the info buffer is empty), and a nonzero cuSolver info value otherwise
// (e.g. for a singular matrix).
int inv::
get_last_status(
)
{
std::vector<int> linfo;
// dlib's memcpy overload sizes the host vector and copies the device buffer
// into it; this call blocks until the copy (and prior work) completes.
memcpy(linfo, info);
if (linfo.size() != 0)
return linfo[0];
else
return 0;
}
// ------------------------------------------------------------------------------------
// Block until any asynchronous GPU work launched by this object has finished,
// but only if some work has actually been launched since the last sync.
void inv::
sync_if_needed()
{
if (did_work_lately)
{
did_work_lately = false;
// make sure we wait until any previous kernel launches have finished
// before we do something like deallocate the GPU memory.
// NOTE(review): the cudaDeviceSynchronize() return value is ignored here;
// this runs on the destructor path where throwing would be unsafe.
cudaDeviceSynchronize();
}
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_CU_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_H_
#define DLIB_DNN_CuSOLVER_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
#include "../noncopyable.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class inv : noncopyable
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a functor for doing matrix inversion on the GPU. The only
reason it's an object is to avoid the reallocation of some GPU memory
blocks if you want to do a bunch of matrix inversions in a row.
!*/
public:
inv() = default;
~inv();
void operator() (
const tensor& m,
resizable_tensor& out
);
/*!
requires
- m.size() == m.num_samples()*m.num_samples()
(i.e. mat(m) must be a square matrix)
ensures
- out == inv(mat(m));
!*/
int get_last_status(
);
/*!
ensures
- returns 0 if the last matrix inversion was successful and != 0
otherwise.
!*/
private:
// Blocks until outstanding GPU work finishes; called before buffers are
// freed or resized and from the destructor.
void sync_if_needed();
// True while asynchronously launched GPU work may still be in flight.
bool did_work_lately = false;
// Scratch copy of the input matrix; the LU factorization overwrites it.
resizable_tensor m;
// cuSolver workspace buffer, grown on demand and reused across calls.
cuda_data_ptr<float> workspace;
// Pivot indices produced by the LU factorization.
cuda_data_ptr<int> Ipiv;
// Single int status flag written by cuSolver (0 means success).
cuda_data_ptr<int> info;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_H_
......@@ -792,20 +792,35 @@ namespace dlib { namespace tt
// ------------------------------------------------------------------------------------
// Copy count_k channels from src (starting at channel src_k_offset) into dest
// (starting at channel dest_k_offset).  Dispatches to the CUDA implementation
// when built with DLIB_USE_CUDA and to the CPU implementation otherwise.
// (The diff-view of this span had the old and new copies of the function
// interleaved; this is the single merged definition.)
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
)
{
#ifdef DLIB_USE_CUDA
    cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#else
    cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#endif
}
// ----------------------------------------------------------------------------------------
// Compute out = inv(mat(m)).  Uses the cached cuda::inv functor when built
// with CUDA support, otherwise falls back to dlib's CPU matrix inverse.
void inv::
operator() (
const tensor& m,
resizable_tensor& out
)
{
#ifdef DLIB_USE_CUDA
finv(m,out);
#else
out = dlib::inv(m);
#endif
}
// ----------------------------------------------------------------------------------------
......
......@@ -6,6 +6,7 @@
#include "tensor.h"
#include "cudnn_dlibapi.h"
#include "cublas_dlibapi.h"
#include "cusolver_dlibapi.h"
#include "curand_dlibapi.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
......@@ -123,6 +124,36 @@ namespace dlib { namespace tt
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
// ----------------------------------------------------------------------------------------
class inv
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a functor for doing matrix inversion on the GPU. The only
reason it's an object is to avoid the reallocation of some GPU memory
blocks if you want to do a bunch of matrix inversions in a row.
!*/
public:
void operator() (
const tensor& m,
resizable_tensor& out
);
/*!
requires
- m.size() == m.num_samples()*m.num_samples()
(i.e. mat(m) must be a square matrix)
ensures
- out == inv(mat(m));
!*/
private:
#ifdef DLIB_USE_CUDA
// Underlying CUDA implementation; owns the reusable GPU scratch buffers.
cuda::inv finv;
#endif
};
// ----------------------------------------------------------------------------------------
class tensor_rand
......
......@@ -8,7 +8,7 @@
#include <cstdlib>
#include <ctime>
#include <vector>
#include "../dnn/cublas_dlibapi.h"
#include "../dnn/tensor_tools.h"
#include "tester.h"
......@@ -25,6 +25,26 @@ namespace
logger dlog("test.cublas");
void test_inv()
{
tt::tensor_rand rnd;
dlib::tt::inv tinv;
dlib::cuda::inv cinv;
resizable_tensor minv1, minv2;
for (int n = 1; n < 20; ++n)
{
print_spinner();
resizable_tensor m(n,n);
rnd.fill_uniform(m);
tinv(m, minv1);
cinv(m, minv2);
matrix<float> mref = inv(mat(m));
DLIB_TEST_MSG(mean(abs(mref-mat(minv1)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv1)))/mean(abs(mref)) <<" n: " << n);
DLIB_TEST_MSG(mean(abs(mref-mat(minv2)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv2)))/mean(abs(mref)) <<" n: " << n);
}
}
class cublas_tester : public tester
{
......@@ -38,6 +58,7 @@ namespace
void perform_test (
)
{
test_inv();
{
resizable_tensor a(4,3), b(3,4), c(3,3);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment