Commit 373cbb57 authored by Davis King

Added a tt::inv() for computing matrix inversions on the GPU.

parent b3dc8169
......@@ -568,14 +568,24 @@ if (NOT TARGET dlib)
endif()
endif()
endif()
endif()
# Find where cuSOLVER is since the FindCUDA cmake package doesn't
# bother to look for it.
get_filename_component(cuda_blas_path ${CUDA_CUBLAS_LIBRARIES} DIRECTORY)
find_library(cusolver cusolver HINTS ${cuda_blas_path})
# Also find OpenMP since cuSOLVER needs it.
find_package(OpenMP)
if (NOT OPENMP_FOUND)
message(STATUS "*** Didn't find OpenMP, which is required to use CUDA. ***")
endif()
endif()
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked)
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked AND OPENMP_FOUND)
set(source_files ${source_files}
dnn/cuda_dlib.cu
dnn/cudnn_dlibapi.cpp
dnn/cublas_dlibapi.cpp
dnn/cusolver_dlibapi.cu
dnn/curand_dlibapi.cpp
dnn/cuda_data_ptr.cpp
dnn/gpu_data.cpp
......@@ -584,6 +594,7 @@ if (NOT TARGET dlib)
${CUDA_CUBLAS_LIBRARIES}
${cudnn}
${CUDA_curand_LIBRARY}
${cusolver}
)
include_directories(${cudnn_include})
else()
......@@ -648,6 +659,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib PRIVATE ${dlib_needed_libraries})
if (OPENMP_FOUND)
# Enable OpenMP
target_compile_options(dlib PUBLIC ${OpenMP_CXX_FLAGS})
target_link_libraries(dlib PUBLIC ${OpenMP_CXX_FLAGS})
endif()
if (UNIX AND NOT DLIB_IN_PROJECT_BUILD)
if (DLIB_USE_CUDA)
cuda_add_library(dlib_shared SHARED ${source_files} )
......@@ -662,6 +678,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib_shared PRIVATE ${dlib_needed_libraries})
if (OPENMP_FOUND)
# Enable OpenMP
target_compile_options(dlib_shared PUBLIC ${OpenMP_CXX_FLAGS})
target_link_libraries(dlib_shared PUBLIC ${OpenMP_CXX_FLAGS})
endif()
endif()
endif () ##### end of if NOT DLIB_ISO_CPP_ONLY ##########################################################
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_
#ifdef DLIB_USE_CUDA
#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"
// ----------------------------------------------------------------------------------------
// Convert a cuSOLVER status code into a human readable message.  The original
// version only mapped two of the documented cusolverStatus_t values; this maps
// the remaining documented codes so error reports are more specific.  Any
// unrecognized code falls through to a generic message.
static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    switch(s)
    {
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "An invalid value or parameter was passed to cuSolver.";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "The device does not support the requested cuSolver feature.";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "A cuSolver GPU kernel failed to execute.";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "An internal cuSolver operation failed.";
        default:
            return "A call to cuSolver failed";
    }
}
// Check the return value of a call to the cuSolver runtime for an error condition.
// On failure this throws dlib::cusolver_error carrying the failing expression,
// the file:line location, the numeric status code, and a readable description
// from cusolver_get_error_string().  The do{...}while(false) wrapper makes the
// macro behave like a single statement so it composes safely with if/else.
#define CHECK_CUSOLVER(call) \
do{ \
const cusolverStatus_t error = call; \
if (error != CUSOLVER_STATUS_SUCCESS) \
{ \
std::ostringstream sout; \
sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
throw dlib::cusolver_error(sout.str()); \
} \
}while(false)
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
// Holds one cuSolver handle per CUDA device, creating each handle lazily the
// first time that device is used.  Handles are destroyed when this object is
// destroyed.
class cusolver_context
{
public:
    // not copyable
    cusolver_context(const cusolver_context&) = delete;
    cusolver_context& operator=(const cusolver_context&) = delete;

    // Start with room for 16 devices; get_handle() grows the table as needed.
    cusolver_context() : handles(16) {}

    ~cusolver_context()
    {
        for (auto& handle : handles)
        {
            if (handle)
                cusolverDnDestroy(handle);
        }
    }

    // Return the cuSolver handle for the currently selected CUDA device,
    // creating it on first use.
    cusolverDnHandle_t get_handle (
    )
    {
        int device_id;
        CHECK_CUDA(cudaGetDevice(&device_id));
        // grow the table if this device index is beyond what we have room for
        if ((long)handles.size() <= device_id)
            handles.resize(device_id+16);
        // lazily create the handle for this device
        if (!handles[device_id])
            CHECK_CUSOLVER(cusolverDnCreate(&handles[device_id]));
        return handles[device_id];
    }

private:
    std::vector<cusolverDnHandle_t> handles;
};
// Returns the cuSolver handle for the currently selected CUDA device.  The
// cusolver_context is thread_local, so each thread owns its own handles and
// no locking is needed around cuSolver calls.
static cusolverDnHandle_t context()
{
thread_local cusolver_context c;
return c.get_handle();
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// Device kernel: writes the nr x nr identity matrix into the nr*nr contiguous
// floats pointed to by m.  A flat index lies on the diagonal exactly when it
// is a multiple of nr+1.  Uses a grid-stride loop, so any launch
// configuration covers all nr*nr elements.
__global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
{
    for (auto idx : grid_stride_range(0, nr*nr))
        m[idx] = (idx%(nr+1) == 0) ? 1 : 0;
}
// Fill the tensor m with the identity matrix.
// requires
//     - m.size() == m.num_samples()*m.num_samples()
//       (i.e. m must hold a square num_samples() x num_samples() matrix)
void set_to_identity_matrix (
tensor& m
)
{
DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
}
// ------------------------------------------------------------------------------------
// Destructor: wait for any GPU work this object launched to finish before
// its GPU buffers are released along with the object.
inv::~inv()
{
sync_if_needed();
}
// ------------------------------------------------------------------------------------
// Compute out = inv(mat(m_)) on the GPU by LU factorizing a copy of m_ with
// cuSolver (getrf) and then solving A*X = I (getrs).  GPU scratch buffers are
// cached across calls to avoid repeated allocation.
void inv::
operator() (
const tensor& m_,
resizable_tensor& out
)
{
DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");
// Work on a copy, since the LU factorization overwrites its input.
m = m_;
// The inverse is obtained by solving against the identity matrix.
out.copy_size(m);
set_to_identity_matrix(out);
const int nc = m.num_samples();
// Ask cuSolver how much workspace the factorization needs for this size.
int Lwork;
CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork));
// Grow the cached workspace if needed.  Synchronize first so we never free
// memory that previously launched asynchronous work might still be using.
if (Lwork > (int)workspace.size())
{
sync_if_needed();
workspace = cuda_data_ptr<float>(Lwork);
}
// Same for the pivot index buffer.
if (nc > (int)Ipiv.size())
{
sync_if_needed();
Ipiv = cuda_data_ptr<int>(nc);
}
// Single int status flag written by cuSolver; read back by get_last_status().
if (info.size() != 1)
{
info = cuda_data_ptr<int>(1);
}
// NOTE(review): cuSolver assumes column-major storage while dlib tensors are
// row-major, so these calls effectively factor trans(m).  Since
// inv(trans(A)) == trans(inv(A)), reading the column-major result back as
// row-major still yields inv(m) — confirm this reasoning for the CUBLAS_OP_N
// setting used here.
CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));
// Record that asynchronous GPU work is now in flight.
did_work_lately = true;
}
// ------------------------------------------------------------------------------------
// Copy the cuSolver info flag back from the GPU and return it.  Returns 0 if
// the last inversion succeeded (or if no inversion has been run yet, in which
// case the info buffer is empty), and a nonzero cuSolver info value otherwise
// (e.g. for a singular matrix).
int inv::
get_last_status(
)
{
std::vector<int> linfo;
// dlib's memcpy overload sizes the host vector and copies the device buffer
// into it; this call blocks until the copy (and prior work) completes.
memcpy(linfo, info);
if (linfo.size() != 0)
return linfo[0];
else
return 0;
}
// ------------------------------------------------------------------------------------
// Block until any asynchronous GPU work launched by this object has finished,
// but only if some work has actually been launched since the last sync.
void inv::
sync_if_needed()
{
if (did_work_lately)
{
did_work_lately = false;
// make sure we wait until any previous kernel launches have finished
// before we do something like deallocate the GPU memory.
// NOTE(review): the cudaDeviceSynchronize() return value is ignored here;
// this runs on the destructor path where throwing would be unsafe.
cudaDeviceSynchronize();
}
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_CU_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_H_
#define DLIB_DNN_CuSOLVER_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
#include "../noncopyable.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class inv : noncopyable
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a functor for doing matrix inversion on the GPU. The only
reason it's an object is to avoid the reallocation of some GPU memory
blocks if you want to do a bunch of matrix inversions in a row.
!*/
public:
inv() = default;
~inv();
void operator() (
const tensor& m,
resizable_tensor& out
);
/*!
requires
- m.size() == m.num_samples()*m.num_samples()
(i.e. mat(m) must be a square matrix)
ensures
- out == inv(mat(m));
!*/
int get_last_status(
);
/*!
ensures
- returns 0 if the last matrix inversion was successful and != 0
otherwise.
!*/
private:
// Blocks until outstanding GPU work finishes; called before buffers are
// freed or resized and from the destructor.
void sync_if_needed();
// True while asynchronously launched GPU work may still be in flight.
bool did_work_lately = false;
// Scratch copy of the input matrix; the LU factorization overwrites it.
resizable_tensor m;
// cuSolver workspace buffer, grown on demand and reused across calls.
cuda_data_ptr<float> workspace;
// Pivot indices produced by the LU factorization.
cuda_data_ptr<int> Ipiv;
// Single int status flag written by cuSolver (0 means success).
cuda_data_ptr<int> info;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_H_
......@@ -792,20 +792,35 @@ namespace dlib { namespace tt
// ------------------------------------------------------------------------------------
// Copy count_k channels from src (starting at channel src_k_offset) into dest
// (starting at channel dest_k_offset).  Dispatches to the CUDA implementation
// when built with DLIB_USE_CUDA and to the CPU implementation otherwise.
// (The diff-view of this span had the old and new copies of the function
// interleaved; this is the single merged definition.)
void copy_tensor(
    tensor& dest,
    size_t dest_k_offset,
    const tensor& src,
    size_t src_k_offset,
    size_t count_k
)
{
#ifdef DLIB_USE_CUDA
    cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#else
    cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
#endif
}
// ----------------------------------------------------------------------------------------
// Compute out = inv(mat(m)).  Uses the cached cuda::inv functor when built
// with CUDA support, otherwise falls back to dlib's CPU matrix inverse.
void inv::
operator() (
const tensor& m,
resizable_tensor& out
)
{
#ifdef DLIB_USE_CUDA
finv(m,out);
#else
out = dlib::inv(m);
#endif
}
// ----------------------------------------------------------------------------------------
......
......@@ -6,6 +6,7 @@
#include "tensor.h"
#include "cudnn_dlibapi.h"
#include "cublas_dlibapi.h"
#include "cusolver_dlibapi.h"
#include "curand_dlibapi.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
......@@ -123,6 +124,36 @@ namespace dlib { namespace tt
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
// ----------------------------------------------------------------------------------------
class inv
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a functor for doing matrix inversion on the GPU. The only
reason it's an object is to avoid the reallocation of some GPU memory
blocks if you want to do a bunch of matrix inversions in a row.
!*/
public:
void operator() (
const tensor& m,
resizable_tensor& out
);
/*!
requires
- m.size() == m.num_samples()*m.num_samples()
(i.e. mat(m) must be a square matrix)
ensures
- out == inv(mat(m));
!*/
private:
#ifdef DLIB_USE_CUDA
// Underlying CUDA implementation; owns the reusable GPU scratch buffers.
cuda::inv finv;
#endif
};
// ----------------------------------------------------------------------------------------
class tensor_rand
......
......@@ -8,7 +8,7 @@
#include <cstdlib>
#include <ctime>
#include <vector>
#include "../dnn/cublas_dlibapi.h"
#include "../dnn/tensor_tools.h"
#include "tester.h"
......@@ -25,6 +25,26 @@ namespace
logger dlog("test.cublas");
void test_inv()
{
tt::tensor_rand rnd;
dlib::tt::inv tinv;
dlib::cuda::inv cinv;
resizable_tensor minv1, minv2;
for (int n = 1; n < 20; ++n)
{
print_spinner();
resizable_tensor m(n,n);
rnd.fill_uniform(m);
tinv(m, minv1);
cinv(m, minv2);
matrix<float> mref = inv(mat(m));
DLIB_TEST_MSG(mean(abs(mref-mat(minv1)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv1)))/mean(abs(mref)) <<" n: " << n);
DLIB_TEST_MSG(mean(abs(mref-mat(minv2)))/mean(abs(mref)) < 1e-5, mean(abs(mref-mat(minv2)))/mean(abs(mref)) <<" n: " << n);
}
}
class cublas_tester : public tester
{
......@@ -38,6 +58,7 @@ namespace
void perform_test (
)
{
test_inv();
{
resizable_tensor a(4,3), b(3,4), c(3,3);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment