Commit 3ba21975 authored Oct 18, 2015 by Davis King
Moved gpu_data into its own file, fixed a few bugs, and cleaned up
a few things.
parent 0508fe2b
Showing 9 changed files with 482 additions and 389 deletions (+482 / -389):
    dlib/CMakeLists.txt          +1    -0
    dlib/dnn/cublas_dlibapi.h    +13   -2
    dlib/dnn/cuda_errors.h       +7    -12
    dlib/dnn/cuda_utils.h        +72   -62
    dlib/dnn/cudnn_dlibapi.cpp   +52   -156
    dlib/dnn/cudnn_dlibapi.h     +26   -4
    dlib/dnn/gpu_data.cpp        +142  -0
    dlib/dnn/gpu_data.h          +168  -0
    dlib/dnn/tensor.h            +1    -153
dlib/CMakeLists.txt

@@ -441,6 +441,7 @@ if (NOT TARGET dlib)
                dnn/cuda_dlib.cu
                dnn/cudnn_dlibapi.cpp
                dnn/cublas_dlibapi.cpp
+               dnn/gpu_data.cpp
                )
             set(dlib_needed_libraries ${dlib_needed_libraries} ${CUDA_CUBLAS_LIBRARIES} ${cudnn})
             include_directories(${cudnn_include})
dlib/dnn/cublas_dlibapi.h

@@ -29,8 +29,19 @@ namespace dlib
         cublas_context(const cublas_context&) = delete;
         cublas_context& operator=(const cublas_context&) = delete;
         // but is movable
-        cublas_context(cublas_context&&) = default;
-        cublas_context& operator=(cublas_context&&) = default;
+        cublas_context(cublas_context&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        cublas_context& operator=(cublas_context&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         cublas_context();
         ~cublas_context();
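[Editor's note: this looks like one of the "fixed a few bugs" items from the commit message. A defaulted move of a class holding a raw cuBLAS handle only copies the pointer, so the moved-from object's destructor would destroy the same handle a second time. A minimal self-contained sketch of the hazard and of the pattern adopted here; destroy_handle() is a hypothetical stand-in for cublasDestroy(), not dlib code:]

    #include <cstdio>
    #include <utility>

    static int destroy_count = 0;
    static void destroy_handle(int* h) { ++destroy_count; delete h; }  // stand-in for cublasDestroy()

    struct bad_context
    {
        int* handle = new int(42);
        bad_context() = default;
        bad_context(bad_context&&) = default;   // only copies the pointer; the moved-from
                                                // object keeps it, so the destructor would
                                                // run destroy_handle() on it a second time
        ~bad_context() { if (handle) destroy_handle(handle); }
    };

    struct good_context                         // the pattern this commit adopts
    {
        int* handle = new int(42);
        good_context() = default;
        good_context(good_context&& item) : handle(item.handle)
        {
            item.handle = nullptr;              // the source gives up ownership
        }
        ~good_context() { if (handle) destroy_handle(handle); }
    };

    int main()
    {
        {
            good_context a;
            good_context b(std::move(a));       // only b destroys the handle
        }
        std::printf("destroy_handle() calls: %d\n", destroy_count);  // prints 1
    }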
dlib/dnn/cuda_errors.h

@@ -3,27 +3,22 @@
 #ifndef DLIB_CUDA_ERRORs_H_
 #define DLIB_CUDA_ERRORs_H_

-#ifdef DLIB_USE_CUDA
-
 #include "../error.h"

 namespace dlib
 {
-    namespace cuda
+    struct cuda_error : public error
     {
-        struct cuda_error : public error
-        {
-            cuda_error(const std::string& message): error(message) {}
-        };
+        cuda_error(const std::string& message): error(message) {}
+    };

-        struct cudnn_error : public cuda_error
-        {
-            cudnn_error(const std::string& message): cuda_error(message) {}
-        };
-    }
+    struct cudnn_error : public cuda_error
+    {
+        cudnn_error(const std::string& message): cuda_error(message) {}
+    };
 }

-#endif // DLIB_USE_CUDA
 #endif // DLIB_CUDA_ERRORs_H_
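[Editor's note: the exception types move out of the inner cuda namespace, so callers now catch dlib::cuda_error rather than dlib::cuda::cuda_error; since cudnn_error still derives from cuda_error, one catch of the base type covers both. A self-contained sketch of the hierarchy, with std::runtime_error standing in for dlib::error so it compiles without dlib:]

    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Mirror of the hierarchy in cuda_errors.h (dlib::error replaced by
    // std::runtime_error so the sketch is standalone).
    struct cuda_error : public std::runtime_error
    {
        cuda_error(const std::string& message) : std::runtime_error(message) {}
    };

    struct cudnn_error : public cuda_error
    {
        cudnn_error(const std::string& message) : cuda_error(message) {}
    };

    int main()
    {
        try
        {
            throw cudnn_error("CUDNN_STATUS_BAD_PARAM");
        }
        catch (const cuda_error& e)   // a cudnn_error is caught as its base type
        {
            std::cout << e.what() << std::endl;
        }
    }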
dlib/dnn/cuda_utils.h

@@ -3,10 +3,13 @@
 #ifndef DLIB_CUDA_UtILS_H_
 #define DLIB_CUDA_UtILS_H_

+#ifndef DLIB_USE_CUDA
+#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
+#endif
+
 #include "cuda_errors.h"

-#include <cuda.h>
+#include <cuda_runtime.h>
 #include <sstream>

@@ -19,7 +22,7 @@
             std::ostringstream sout; \
             sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
             sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
-            throw dlib::cuda::cuda_error(sout.str()); \
+            throw dlib::cuda_error(sout.str()); \
        } \
    }

@@ -27,75 +30,82 @@
#ifdef __CUDACC__

[The grid_stride_range class is unchanged in substance; this hunk wraps it in
namespace dlib { namespace cuda { ... } } and reflows its spec comment. The new
version reads:]

namespace dlib
{
    namespace cuda
    {
        class grid_stride_range
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
                    This is a tool for making a for loop that loops over an entire block of
                    memory inside a kernel, but doing so in a way that parallelizes
                    appropriately across all the threads in a kernel launch.  For example,
                    the following kernel would add the vector a to the vector b and store
                    the output in out (assuming all vectors are of dimension n):
                        __global__ void add_arrays(
                            const float* a,
                            const float* b,
                            float* out,
                            size_t n
                        )
                        {
                            for (auto i : grid_stride_range(0, n))
                            {
                                out[i] = a[i]+b[i];
                            }
                        }
            !*/
        public:
            __device__ grid_stride_range(
                size_t ibegin_,
                size_t iend_
            ) :
                ibegin(ibegin_),
                iend(iend_)
            {}

            class iterator
            {
            public:
                __device__ iterator() {}
                __device__ iterator(size_t pos_) : pos(pos_) {}

                __device__ size_t operator*() const
                {
                    return pos;
                }

                __device__ iterator& operator++()
                {
                    pos += gridDim.x * blockDim.x;
                    return *this;
                }

                __device__ bool operator!=(const iterator& item) const
                { return pos < item.pos; }

            private:
                size_t pos;
            };

            __device__ iterator begin() const
            {
                return iterator(ibegin + blockDim.x*blockIdx.x + threadIdx.x);
            }
            __device__ iterator end() const
            {
                return iterator(iend);
            }

        private:
            size_t ibegin;
            size_t iend;
        };
    }
}

#endif // __CUDACC__
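[Editor's note: for reference, a sketch of how a kernel written with grid_stride_range is launched, following the add_arrays example from the spec comment. Everything outside the kernel body is an assumption made for the sketch (the include path, compiling with nvcc -DDLIB_USE_CUDA, the use of managed memory, and the <<<128, 256>>> launch shape); the commit itself prescribes none of it:]

    // example.cu -- build with: nvcc -DDLIB_USE_CUDA -I<dlib repo root> example.cu
    #include <cstdio>
    #include <cuda_runtime.h>
    #include "dlib/dnn/cuda_utils.h"   // for grid_stride_range and CHECK_CUDA

    __global__ void add_arrays(const float* a, const float* b, float* out, size_t n)
    {
        // Each thread strides through the index range, so any grid size covers all n.
        for (auto i : dlib::cuda::grid_stride_range(0, n))
            out[i] = a[i] + b[i];
    }

    int main()
    {
        const size_t n = 1 << 20;
        float *a, *b, *out;
        CHECK_CUDA(cudaMallocManaged(&a, n*sizeof(float)));
        CHECK_CUDA(cudaMallocManaged(&b, n*sizeof(float)));
        CHECK_CUDA(cudaMallocManaged(&out, n*sizeof(float)));
        for (size_t i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; }

        add_arrays<<<128, 256>>>(a, b, out, n);   // launch shape is arbitrary here
        CHECK_CUDA(cudaDeviceSynchronize());

        std::printf("out[0] = %f\n", out[0]);     // prints 3.000000
        CHECK_CUDA(cudaFree(a));
        CHECK_CUDA(cudaFree(b));
        CHECK_CUDA(cudaFree(out));
    }

Because each thread strides by gridDim.x*blockDim.x, any launch shape covers all n elements; the shape only affects performance.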
dlib/dnn/cudnn_dlibapi.cpp

@@ -15,133 +15,6 @@
 namespace dlib
 {

-    [127 deleted lines: the implementations of gpu_data::wait_for_transfer_to_finish(),
-     copy_to_device(), copy_to_host(), async_copy_to_device(), and set_size() were
-     removed from this file.  They moved verbatim to the new dlib/dnn/gpu_data.cpp,
-     reproduced in full below.]

 // ----------------------------------------------------------------------------------------

     namespace cuda
     {

@@ -155,6 +28,8 @@ namespace dlib
                     throw cudnn_error("CUDA Runtime API initialization failed.");
                 case CUDNN_STATUS_ALLOC_FAILED:
                     throw cudnn_error("CUDA Resources could not be allocated.");
+                case CUDNN_STATUS_BAD_PARAM:
+                    throw cudnn_error("CUDNN_STATUS_BAD_PARAM");
                 default:
                     throw cudnn_error("A call to cuDNN failed.");
             }

@@ -180,20 +55,16 @@ namespace dlib
     // ------------------------------------------------------------------------------------

         tensor_descriptor::tensor_descriptor(
         ) : handle(nullptr)
         {
-            cudnnTensorDescriptor_t h;
-            check(cudnnCreateTensorDescriptor(&h));
-            handle = h;
         }

         tensor_descriptor::~tensor_descriptor()
         {
-            if (handle)
-            {
-                cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
-                handle = nullptr;
-            }
+            set_size(0,0,0,0);
         }

         void tensor_descriptor::

@@ -204,13 +75,28 @@ namespace dlib
             int k
         )
         {
-            check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
-                    CUDNN_TENSOR_NHWC,
-                    CUDNN_DATA_FLOAT,
-                    n,
-                    k,
-                    nr,
-                    nc));
+            if (n == 0 || nr == 0 || nc == 0 || k == 0)
+            {
+                if (handle)
+                {
+                    cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
+                    handle = nullptr;
+                }
+            }
+            else
+            {
+                cudnnTensorDescriptor_t h;
+                check(cudnnCreateTensorDescriptor(&h));
+                handle = h;
+
+                check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
+                        CUDNN_TENSOR_NHWC,
+                        CUDNN_DATA_FLOAT,
+                        n,
+                        k,
+                        nr,
+                        nc));
+            }
         }

         void tensor_descriptor::

@@ -221,18 +107,28 @@ namespace dlib
             int& k
         ) const
         {
-            int nStride, cStride, hStride, wStride;
-            cudnnDataType_t datatype;
-            check(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
-                    &datatype,
-                    &n,
-                    &k,
-                    &nr,
-                    &nc,
-                    &nStride,
-                    &cStride,
-                    &hStride,
-                    &wStride));
+            if (handle)
+            {
+                int nStride, cStride, hStride, wStride;
+                cudnnDataType_t datatype;
+                check(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
+                        &datatype,
+                        &n,
+                        &k,
+                        &nr,
+                        &nc,
+                        &nStride,
+                        &cStride,
+                        &hStride,
+                        &wStride));
+            }
+            else
+            {
+                n = 0;
+                nr = 0;
+                nc = 0;
+                k = 0;
+            }
         }

     // ------------------------------------------------------------------------------------
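[Editor's note: the tensor_descriptor changes make construction free and tie the cuDNN handle's lifetime to set_size(): zero dimensions destroy the handle, nonzero dimensions create one, and reading the size of an empty descriptor reports zeros. A self-contained mirror of that pattern, where new/delete stands in for cudnnCreateTensorDescriptor/cudnnDestroyTensorDescriptor; unlike the diff above, the sketch also frees any previous handle before re-creating it:]

    #include <cstdio>

    class descriptor
    {
    public:
        descriptor() : handle(nullptr) {}    // no allocation up front
        ~descriptor() { set_size(0); }       // destructor reuses set_size()

        void set_size(int n)
        {
            if (n == 0)                      // zero size destroys the handle
            {
                delete handle;
                handle = nullptr;
            }
            else                             // nonzero size (re)creates it
            {
                delete handle;               // sketch-only: free the old handle first
                handle = new int(n);         // stands in for cudnnCreateTensorDescriptor
            }
        }

        int get_size() const { return handle ? *handle : 0; }  // empty reports 0

    private:
        int* handle;
    };

    int main()
    {
        descriptor d;                        // construction allocates nothing
        std::printf("%d\n", d.get_size());   // prints 0
        d.set_size(5);
        std::printf("%d\n", d.get_size());   // prints 5
    }                                        // ~descriptor() destroys the handle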
dlib/dnn/cudnn_dlibapi.h

@@ -25,8 +25,19 @@ namespace dlib
         cudnn_context(const cudnn_context&) = delete;
         cudnn_context& operator=(const cudnn_context&) = delete;
         // but is movable
-        cudnn_context(cudnn_context&&) = default;
-        cudnn_context& operator=(cudnn_context&&) = default;
+        cudnn_context(cudnn_context&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        cudnn_context& operator=(cudnn_context&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         cudnn_context();
         ~cudnn_context();

@@ -53,8 +64,19 @@ namespace dlib
         tensor_descriptor(const tensor_descriptor&) = delete;
         tensor_descriptor& operator=(const tensor_descriptor&) = delete;
         // but is movable
-        tensor_descriptor(tensor_descriptor&&) = default;
-        tensor_descriptor& operator=(tensor_descriptor&&) = default;
+        tensor_descriptor(tensor_descriptor&& item)
+        {
+            handle = item.handle;
+            item.handle = nullptr;
+        }
+        tensor_descriptor& operator=(tensor_descriptor&& item)
+        {
+            if (this == &item)
+                return *this;
+            handle = item.handle;
+            item.handle = nullptr;
+            return *this;
+        }

         tensor_descriptor();
         ~tensor_descriptor();
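[Editor's note: these are the same hand-written moves as in cublas_dlibapi.h above; the extra detail worth noting is the `this == &item` guard in the move assignments. Without it, `x = std::move(x)` would copy the handle onto itself and then null it, losing the resource. A minimal sketch; it additionally frees the destination's old handle, which the classes above leave to their destructors:]

    #include <cstdio>
    #include <utility>

    struct ctx
    {
        int* handle = new int(7);            // stands in for a cuDNN/cuBLAS handle
        ctx() = default;
        ctx(const ctx&) = delete;
        ctx& operator=(const ctx&) = delete;
        ctx& operator=(ctx&& item)
        {
            if (this == &item)               // the guard: self-move is a no-op
                return *this;
            delete handle;                   // sketch-only: free our old handle
            handle = item.handle;
            item.handle = nullptr;
            return *this;
        }
        ~ctx() { delete handle; }
    };

    int main()
    {
        ctx a;
        a = std::move(a);                                 // safe thanks to the guard
        std::printf("%d\n", a.handle ? *a.handle : -1);   // prints 7
    }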
dlib/dnn/gpu_data.cpp (new file, mode 100644)

// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_CPP_
#define DLIB_GPU_DaTA_CPP_

// Only things that require CUDA are declared in this cpp file.  Everything else is in the
// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
#ifdef DLIB_USE_CUDA

#include "gpu_data.h"
#include <iostream>
#include "cuda_utils.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    void gpu_data::
    wait_for_transfer_to_finish() const
    {
        if (have_active_transfer)
        {
            std::cout << "wait for cudaStreamSynchronize()" << std::endl;
            CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
            have_active_transfer = false;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }

    void gpu_data::
    copy_to_device() const
    {
        wait_for_transfer_to_finish();
        if (!device_current)
        {
            std::cout << "cudaMemcpy to device" << std::endl;
            CHECK_CUDA(cudaMemcpy(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice));
            device_current = true;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }

    void gpu_data::
    copy_to_host() const
    {
        wait_for_transfer_to_finish();
        if (!host_current)
        {
            std::cout << "cudaMemcpy to host" << std::endl;
            CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
            host_current = true;
            // Check for errors.  These calls to cudaGetLastError() are what help us find
            // out if our kernel launches have been failing.
            CHECK_CUDA(cudaGetLastError());
        }
    }

    void gpu_data::
    async_copy_to_device()
    {
        if (!device_current)
        {
            std::cout << "cudaMemcpyAsync to device" << std::endl;
            CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
            have_active_transfer = true;
            device_current = true;
        }
    }

    void gpu_data::
    set_size(
        size_t new_size
    )
    {
        wait_for_transfer_to_finish();
        if (new_size == 0)
        {
            data_size = 0;
            host_current = true;
            device_current = true;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            data_size = new_size;
            host_current = true;
            device_current = true;

            try
            {
                void* data;
                CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                // Note that we don't throw exceptions since the free calls are invariably
                // called in destructors.  They also shouldn't fail anyway unless someone
                // is resetting the GPU card in the middle of their program.
                data_host.reset((float*)data, [](float* ptr){
                    auto err = cudaFreeHost(ptr);
                    if (err != cudaSuccess)
                        std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
                data_device.reset((float*)data, [](float* ptr){
                    auto err = cudaFree(ptr);
                    if (err != cudaSuccess)
                        std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                });

                if (!cuda_stream)
                {
                    cudaStream_t cstream;
                    CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
                    cuda_stream.reset(cstream, [](void* ptr){
                        auto err = cudaStreamDestroy((cudaStream_t)ptr);
                        if (err != cudaSuccess)
                            std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
                    });
                }
            }
            catch(...)
            {
                set_size(0);
                throw;
            }
        }
    }

// ----------------------------------------------------------------------------------------
}

#endif // DLIB_USE_CUDA
#endif // DLIB_GPU_DaTA_CPP_
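[Editor's note: taken together, these member functions implement a lazily synchronized host/device buffer: reads go through host()/device(), which first wait for any outstanding async transfer and copy only when their side is stale. A usage sketch, under the assumptions that dlib was built with DLIB_USE_CUDA (without it, device() asserts) and that the dlib tree is on the include path:]

    #include <cstddef>
    #include "dlib/dnn/gpu_data.h"   // assumption: dlib repo root is on the include path

    int main()
    {
        dlib::gpu_data d;
        d.set_size(1024);            // with CUDA: pinned host buffer + device buffer

        // host() synchronizes (copying device data back if needed); the non-const
        // overload also marks the device copy stale.
        float* h = d.host();
        for (std::size_t i = 0; i < d.size(); ++i)
            h[i] = float(i);

        d.async_copy_to_device();    // start the H->D copy on the non-blocking stream

        // device() waits for the outstanding transfer before handing out the
        // pointer, so a subsequent kernel launch sees the finished copy.
        const dlib::gpu_data& cd = d;
        const float* dev = cd.device();
        (void)dev;
    }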
dlib/dnn/gpu_data.h (new file, mode 100644)

// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_H_
#define DLIB_GPU_DaTA_H_

#include <memory>
#include "cuda_errors.h"
#include "../serialize.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class gpu_data
    {
        /*!
            CONVENTION
                - if (size() != 0) then
                    - data_host == a pointer to size() floats in CPU memory.
                    - if (data_device) then
                        - data_device == a pointer to size() floats in device memory.
                - if (there might be an active transfer between host and device) then
                    - have_active_transfer == true

                - We use the host_current and device_current bools to keep track of which
                  copy of the data (or both) are most current.  e.g. if the CPU has
                  modified the tensor and it hasn't been copied to the device yet then
                  host_current==true and device_current==false.

            THREAD SAFETY
                This object is not thread-safe.  Don't touch it from multiple threads at the
                same time.
        !*/
    public:
        gpu_data(
        ) : data_size(0), host_current(true), device_current(true), have_active_transfer(false)
        {
        }

        // Not copyable
        gpu_data(const gpu_data&) = delete;
        gpu_data& operator=(const gpu_data&) = delete;

        // but is movable
        gpu_data(gpu_data&&) = default;
        gpu_data& operator=(gpu_data&&) = default;

#ifdef DLIB_USE_CUDA
        void async_copy_to_device();
        void set_size(size_t new_size);
#else
        // Note that calls to host() or device() will block until any async transfers are complete.
        void async_copy_to_device(){}
        void set_size(size_t new_size)
        {
            if (new_size == 0)
            {
                data_size = 0;
                host_current = true;
                device_current = true;
                data_host.reset();
                data_device.reset();
            }
            else if (new_size != data_size)
            {
                data_size = new_size;
                host_current = true;
                device_current = true;
                data_host.reset(new float[new_size], std::default_delete<float[]>());
                data_device.reset();
            }
        }
#endif

        const float* host() const
        {
            copy_to_host();
            return data_host.get();
        }

        float* host()
        {
            copy_to_host();
            device_current = false;
            return data_host.get();
        }

        const float* device() const
        {
#ifndef DLIB_USE_CUDA
            DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
            copy_to_device();
            return data_device.get();
        }

        float* device()
        {
#ifndef DLIB_USE_CUDA
            DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
            copy_to_device();
            host_current = false;
            return data_device.get();
        }

        size_t size() const { return data_size; }

    private:

#ifdef DLIB_USE_CUDA
        void copy_to_device() const;
        void copy_to_host() const;
        void wait_for_transfer_to_finish() const;
#else
        void copy_to_device() const{}
        void copy_to_host() const{}
        void wait_for_transfer_to_finish() const{}
#endif

        size_t data_size;
        mutable bool host_current;
        mutable bool device_current;
        mutable bool have_active_transfer;

        std::shared_ptr<float> data_host;
        std::shared_ptr<float> data_device;
        std::shared_ptr<void> cuda_stream;
    };

    inline void serialize(const gpu_data& item, std::ostream& out)
    {
        int version = 1;
        serialize(version, out);
        serialize(item.size(), out);
        auto data = item.host();
        for (size_t i = 0; i < item.size(); ++i)
            serialize(data[i], out);
    }

    inline void deserialize(gpu_data& item, std::istream& in)
    {
        int version;
        deserialize(version, in);
        if (version != 1)
            throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
        size_t s;
        deserialize(s, in);
        item.set_size(s);
        auto data = item.host();
        for (size_t i = 0; i < item.size(); ++i)
            deserialize(data[i], in);
    }

// ----------------------------------------------------------------------------------------
}

#endif // DLIB_GPU_DaTA_H_
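[Editor's note: the serialize()/deserialize() pair writes a version tag, the element count, and then each float through host(), so it works identically in CPU-only and CUDA builds. A round-trip sketch, again assuming the dlib tree is on the include path:]

    #include <iostream>
    #include <sstream>
    #include "dlib/dnn/gpu_data.h"   // assumption: dlib repo root is on the include path

    int main()
    {
        dlib::gpu_data a;
        a.set_size(3);
        float* p = a.host();
        p[0] = 1; p[1] = 2; p[2] = 3;

        std::stringstream buf;
        dlib::serialize(a, buf);      // writes version, size, then each float

        dlib::gpu_data b;
        dlib::deserialize(b, buf);    // resizes b and fills it element by element

        std::cout << b.size() << " " << b.host()[1] << std::endl;  // prints "3 2"
    }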
dlib/dnn/tensor.h

@@ -3,165 +3,13 @@
 #ifndef DLIB_DNn_TENSOR_H_
 #define DLIB_DNn_TENSOR_H_

-#include <memory>
 #include <cstring>
 #include "../matrix.h"
 #include "cudnn_dlibapi.h"
+#include "gpu_data.h"

 namespace dlib
 {
-// ----------------------------------------------------------------------------------------
-
-    [152 deleted lines: the gpu_data class together with its serialize() and
-     deserialize() overloads -- identical to the contents of the new
-     dlib/dnn/gpu_data.h reproduced above -- were removed from this header.]
-
 // ----------------------------------------------------------------------------------------

     class tensor