Commit 49a5d39d authored 7 years ago by Davis King
Made loss_multiclass_log_per_pixel use CUDA.
parent 6c962dd9
Showing 3 changed files with 139 additions and 0 deletions (+139 -0)

dlib/dnn/cuda_dlib.cu  +66 -0
dlib/dnn/cuda_dlib.h   +61 -0
dlib/dnn/loss.h        +12 -0
dlib/dnn/cuda_dlib.cu
@@ -3,6 +3,7 @@
#include "cuda_utils.h"
#include "cuda_dlib.h"
#include "cudnn_dlibapi.h"

namespace dlib

@@ -1623,6 +1624,71 @@ namespace dlib
            }
        }
    // ----------------------------------------------------------------------------------------

        __device__ float cuda_safe_log(float x, float epsilon = 1e-10)
        {
            // Prevent trying to calculate the logarithm of a very small number (let alone zero)
            if (x >= epsilon)
                return ::log(x);
            else
                return ::log(epsilon);
        }

        __global__ void _cuda_compute_loss_multiclass_log_per_pixel(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, uint16_t label_to_ignore, const float scale)
        {
            float loss = 0;
            for(auto i : grid_stride_range(0, n))
            {
                const size_t k = (i/plane_size)%nk;
                const size_t idx = (i%plane_size) + plane_size*(i/sample_size);
                const size_t y = truth[idx];
                if (k == y)
                {
                    loss -= cuda_safe_log(g[i]);
                    g[i] = scale*(g[i] - 1);
                }
                else if (y == label_to_ignore)
                {
                    g[i] = 0.f;
                }
                else
                {
                    g[i] = scale*g[i];
                }
            }

            warp_reduce_atomic_add(*loss_out, loss);
        }

        void compute_loss_multiclass_log_per_pixel::
        do_work(
            float* loss_cuda_work_buffer,
            const uint16_t* truth_buffer,
            const tensor& subnetwork_output,
            tensor& gradient,
            double& loss
        )
        {
            CHECK_CUDA(cudaMemset(loss_cuda_work_buffer, 0, sizeof(float)));
            softmax(gradient, subnetwork_output);
            static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();

            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
            const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());

            launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()),
                loss_cuda_work_buffer, gradient.device(), truth_buffer, gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale);

            float floss;
            CHECK_CUDA(cudaMemcpy(&floss, loss_cuda_work_buffer, sizeof(float), cudaMemcpyDefault));
            loss = scale*floss;
        }

    // ----------------------------------------------------------------------------------------

    }
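The kernel added above walks the softmax output as one flat array, so recovering (sample, channel, pixel) from a flat index is pure integer arithmetic. The standalone host-side sketch below is not part of the commit; the sizes and the printout are invented for illustration. It reproduces how plane_size and sample_size yield the channel index k and the position idx in the truth buffer, which stores one uint16_t label per pixel per sample and has no channel dimension.

// Hypothetical host-side demo of the kernel's index arithmetic; sizes are arbitrary.
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t num_samples = 2, nk = 3, nr = 4, nc = 5;
    const std::size_t plane_size  = nr*nc;          // one channel of one sample
    const std::size_t sample_size = nk*plane_size;  // all channels of one sample

    for (std::size_t i = 0; i < num_samples*sample_size; ++i)
    {
        // Same arithmetic as _cuda_compute_loss_multiclass_log_per_pixel:
        const std::size_t k   = (i/plane_size)%nk;                           // channel of element i
        const std::size_t idx = (i%plane_size) + plane_size*(i/sample_size); // pixel index in the truth planes
        if (i%sample_size < 3)  // print a few entries per sample
            std::printf("i=%zu -> sample=%zu k=%zu truth_idx=%zu\n", i, i/sample_size, k, idx);
    }
    return 0;
}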
dlib/dnn/cuda_dlib.h
@@ -409,6 +409,67 @@ namespace dlib
            size_t count_k
        );
    // ----------------------------------------------------------------------------------------

        class compute_loss_multiclass_log_per_pixel
        {
            /*!
                The point of this class is to compute the loss computed by
                loss_multiclass_log_per_pixel, but to do so with CUDA.
            !*/
        public:

            compute_loss_multiclass_log_per_pixel(
            )
            {
                work = device_global_buffer();
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
                // Allocate a cuda buffer to store all the truth images and also one float
                // for the scalar loss output.
                cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

                cuda_data_void_ptr loss_buf = buf;
                buf = buf + sizeof(float);

                // copy the truth data into a cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const matrix<uint16_t>& t = *truth;
                    DLIB_ASSERT(t.nr() == subnetwork_output.nr());
                    DLIB_ASSERT(t.nc() == subnetwork_output.nc());
                    memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
                }

                do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                float* loss_cuda_work_buffer,
                const uint16_t* truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            std::shared_ptr<resizable_cuda_buffer> work;
        };
    // ------------------------------------------------------------------------------------

    // ------------------------------------------------------------------------------------

    // ------------------------------------------------------------------------------------
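For reference, operator() above packs everything the kernel needs into one device allocation taken from the resizable work buffer: the first sizeof(float) bytes receive the scalar loss, and the remaining num_samples*bytes_per_plane bytes hold the uint16_t truth planes. The sketch below is not dlib code; it uses plain host memory and made-up sizes purely to mirror that layout so the pointer arithmetic is easy to follow.

// Hypothetical illustration of the work-buffer layout used by operator() above.
#include <cstdint>
#include <cstring>
#include <vector>

int main()
{
    const std::size_t num_samples = 2, nr = 3, nc = 4;
    const std::size_t bytes_per_plane = nr*nc*sizeof(std::uint16_t);

    // One allocation: [ float loss ][ truth plane 0 ][ truth plane 1 ] ...
    std::vector<unsigned char> buf(sizeof(float) + num_samples*bytes_per_plane);

    float* loss_slot = reinterpret_cast<float*>(buf.data());
    std::uint16_t* truth_planes = reinterpret_cast<std::uint16_t*>(buf.data() + sizeof(float));

    *loss_slot = 0;  // the kernel later accumulates the loss here

    std::vector<std::uint16_t> labels(nr*nc, 1);  // a made-up truth image for sample 0
    std::memcpy(truth_planes, labels.data(), bytes_per_plane);
    return 0;
}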
dlib/dnn/loss.h
@@ -2407,6 +2407,13 @@ namespace dlib
                    "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
            }
#ifdef DLIB_USE_CUDA
            double loss;
            cuda_compute(truth, output_tensor, grad, loss);
            return loss;
#else
            tt::softmax(grad, output_tensor);

            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
@@ -2445,6 +2452,7 @@ namespace dlib
                }
            }

            return loss;
#endif
        }
        friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
@@ -2478,6 +2486,10 @@ namespace dlib
            return ((sample*t.k() + k)*t.nr() + row)*t.nc() + column;
        }
#ifdef DLIB_USE_CUDA
        cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
#endif

    };
    template <typename SUBNET>
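To make the #ifdef DLIB_USE_CUDA dispatch above concrete, the snippet below is a minimal CPU-only sketch, not the dlib implementation; the probabilities and labels are invented. It computes the quantity both paths produce: the cross-entropy of the softmax output averaged over every pixel of every sample, with pixels labelled std::numeric_limits<uint16_t>::max() ignored.

// Stand-alone sketch of the per-pixel multiclass log loss; values are made up.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    const std::size_t num_samples = 1, nk = 2, nr = 2, nc = 2;
    const std::uint16_t label_to_ignore = std::numeric_limits<std::uint16_t>::max();

    // Softmax probabilities laid out as (sample, k, row, col).
    std::vector<float> p = { 0.7f, 0.4f, 0.9f, 0.2f,    // k == 0
                             0.3f, 0.6f, 0.1f, 0.8f };  // k == 1
    // One label per pixel; the last pixel is ignored.
    std::vector<std::uint16_t> truth = { 0, 1, 0, label_to_ignore };

    const double scale = 1.0/(num_samples*nr*nc);
    double loss = 0;
    for (std::size_t s = 0; s < num_samples; ++s)
        for (std::size_t r = 0; r < nr; ++r)
            for (std::size_t c = 0; c < nc; ++c)
            {
                const std::uint16_t y = truth[(s*nr + r)*nc + c];
                if (y == label_to_ignore)
                    continue;  // ignored pixels add nothing to the loss or gradient
                const float prob = p[((s*nk + y)*nr + r)*nc + c];
                loss -= scale*std::log(std::max(prob, 1e-10f));  // same clamping idea as cuda_safe_log
            }
    std::printf("average per-pixel loss = %f\n", loss);
    return 0;
}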