Commit 49a5d39d authored by Davis King

Made loss_multiclass_log_per_pixel use CUDA.

parent 6c962dd9
@@ -3,6 +3,7 @@
#include "cuda_utils.h"
#include "cuda_dlib.h"
#include "cudnn_dlibapi.h"
namespace dlib
@@ -1623,6 +1624,71 @@ namespace dlib
    }
}

// ----------------------------------------------------------------------------------------
__device__ float cuda_safe_log(float x, float epsilon = 1e-10)
{
    // Prevent trying to calculate the logarithm of a very small number (let alone zero)
    if (x >= epsilon)
        return ::log(x);
    else
        return ::log(epsilon);
}
__global__ void _cuda_compute_loss_multiclass_log_per_pixel(float* loss_out, float* g, const uint16_t* truth, size_t n, size_t plane_size, size_t sample_size, size_t nk, uint16_t label_to_ignore, const float scale)
{
    float loss = 0;
    for(auto i : grid_stride_range(0, n))
    {
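        // g is laid out in (sample, k, row, column) order.  For a flat index i,
        // i/plane_size is the plane number (sample*nk + k), so (i/plane_size)%nk
        // recovers the channel k, while (i%plane_size) + plane_size*(i/sample_size)
        // locates the matching pixel in the single-channel truth images.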
        const size_t k = (i/plane_size)%nk;
        const size_t idx = (i%plane_size) + plane_size*(i/sample_size);
        const size_t y = truth[idx];
        if (k == y)
        {
            loss -= cuda_safe_log(g[i]);
            g[i] = scale*(g[i] - 1);
        }
        else if (y == label_to_ignore)
        {
            g[i] = 0.f;
        }
        else
        {
            g[i] = scale*g[i];
        }
    }
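    // Each thread has accumulated its own partial sum of the loss.  Fold the
    // per-thread sums into the single scalar pointed to by loss_out.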
    warp_reduce_atomic_add(*loss_out, loss);
}
void compute_loss_multiclass_log_per_pixel::
do_work(
    float* loss_cuda_work_buffer,
    const uint16_t* truth_buffer,
    const tensor& subnetwork_output,
    tensor& gradient,
    double& loss
)
{
    CHECK_CUDA(cudaMemset(loss_cuda_work_buffer, 0, sizeof(float)));
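    // Run softmax over the raw scores, writing the per-class probabilities into
    // the gradient tensor; the kernel then turns them into gradients in place.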
    softmax(gradient, subnetwork_output);
    static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();

    // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
    const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());

    launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()),
        loss_cuda_work_buffer, gradient.device(), truth_buffer, gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale);

    float floss;
    CHECK_CUDA(cudaMemcpy(&floss, loss_cuda_work_buffer, sizeof(float), cudaMemcpyDefault));
    loss = scale*floss;
}
// ----------------------------------------------------------------------------------------

}
...
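For reference, the kernel's index decomposition and loss arithmetic can be mirrored on the host. The following is a minimal CPU sketch (not part of the commit; all names are illustrative) that computes the same loss and gradients over a tensor stored in (sample, k, row, column) order:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for the kernel above: p holds the softmax outputs
// and is overwritten with gradients, truth holds one uint16_t label plane per
// sample.  Returns the averaged loss, matching do_work()'s final scaling.
double reference_loss(std::vector<float>& p, const std::vector<uint16_t>& truth,
                      size_t num_samples, size_t nk, size_t nr, size_t nc)
{
    const size_t plane_size = nr*nc;
    const size_t sample_size = nk*plane_size;
    const float scale = 1.0f/(num_samples*nr*nc);
    const uint16_t label_to_ignore = 0xFFFF;

    double loss = 0;
    for (size_t i = 0; i < p.size(); ++i)
    {
        // Same decomposition of the flat index i as in the kernel.
        const size_t k = (i/plane_size)%nk;
        const size_t idx = (i%plane_size) + plane_size*(i/sample_size);
        const uint16_t y = truth[idx];
        if (k == y)
        {
            loss -= std::log(std::max(p[i], 1e-10f)); // cuda_safe_log equivalent
            p[i] = scale*(p[i] - 1);                  // gradient for the true class
        }
        else if (y == label_to_ignore)
        {
            p[i] = 0;                                 // ignored pixels contribute nothing
        }
        else
        {
            p[i] = scale*p[i];                        // gradient for the other classes
        }
    }
    return scale*loss;                                // same final scaling as do_work()
}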
@@ -409,6 +409,67 @@ namespace dlib
    size_t count_k
);
// ----------------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
    /*!
        The point of this class is to compute the loss computed by
        loss_multiclass_log_per_pixel, but to do so with CUDA.
    !*/

public:

    compute_loss_multiclass_log_per_pixel(
    )
    {
        work = device_global_buffer();
    }

    template <
        typename const_label_iterator
        >
    void operator() (
        const_label_iterator truth,
        const tensor& subnetwork_output,
        tensor& gradient,
        double& loss
    ) const
    {
        const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
        // Allocate a cuda buffer to store all the truth images and also one float
        // for the scalar loss output.
        cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

        cuda_data_void_ptr loss_buf = buf;
        buf = buf+sizeof(float);

        // copy the truth data into a cuda buffer.
        for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
        {
            const matrix<uint16_t>& t = *truth;
            DLIB_ASSERT(t.nr() == subnetwork_output.nr());
            DLIB_ASSERT(t.nc() == subnetwork_output.nc());
            memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
        }

        do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
    }

private:

    static void do_work(
        float* loss_cuda_work_buffer,
        const uint16_t* truth_buffer,
        const tensor& subnetwork_output,
        tensor& gradient,
        double& loss
    );

    std::shared_ptr<resizable_cuda_buffer> work;
};
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
...
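In the library this helper is only invoked from the loss layer itself, but a hedged sketch of a direct call (assuming a CUDA-enabled build; make_labels and run_network are stand-ins, not real dlib functions) shows the expected calling convention:

#include <dlib/dnn.h>
using namespace dlib;

std::vector<matrix<uint16_t>> make_labels();  // assumed helper
resizable_tensor run_network();               // assumed helper

void direct_call_sketch()
{
    // One matrix<uint16_t> of per-pixel class labels per sample.
    std::vector<matrix<uint16_t>> labels = make_labels();
    resizable_tensor output = run_network();

    resizable_tensor grad;
    grad.copy_size(output);   // gradient must match the output dimensions

    double loss;
    cuda::compute_loss_multiclass_log_per_pixel compute;
    compute(labels.begin(), output, grad, loss);
}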
@@ -2407,6 +2407,13 @@ namespace dlib
        "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
    }

#ifdef DLIB_USE_CUDA
    double loss;
    cuda_compute(truth, output_tensor, grad, loss);
    return loss;
#else
    tt::softmax(grad, output_tensor);

    // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
@@ -2445,6 +2452,7 @@ namespace dlib
        }
    }
    return loss;
#endif
}

friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
@@ -2478,6 +2486,10 @@ namespace dlib
        return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
    }

#ifdef DLIB_USE_CUDA
    cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
#endif
};

template <typename SUBNET>
...
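From the user's point of view nothing changes: when DLIB_USE_CUDA is defined the loss layer routes through cuda_compute, and otherwise the original CPU branch under #else runs. A toy sketch of a network using this loss (illustrative only, assuming the loss_multiclass_log_per_pixel template alias dlib defines alongside the class; real segmentation nets are far deeper):

#include <dlib/dnn.h>
using namespace dlib;

// Illustrative 3-class per-pixel classifier: a single 1x1 convolution
// producing one score map per class, wrapped in the per-pixel log loss.
using net_type = loss_multiclass_log_per_pixel<
                     con<3,1,1,1,1,
                     input<matrix<rgb_pixel>>>>;

void train_toy_net(
    const std::vector<matrix<rgb_pixel>>& images,
    const std::vector<matrix<uint16_t>>& labels  // one class label per pixel
)
{
    net_type net;
    dnn_trainer<net_type> trainer(net);
    // On a CUDA build, each loss evaluation now runs the kernel added in
    // this commit instead of the CPU loop.
    trainer.train(images, labels);
}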