Commit cbce85ec authored by Davis King

Added GPU versions of the batch normalization functions.

parent 06534305
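
As a usage note (not part of the commit itself): the new cuda:: routines are drop-in counterparts of the existing cpu:: ones, so the calling pattern mirrors the test added at the bottom of this commit. A minimal sketch:

resizable_tensor dest, means, invstds;
resizable_tensor src(64,20,100,100), gamma(1,20,100,100), beta(1,20,100,100);
gamma = 1;
beta = 0;
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cuda::batch_normalize(dest, means, invstds, src, gamma, beta);
// dest now holds gamma*(src - mean)*invstd + beta, computed per element across the
// sample dimension; means and invstds receive the per-element statistics.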
@@ -185,7 +185,7 @@ namespace dlib
for (long i = 0; i < num; ++i)
{
auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
- p_invstds[i] = 1.0/std::sqrt(actual_var+eps);
+ p_invstds[i] = 1.0f/std::sqrt(actual_var+eps);
}
p_src = src.host();
@@ -361,8 +361,8 @@ namespace dlib
// compute variances
for (long k = 0; k < src.k(); ++k)
{
- auto actual_var = p_invstds[k] - p_means[k]*p_means[k];
- p_invstds[k] = 1.0/std::sqrt(actual_var + eps);
+ float actual_var = p_invstds[k] - p_means[k]*p_means[k];
+ p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
@@ -421,7 +421,7 @@ namespace dlib
{
for (long k = 0; k < src.k(); ++k)
{
- const auto invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f);
+ const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f);
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[k])*p_invstds[k];
...
@@ -163,8 +163,48 @@ namespace dlib
}
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
__global__ void _cuda_batch_normalize(
float* dest,
float* means,
float* invstds,
const float* src,
const float* gamma,
const float* beta,
long num,
long num_samples
)
{
const float eps = 0.00001;
const float invnum = 1.0f/num_samples;
for (auto i : grid_stride_range(0, num))
{
means[i] = 0;
invstds[i] = 0;
for (long n = 0; n < num_samples; ++n)
{
float val = src[n*num+i];
means[i] += val;
invstds[i] += val*val;
}
means[i] *= invnum;
invstds[i] *= invnum;
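// variance via E[x^2] - E[x]^2; at this point invstds[i] holds the mean of the squares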
float actual_var = invstds[i] - means[i]*means[i];
invstds[i] = 1.0f/::sqrt(actual_var+eps);
for (long n = 0; n < num_samples; ++n)
{
long idx = n*num+i;
float temp = (src[idx] - means[i])*invstds[i];
dest[idx] = temp*gamma[i] + beta[i];
}
}
}
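// Note (not part of the commit): grid_stride_range is dlib's helper for the usual CUDA
// grid-stride loop, so the kernel above is correct for any grid/block configuration.
// Roughly, `for (auto i : grid_stride_range(0, num))` behaves like the following sketch
// (an approximation, not dlib's actual implementation):
//
// for (long i = blockIdx.x*blockDim.x + threadIdx.x; i < num; i += gridDim.x*blockDim.x)
// { /* body */ }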
void batch_normalize (
resizable_tensor& dest,
resizable_tensor& means,
@@ -174,8 +214,90 @@ namespace dlib
const tensor& beta
)
{
- // TODO
- DLIB_CASSERT(false,"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k(),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
);
dest.copy_size(src);
means.set_size(1, src.k(), src.nr(), src.nc());
invstds.set_size(1, src.k(), src.nr(), src.nc());
_cuda_batch_normalize<<<512,512>>>(dest.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
beta.device(),
means.size(),
src.num_samples());
}
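// For reference (not part of the commit): the kernel below accumulates the standard
// batch normalization backward equations. In the code's notation (g = gradient_input,
// N = num_samples, invstds[i] = 1/sqrt(var[i]+eps), x_hat = (x-mean)*invstd), per
// element i it computes:
//
// beta_grad[i] = sum_n g[n,i]
// gamma_grad[i] = sum_n g[n,i]*x_hat[n,i]
// dvars[i] = sum_n g[n,i]*gamma[i]*(src[n,i]-means[i]) * -0.5*invstds[i]^3
// dmeans[i] = sum_n -g[n,i]*gamma[i]*invstds[i] + dvars[i]*(1/N)*sum_n -2*(src[n,i]-means[i])
// src_grad[n,i] += g[n,i]*gamma[i]*invstds[i] + dvars[i]*2*(src[n,i]-means[i])/N + dmeans[i]/N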
__global__ void _cuda_batch_normalize_gradient(
const float* grad,
const float* means,
const float* invstds,
const float* src,
const float* gamma,
float* src_grad,
float* gamma_grad,
float* beta_grad,
float* dmeans,
float* dvars,
long num,
long num_samples
)
{
const float invnum = 1.0f/num_samples;
for (auto i : grid_stride_range(0, num))
{
dvars[i] = 0;
dmeans[i] = 0;
for (long n = 0; n < num_samples; ++n)
{
const long idx = n*num+i;
const float x_hat = (src[idx] - means[i])*invstds[i];
beta_grad[i] += grad[idx];
gamma_grad[i] += grad[idx]*x_hat;
const float dx = grad[idx] * gamma[i];
dvars[i] += dx*(src[idx] - means[i])*-0.5*::pow(invstds[i], 3.0f);
}
for (long n = 0; n < num_samples; ++n)
{
const long idx = n*num+i;
const float dx = grad[idx]*gamma[i];
dmeans[i] += dx*-invstds[i] + dvars[i] * -2*(src[idx] - means[i])*invnum;
}
for (long n = 0; n < num_samples; ++n)
{
const long idx = n*num+i;
const float dx = grad[idx]*gamma[i];
src_grad[idx] += dx*invstds[i] +
dvars[i] *2*(src[idx] - means[i])*invnum +
dmeans[i]*invnum;
}
}
}
void batch_normalize_gradient::operator() (
@@ -189,12 +311,141 @@ namespace dlib
tensor& beta_grad
)
{
- // TODO
- DLIB_CASSERT(false,"");
const long num = src.k()*src.nr()*src.nc();
DLIB_CASSERT(num == means.size(),"");
DLIB_CASSERT(num == invstds.size(),"");
DLIB_CASSERT(num == gamma.size(),"");
DLIB_CASSERT(num == gamma_grad.size(),"");
DLIB_CASSERT(num == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
dvars.copy_size(invstds);
dmeans.copy_size(means);
_cuda_batch_normalize_gradient<<<512,512>>>(
gradient_input.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
src_grad.device(),
gamma_grad.device(),
beta_grad.device(),
dmeans.device(),
dvars.device(),
num,
src.num_samples());
}
// ----------------------------------------------------------------------------------------
// This function is from the article:
// http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
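// __shfl_down(val, offset) returns `val` as held by the lane `offset` positions above
// the calling lane, so each iteration folds the upper half of the remaining lanes into
// the lower half; after log2(warpSize) steps lane 0 holds the sum for the whole warp.
// (Later CUDA toolkits spell this intrinsic __shfl_down_sync.)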
__inline__ __device__ float warp_reduce_sum(float val)
{
for (int offset = warpSize/2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__ bool is_first_thread_in_warp()
{
return (threadIdx.x & (warpSize - 1)) == 0;
}
__inline__ __device__ void warp_reduce_atomic_add(
float& out,
float val
)
/*!
ensures
- Atomically adds all the val variables in the current warp to out.
See this page for an extended discussion:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
!*/
{
val = warp_reduce_sum(val);
if (is_first_thread_in_warp())
atomicAdd(&out, val);
}
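// As an isolated usage sketch (hypothetical example, not part of this commit): summing
// an array into a single float with the helpers above, assuming *out has been zeroed
// before the launch, could look like this:
__global__ void _example_reduce_sum(const float* data, float* out, long n)
{
float partial = 0;
// grid-stride loop: each thread accumulates a private partial sum
for (long i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x)
partial += data[i];
// fold the partial sums within each warp, then issue one atomicAdd per warp
warp_reduce_atomic_add(*out, partial);
}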
__global__ void _cuda_batch_normalize_conv1(
float* dest,
float* means,
float* invstds,
const float* src,
const float* gamma,
const float* beta,
long num_k,
long num_samples,
long num_pixels
)
{
for (long k = 0; k < num_k; ++k)
{
float mval = 0;
float ival = 0;
// Now do two parallel reductions to compute the first two moments of the
// data.
for(auto j : grid_stride_range(0, num_samples*num_pixels))
{
long i = j%num_pixels;
long n = j/num_pixels;
float val = src[n*num_k*num_pixels + k*num_pixels +i];
mval += val;
ival += val*val;
}
warp_reduce_atomic_add(means[k], mval);
warp_reduce_atomic_add(invstds[k], ival);
}
}
__global__ void _cuda_batch_normalize_conv2(
float* means,
float* invstds,
long num_k,
long num_samples,
long num_pixels
)
{
const float scale = 1.0f/(num_samples*num_pixels);
const float eps = 0.00001;
for (auto k : grid_stride_range(0, num_k))
{
means[k] *= scale;
auto actual_var = scale*invstds[k] - means[k]*means[k];
invstds[k] = 1.0f/::sqrt(actual_var + eps);
}
}
__global__ void _cuda_batch_normalize_conv3(
float* dest,
float* means,
float* invstds,
const float* src,
const float* gamma,
const float* beta,
long num_k,
long num_samples,
long num_pixels
)
{
for (long k = 0; k < num_k; ++k)
{
for(auto j : grid_stride_range(0, num_samples*num_pixels))
{
long i = j%num_pixels;
long n = j/num_pixels;
i = n*num_k*num_pixels + k*num_pixels +i;
dest[i] = (src[i] - means[k])*invstds[k];
dest[i] = dest[i]*gamma[k] + beta[k];
}
}
}
void batch_normalize_conv (
resizable_tensor& dest,
resizable_tensor& means,
@@ -204,8 +455,172 @@ namespace dlib
const tensor& beta
)
{
- // TODO
- DLIB_CASSERT(false,"");
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
beta.num_samples() == 1 &&
gamma.nr() == 1 &&
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k(),
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc()
);
dest.copy_size(src);
means.set_size(1, src.k());
invstds.set_size(1, src.k());
means = 0;
invstds = 0;
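// Note: the statistics and the normalization are split across three kernel launches
// because each step needs the previous one to have finished across the whole grid;
// back-to-back launches on the same stream provide that device-wide synchronization.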
_cuda_batch_normalize_conv1<<<512,512>>>(dest.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
beta.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
_cuda_batch_normalize_conv2<<<512,512>>>(
means.device(),
invstds.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
_cuda_batch_normalize_conv3<<<512,512>>>(dest.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
beta.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
}
__global__ void _cuda_batch_normalize_conv_gradient1(
const float* grad,
const float* means,
const float* invstds,
const float* src,
const float* gamma,
float* src_grad,
float* gamma_grad,
float* beta_grad,
float* dmeans,
float* dvars,
long num_k,
long num_samples,
long num_pixels
)
{
for (long k = 0; k < num_k; ++k)
{
float bval = 0;
float gval = 0;
float dval = 0;
const float invstd_pow = -0.5f*::pow(invstds[k], 3.0f);
// Now do three parallel reductions
for(auto j : grid_stride_range(0, num_samples*num_pixels))
{
long i = j%num_pixels;
long n = j/num_pixels;
long idx = n*num_k*num_pixels + k*num_pixels +i;
const float x_hat = (src[idx] - means[k])*invstds[k];
bval += grad[idx];
gval += grad[idx]*x_hat;
const float dx = grad[idx] * gamma[k];
dval += dx*(src[idx] - means[k])*invstd_pow;
}
warp_reduce_atomic_add(beta_grad[k], bval);
warp_reduce_atomic_add(gamma_grad[k], gval);
warp_reduce_atomic_add(dvars[k], dval);
}
}
__global__ void _cuda_batch_normalize_conv_gradient2(
const float* grad,
const float* means,
const float* invstds,
const float* src,
const float* gamma,
float* src_grad,
float* gamma_grad,
float* beta_grad,
float* dmeans,
float* dvars,
long num_k,
long num_samples,
long num_pixels
)
{
const float invnum = 1.0f/(num_samples*num_pixels);
for (long k = 0; k < num_k; ++k)
{
float mval = 0;
// Now do a parallel reduction
for(auto j : grid_stride_range(0, num_samples*num_pixels))
{
long i = j%num_pixels;
long n = j/num_pixels;
long idx = n*num_k*num_pixels + k*num_pixels +i;
const float dx = grad[idx] * gamma[k];
mval += -dx*invstds[k] + dvars[k] * -2*(src[idx] - means[k])*invnum;
}
warp_reduce_atomic_add(dmeans[k], mval);
}
}
__global__ void _cuda_batch_normalize_conv_gradient3(
const float* grad,
const float* means,
const float* invstds,
const float* src,
const float* gamma,
float* src_grad,
float* gamma_grad,
float* beta_grad,
float* dmeans,
float* dvars,
long num_k,
long num_samples,
long num_pixels
)
{
const float invnum = 1.0f/(num_samples*num_pixels);
for (long k = 0; k < num_k; ++k)
{
for(auto j : grid_stride_range(0, num_samples*num_pixels))
{
long i = j%num_pixels;
long n = j/num_pixels;
long idx = n*num_k*num_pixels + k*num_pixels +i;
const float dx = grad[idx] * gamma[k];
src_grad[idx] += dx*invstds[k] +
dvars[k]*2*(src[idx] - means[k])*invnum +
dmeans[k]*invnum;
}
}
}
void batch_normalize_conv_gradient::operator() (
@@ -219,8 +634,64 @@ namespace dlib
tensor& beta_grad
)
{
- // TODO
- DLIB_CASSERT(false,"");
DLIB_CASSERT(src.k() == means.size(),"");
DLIB_CASSERT(src.k() == invstds.size(),"");
DLIB_CASSERT(src.k() == gamma.size(),"");
DLIB_CASSERT(src.k() == gamma_grad.size(),"");
DLIB_CASSERT(src.k() == beta_grad.size(),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
_cuda_batch_normalize_conv_gradient1<<<512,512>>>(
gradient_input.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
src_grad.device(),
gamma_grad.device(),
beta_grad.device(),
dmeans.device(),
dvars.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
_cuda_batch_normalize_conv_gradient2<<<512,512>>>(
gradient_input.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
src_grad.device(),
gamma_grad.device(),
beta_grad.device(),
dmeans.device(),
dvars.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
_cuda_batch_normalize_conv_gradient3<<<512,512>>>(
gradient_input.device(),
means.device(),
invstds.device(),
src.device(),
gamma.device(),
src_grad.device(),
gamma_grad.device(),
beta_grad.device(),
dmeans.device(),
dvars.device(),
src.k(),
src.num_samples(),
src.nr()*src.nc());
}
// -----------------------------------------------------------------------------------
...
@@ -460,6 +460,107 @@ namespace
}
#endif
// ----------------------------------------------------------------------------------------
void compare_bn_gpu_and_cpu()
{
print_spinner();
resizable_tensor dest, dest2;
resizable_tensor means, means2;
resizable_tensor invstds, invstds2;
resizable_tensor src(64,20,100,100);
resizable_tensor gamma(1,20,100,100);
resizable_tensor beta(1,20,100,100);
gamma = 2;
beta = 3;
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cpu::batch_normalize(dest,means,invstds, src, gamma, beta);
cuda::batch_normalize(dest2,means2,invstds2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-5);
DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-5);
DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-5);
// now check that the gradients match as well
resizable_tensor gradient_input;
resizable_tensor src_grad, gamma_grad, beta_grad;
resizable_tensor src_grad2, gamma_grad2, beta_grad2;
gradient_input.copy_size(dest);
src_grad.copy_size(src); src_grad = 0; src_grad2 = src_grad;
gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
beta_grad.copy_size(beta); beta_grad = 0; beta_grad2 = beta_grad;
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_gradient cpu_bng;
cpu_bng(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_gradient cuda_bng;
cuda_bng(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
dlog << LINFO << "beta_grad error: " << max(abs(mat(beta_grad)-mat(beta_grad2)));
DLIB_TEST(max(abs(mat(src_grad)-mat(src_grad2))) < 1e-5);
DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-5);
DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-5);
}
void compare_bn_conv_gpu_and_cpu()
{
print_spinner();
resizable_tensor dest, dest2;
resizable_tensor means, means2;
resizable_tensor invstds, invstds2;
resizable_tensor src(2,8,10,9);
resizable_tensor gamma(1,8);
resizable_tensor beta(1,8);
gamma = 2;
beta = 3;
tt::tensor_rand rnd;
rnd.fill_uniform(src);
cpu::batch_normalize_conv(dest,means,invstds, src, gamma, beta);
cuda::batch_normalize_conv(dest2,means2,invstds2, src, gamma, beta);
dlog << LINFO << "dest error: "<< max(abs(mat(dest) -mat(dest2)));
dlog << LINFO << "means error: "<< max(abs(mat(means) -mat(means2)));
dlog << LINFO << "invstds error: "<< max(abs(mat(invstds) -mat(invstds2)));
DLIB_TEST(max(abs(mat(dest) -mat(dest2))) < 1e-4);
DLIB_TEST(max(abs(mat(means) -mat(means2))) < 1e-4);
DLIB_TEST(max(abs(mat(invstds) -mat(invstds2))) < 1e-4);
resizable_tensor gradient_input;
resizable_tensor src_grad, gamma_grad, beta_grad;
resizable_tensor src_grad2, gamma_grad2, beta_grad2;
gradient_input.copy_size(dest);
src_grad.copy_size(src); src_grad = 0; src_grad2 = src_grad;
gamma_grad.copy_size(gamma); gamma_grad = 0; gamma_grad2 = gamma_grad;
beta_grad.copy_size(beta); beta_grad = 0; beta_grad2 = beta_grad;
rnd.fill_uniform(gradient_input);
cpu::batch_normalize_conv_gradient cpu_bng;
cpu_bng(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
cuda::batch_normalize_conv_gradient cuda_bng;
cuda_bng(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
dlog << LINFO << "src_grad error: " << max(abs(mat(src_grad)-mat(src_grad2)));
dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
dlog << LINFO << "beta_grad error: " << max(abs(mat(beta_grad)-mat(beta_grad2)));
DLIB_TEST(max(abs(mat(src_grad)-mat(src_grad2))) < 1e-4);
DLIB_TEST(max(abs(mat(gamma_grad)-mat(gamma_grad2))) < 1e-4);
DLIB_TEST(max(abs(mat(beta_grad)-mat(beta_grad2))) < 1e-4);
}
// ----------------------------------------------------------------------------------------
class dnn_tester : public tester
@@ -488,6 +589,8 @@ namespace
test_batch_normalize();
test_batch_normalize_conv();
test_basic_tensor_ops();
compare_bn_gpu_and_cpu();
compare_bn_conv_gpu_and_cpu();
}
} a;
...