Commit 8421f213 authored by Davis King's avatar Davis King

Fixed the in-place layers so that they don't interfere with the operation of

skip layers and add_prev style layers.  In particular, now in-place layers only
overwrite the gradient information in their child layer if they are operating
in in-place mode.  Otherwise, they add their gradients to their child layers.

It should also be noted that it's safe for in-place layers to overwrite
gradients when in in-place mode since their child layers are inaccessible when
in-place layers operate in in-place mode.  This prevents any other layers from
trying to add to the child layer, thereby avoiding the possibility of layer
interference.  So the bug this change fixes is that, when not in in-place mode,
the child layers are still accessible but in-place layers were *still*
overwriting child gradients.
parent 81eb18a4
...@@ -2849,8 +2849,11 @@ namespace dlib ...@@ -2849,8 +2849,11 @@ namespace dlib
resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2; resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
params_grad1 = params_grad; params_grad1 = params_grad;
params_grad2 = params_grad; params_grad2 = params_grad;
// Now call backward() and make sure it works as well. // Now call backward() and make sure it works as well. Recall that when an
subnetwork2.get_gradient_input() = 9999; // in-place layer works in-place it assigns to its outputs but when it's
// not running in-place it adds. So we initialize to a non-zero value to
// check that this is the behavior that really executes.
subnetwork2.get_gradient_input() = 9;
impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1); impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
data_grad1 = subnetwork2.get_gradient_input(); data_grad1 = subnetwork2.get_gradient_input();
...@@ -2868,7 +2871,7 @@ namespace dlib ...@@ -2868,7 +2871,7 @@ namespace dlib
return layer_test_results(sout.str()); return layer_test_results(sout.str());
} }
} }
const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2))); const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2)));
if (backward_data_error > 0.00001) if (backward_data_error > 0.00001)
{ {
using namespace std; using namespace std;
...@@ -2934,7 +2937,6 @@ namespace dlib ...@@ -2934,7 +2937,6 @@ namespace dlib
// compare it to the one output by the layer and make sure they match. // compare it to the one output by the layer and make sure they match.
double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps); double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
double output_derivative = subnetwork.get_gradient_input_element(i); double output_derivative = subnetwork.get_gradient_input_element(i);
if (!impl::is_inplace_layer(l,subnetwork))
output_derivative -= initial_gradient_input[i]; output_derivative -= initial_gradient_input[i];
double relative_error; double relative_error;
if (reference_derivative != 0) if (reference_derivative != 0)
......
...@@ -16,6 +16,7 @@ namespace dlib ...@@ -16,6 +16,7 @@ namespace dlib
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -37,25 +38,45 @@ namespace dlib ...@@ -37,25 +38,45 @@ namespace dlib
const auto s1 = src1.host(); const auto s1 = src1.host();
const auto s2 = src2.host(); const auto s2 = src2.host();
if (dest.size() == src1.size() && src1.size() == src2.size()) if (dest.size() == src1.size() && src1.size() == src2.size())
{
if (add_to)
{
for (size_t i = 0; i < src1.size(); ++i)
d[i] += s1[i]*s2[i];
}
else
{ {
for (size_t i = 0; i < src1.size(); ++i) for (size_t i = 0; i < src1.size(); ++i)
d[i] = s1[i]*s2[i]; d[i] = s1[i]*s2[i];
} }
}
else if (dest.num_samples() == 1) else if (dest.num_samples() == 1)
{
if (!add_to)
{ {
for (size_t i = 0; i < dest.size(); ++i) for (size_t i = 0; i < dest.size(); ++i)
d[i] = 0; d[i] = 0;
}
for (size_t i = 0; i < max_size; ++i) for (size_t i = 0; i < max_size; ++i)
d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()]; d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()];
} }
else else
{
if (add_to)
{
for (size_t i = 0; i < max_size; ++i)
d[i] += s1[i%src1.size()]*s2[i%src2.size()];
}
else
{ {
for (size_t i = 0; i < max_size; ++i) for (size_t i = 0; i < max_size; ++i)
d[i] = s1[i%src1.size()]*s2[i%src2.size()]; d[i] = s1[i%src1.size()]*s2[i%src2.size()];
} }
} }
}
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -68,6 +89,24 @@ namespace dlib ...@@ -68,6 +89,24 @@ namespace dlib
{ {
DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),""); DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),"");
if (add_to)
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ += (*s1++)*s2[k];
}
}
}
}
}
else
{
for (long n = 0; n < dest.num_samples(); ++n) for (long n = 0; n < dest.num_samples(); ++n)
{ {
for (long k = 0; k < dest.k(); ++k) for (long k = 0; k < dest.k(); ++k)
...@@ -82,13 +121,17 @@ namespace dlib ...@@ -82,13 +121,17 @@ namespace dlib
} }
} }
} }
}
else else
{ {
DLIB_CASSERT(have_same_dimensions(src1,src2),""); DLIB_CASSERT(have_same_dimensions(src1,src2),"");
DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),""); DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),"");
if (!add_to)
{
for (long k = 0; k < src1.k(); ++k) for (long k = 0; k < src1.k(); ++k)
d[k] = 0; d[k] = 0;
}
for (long n = 0; n < src1.num_samples(); ++n) for (long n = 0; n < src1.num_samples(); ++n)
{ {
...@@ -1105,9 +1148,17 @@ namespace dlib ...@@ -1105,9 +1148,17 @@ namespace dlib
float temp = 0; float temp = 0;
for (long k = 0; k < grad.k(); ++k) for (long k = 0; k < grad.k(); ++k)
temp += -d3[k*num]*in3[k*num]; temp += -d3[k*num]*in3[k*num];
if (is_same_object(gradient_input, grad))
{
for (long k = 0; k < grad.k(); ++k) for (long k = 0; k < grad.k(); ++k)
g3[k*num] = d3[k*num]*(temp+in3[k*num]); g3[k*num] = d3[k*num]*(temp+in3[k*num]);
} }
else
{
for (long k = 0; k < grad.k(); ++k)
g3[k*num] += d3[k*num]*(temp+in3[k*num]);
}
}
} }
} }
} }
...@@ -1134,9 +1185,17 @@ namespace dlib ...@@ -1134,9 +1185,17 @@ namespace dlib
const auto g = grad.host(); const auto g = grad.host();
const auto d = dest.host(); const auto d = dest.host();
const auto in = gradient_input.host(); const auto in = gradient_input.host();
if (is_same_object(gradient_input, grad))
{
for (size_t i = 0; i < dest.size(); ++i) for (size_t i = 0; i < dest.size(); ++i)
g[i] = in[i]*d[i]*(1-d[i]); g[i] = in[i]*d[i]*(1-d[i]);
} }
else
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] += in[i]*d[i]*(1-d[i]);
}
}
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
...@@ -1157,6 +1216,8 @@ namespace dlib ...@@ -1157,6 +1216,8 @@ namespace dlib
const float* gi = gradient_input.host(); const float* gi = gradient_input.host();
const float* in = dest.host(); const float* in = dest.host();
float* out = grad.host(); float* out = grad.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i) for (size_t i = 0; i < dest.size(); ++i)
{ {
if (in[i] > 0) if (in[i] > 0)
...@@ -1165,6 +1226,15 @@ namespace dlib ...@@ -1165,6 +1226,15 @@ namespace dlib
out[i] = 0; out[i] = 0;
} }
} }
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] += gi[i];
}
}
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -1194,6 +1264,7 @@ namespace dlib ...@@ -1194,6 +1264,7 @@ namespace dlib
tensor& params_grad tensor& params_grad
) )
{ {
DLIB_CASSERT(is_same_object(grad, gradient_input) == false,"");
const float p = param.host()[0]; const float p = param.host()[0];
const float* gi = gradient_input.host(); const float* gi = gradient_input.host();
const float* s = src.host(); const float* s = src.host();
...@@ -1236,9 +1307,17 @@ namespace dlib ...@@ -1236,9 +1307,17 @@ namespace dlib
const auto g = grad.host(); const auto g = grad.host();
const auto d = dest.host(); const auto d = dest.host();
const auto in = gradient_input.host(); const auto in = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i) for (size_t i = 0; i < dest.size(); ++i)
g[i] = in[i]*(1-d[i]*d[i]); g[i] = in[i]*(1-d[i]*d[i]);
} }
else
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] += in[i]*(1-d[i]*d[i]);
}
}
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
......
...@@ -16,12 +16,14 @@ namespace dlib ...@@ -16,12 +16,14 @@ namespace dlib
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
); );
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
......
...@@ -122,7 +122,34 @@ namespace dlib ...@@ -122,7 +122,34 @@ namespace dlib
} }
} }
// Accumulating elementwise product.  For every index in [0,n) produced by
// grid_stride_range, the product s1[idx]*s2[idx] is added onto the value
// already stored in d[idx] (d is never overwritten or cleared here).
__global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n)
{
    for (auto idx : grid_stride_range(0, n))
        d[idx] += s1[idx]*s2[idx];
}
// Accumulating product that folds a larger index space onto d.  For each
// output index `out` in [0,n), every flat position out, out+n, out+2n, ...
// below max_size contributes s1[pos%s1_n]*s2[pos%s2_n], and each contribution
// is added onto the existing contents of d[out].
__global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2,
                                       size_t n, size_t s1_n, size_t s2_n, size_t max_size)
{
    for (auto out : grid_stride_range(0, n))
    {
        // Visit the positions congruent to `out` modulo n, in increasing order.
        size_t pos = out;
        while (pos < max_size)
        {
            d[out] += s1[pos%s1_n]*s2[pos%s2_n];
            pos += n;
        }
    }
}
// Accumulating product where the sources may be shorter than d.  For each
// index in [0,n), s1 and s2 are read at the index wrapped modulo their own
// lengths (s1_n and s2_n) and the product is added onto d[idx].
__global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2,
                                       size_t n, size_t s1_n, size_t s2_n)
{
    for (auto idx : grid_stride_range(0, n))
        d[idx] += s1[idx%s1_n]*s2[idx%s2_n];
}
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -146,15 +173,26 @@ namespace dlib ...@@ -146,15 +173,26 @@ namespace dlib
const auto s2 = src2.host(); const auto s2 = src2.host();
if (dest.size() == src1.size() && src1.size() == src2.size()) if (dest.size() == src1.size() && src1.size() == src2.size())
{ {
if (add_to)
launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
else
launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size()); launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
} }
else if (dest.num_samples() == 1) else if (dest.num_samples() == 1)
{ {
if (add_to)
launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
dest.size(), src1.size(), src2.size(), max_size);
else
launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
dest.size(), src1.size(), src2.size(), max_size); dest.size(), src1.size(), src2.size(), max_size);
} }
else else
{ {
if (add_to)
launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
dest.size(), src1.size(), src2.size());
else
launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
dest.size(), src1.size(), src2.size()); dest.size(), src1.size(), src2.size());
} }
...@@ -191,8 +229,33 @@ namespace dlib ...@@ -191,8 +229,33 @@ namespace dlib
} }
} }
// Accumulating per-channel scaling.  Each of the n elements of s1 is
// multiplied by the s2 entry selected by (idx/bs)%ks and the result is added
// onto d[idx].  The call site visible in this file passes bs = nr*nc and
// ks = k of src1, so the selector is the element's channel number.
__global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
{
    for (auto idx : grid_stride_range(0, n))
    {
        const auto channel = (idx/bs)%ks;
        d[idx] += s1[idx]*s2[channel];
    }
}
// Accumulating per-channel reduction.  s1 and s2 are both indexed over n
// consecutive planes of bs elements each.  The elementwise products within a
// plane are summed and the sum is folded into d[plane%ks].  This kernel does
// not zero d first, so results land on top of whatever d already contains.
__global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
{
    // iterate over the image planes assigned along the y dimension
    for (auto plane : grid_stride_range_y(0, n))
    {
        // partial sum of s1*s2 over this plane's elements
        float plane_sum = 0;
        for (auto idx : grid_stride_range(plane*bs, (plane+1)*bs))
            plane_sum += s1[idx]*s2[idx];

        const auto channel = plane%ks;
        // NOTE(review): warp_reduce_atomic_add presumably combines the partial
        // sums across the warp and atomically adds the total to d[channel] --
        // confirm against its definition.
        warp_reduce_atomic_add(d[channel], plane_sum);
    }
}
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -204,6 +267,10 @@ namespace dlib ...@@ -204,6 +267,10 @@ namespace dlib
if (dest.size() == 0) if (dest.size() == 0)
return; return;
if (add_to)
launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()),
dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
else
launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()), launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k()); dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
} }
...@@ -216,6 +283,10 @@ namespace dlib ...@@ -216,6 +283,10 @@ namespace dlib
dim3 blocks(10,1); dim3 blocks(10,1);
dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel. dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
if (add_to)
_cuda_multiply_conv2_add_to<<<blocks,threads>>>(
dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
else
_cuda_multiply_conv2<<<blocks,threads>>>( _cuda_multiply_conv2<<<blocks,threads>>>(
dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k()); dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
} }
......
...@@ -102,12 +102,14 @@ namespace dlib ...@@ -102,12 +102,14 @@ namespace dlib
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
); );
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
......
...@@ -1265,7 +1265,7 @@ namespace dlib ...@@ -1265,7 +1265,7 @@ namespace dlib
return; return;
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnSoftmaxBackward(context(), CHECK_CUDNN(cudnnSoftmaxBackward(context(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_MODE_CHANNEL,
...@@ -1316,7 +1316,7 @@ namespace dlib ...@@ -1316,7 +1316,7 @@ namespace dlib
return; return;
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnActivationBackward(context(), CHECK_CUDNN(cudnnActivationBackward(context(),
sigmoid_activation_descriptor(), sigmoid_activation_descriptor(),
&alpha, &alpha,
...@@ -1367,7 +1367,7 @@ namespace dlib ...@@ -1367,7 +1367,7 @@ namespace dlib
return; return;
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnActivationBackward(context(), CHECK_CUDNN(cudnnActivationBackward(context(),
relu_activation_descriptor(), relu_activation_descriptor(),
&alpha, &alpha,
...@@ -1418,7 +1418,7 @@ namespace dlib ...@@ -1418,7 +1418,7 @@ namespace dlib
return; return;
const float alpha = 1; const float alpha = 1;
const float beta = 0; const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
CHECK_CUDNN(cudnnActivationBackward(context(), CHECK_CUDNN(cudnnActivationBackward(context(),
tanh_activation_descriptor(), tanh_activation_descriptor(),
&alpha, &alpha,
......
...@@ -966,7 +966,7 @@ namespace dlib ...@@ -966,7 +966,7 @@ namespace dlib
mask.copy_size(input); mask.copy_size(input);
rnd.fill_uniform(mask); rnd.fill_uniform(mask);
tt::threshold(mask, drop_rate); tt::threshold(mask, drop_rate);
tt::multiply(output, input, mask); tt::multiply(false, output, input, mask);
} }
void backward_inplace( void backward_inplace(
...@@ -975,7 +975,10 @@ namespace dlib ...@@ -975,7 +975,10 @@ namespace dlib
tensor& /*params_grad*/ tensor& /*params_grad*/
) )
{ {
tt::multiply(data_grad, mask, gradient_input); if (is_same_object(gradient_input, data_grad))
tt::multiply(false, data_grad, mask, gradient_input);
else
tt::multiply(true, data_grad, mask, gradient_input);
} }
const tensor& get_layer_params() const { return params; } const tensor& get_layer_params() const { return params; }
...@@ -1044,7 +1047,7 @@ namespace dlib ...@@ -1044,7 +1047,7 @@ namespace dlib
void forward_inplace(const tensor& input, tensor& output) void forward_inplace(const tensor& input, tensor& output)
{ {
tt::affine_transform(output, input, val, 0); tt::affine_transform(output, input, val);
} }
void backward_inplace( void backward_inplace(
...@@ -1053,7 +1056,10 @@ namespace dlib ...@@ -1053,7 +1056,10 @@ namespace dlib
tensor& /*params_grad*/ tensor& /*params_grad*/
) )
{ {
tt::affine_transform(data_grad, gradient_input, val, 0); if (is_same_object(gradient_input, data_grad))
tt::affine_transform(data_grad, gradient_input, val);
else
tt::affine_transform(data_grad, data_grad, gradient_input, 1, val);
} }
const tensor& get_layer_params() const { return params; } const tensor& get_layer_params() const { return params; }
...@@ -1187,11 +1193,17 @@ namespace dlib ...@@ -1187,11 +1193,17 @@ namespace dlib
// We are computing the gradient of dot(gradient_input, computed_output*g + b) // We are computing the gradient of dot(gradient_input, computed_output*g + b)
if (mode == FC_MODE) if (mode == FC_MODE)
{ {
tt::multiply(data_grad, gradient_input, g); if (is_same_object(gradient_input, data_grad))
tt::multiply(false, data_grad, gradient_input, g);
else
tt::multiply(true, data_grad, gradient_input, g);
} }
else else
{ {
tt::multiply_conv(data_grad, gradient_input, g); if (is_same_object(gradient_input, data_grad))
tt::multiply_conv(false, data_grad, gradient_input, g);
else
tt::multiply_conv(true, data_grad, gradient_input, g);
} }
} }
......
...@@ -287,7 +287,10 @@ namespace dlib ...@@ -287,7 +287,10 @@ namespace dlib
to data_input. to data_input.
Finally, backward_inplace() outputs these gradients by performing: Finally, backward_inplace() outputs these gradients by performing:
- params_grad = PARAMETER_GRADIENT - params_grad = PARAMETER_GRADIENT
- if (is_same_object(gradient_input, data_grad)) then
- data_grad = DATA_GRADIENT - data_grad = DATA_GRADIENT
- else
- data_grad += DATA_GRADIENT
!*/ !*/
const tensor& get_layer_params( const tensor& get_layer_params(
......
...@@ -127,6 +127,7 @@ namespace dlib { namespace tt ...@@ -127,6 +127,7 @@ namespace dlib { namespace tt
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -140,23 +141,24 @@ namespace dlib { namespace tt ...@@ -140,23 +141,24 @@ namespace dlib { namespace tt
(src1.num_samples()==1 || src1.num_samples()==MD) && (src1.num_samples()==1 || src1.num_samples()==MD) &&
(src2.num_samples()==1 || src2.num_samples()==MD) ,""); (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::multiply(dest, src1, src2); cuda::multiply(add_to, dest, src1, src2);
#else #else
cpu::multiply(dest, src1, src2); cpu::multiply(add_to, dest, src1, src2);
#endif #endif
} }
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
) )
{ {
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::multiply_conv(dest, src1, src2); cuda::multiply_conv(add_to, dest, src1, src2);
#else #else
cpu::multiply_conv(dest, src1, src2); cpu::multiply_conv(add_to, dest, src1, src2);
#endif #endif
} }
......
...@@ -101,6 +101,7 @@ namespace dlib { namespace tt ...@@ -101,6 +101,7 @@ namespace dlib { namespace tt
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
void multiply ( void multiply (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -124,9 +125,12 @@ namespace dlib { namespace tt ...@@ -124,9 +125,12 @@ namespace dlib { namespace tt
Second, if dest.num_samples()==1, then after the pointwise multiplication of Second, if dest.num_samples()==1, then after the pointwise multiplication of
src1 with src2, the result has its samples summed to produce an output tensor src1 with src2, the result has its samples summed to produce an output tensor
with num_samples()==1 which is then assigned to #dest. with num_samples()==1 which is then assigned to #dest.
- if (add_to) then
- Instead of assigning the result to dest, this function adds the result to dest.
!*/ !*/
void multiply_conv ( void multiply_conv (
bool add_to,
tensor& dest, tensor& dest,
const tensor& src1, const tensor& src1,
const tensor& src2 const tensor& src2
...@@ -152,6 +156,8 @@ namespace dlib { namespace tt ...@@ -152,6 +156,8 @@ namespace dlib { namespace tt
#dest(n,k,r,c) == src1(n,k,r,c)*src2(k) #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
- else - else
#dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c) #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
- if (add_to) then
- Instead of assigning the result to dest, this function adds the result to dest.
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -865,11 +871,13 @@ namespace dlib { namespace tt ...@@ -865,11 +871,13 @@ namespace dlib { namespace tt
requires requires
- have_same_dimensions(dest,gradient_input) == true - have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true - have_same_dimensions(dest,grad) == true
- is_same_object(grad, dest)==false
ensures ensures
- We interpret dest as the output of softmax(dest,SRC) for some SRC tensor. - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
Then let f(SRC) == dot(gradient_input,dest) Then this function computes the Then let f(SRC) == dot(gradient_input,dest). Then this function computes the
gradient of f() with respect to SRC and adds it to grad. gradient of f() with respect to SRC and stores it to grad. Moreover, if
is_same_object(grad,gradient_input)==true then the output is assigned to
grad, replacing its previous contents. Otherwise the output is added to
grad.
- This function supports in-place operation, i.e. having - This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true is_same_object(grad, gradient_input)==true
!*/ !*/
...@@ -899,12 +907,13 @@ namespace dlib { namespace tt ...@@ -899,12 +907,13 @@ namespace dlib { namespace tt
requires requires
- have_same_dimensions(dest,gradient_input) == true - have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true - have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures ensures
- Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor, - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
let f(SRC) == dot(gradient_input,dest) let f(SRC) == dot(gradient_input,dest). Then this function computes the
- Then this function computes the gradient of f() with respect to SRC and gradient of f() with respect to SRC and stores it to grad. Moreover, if
assigns it to grad. is_same_object(grad,gradient_input)==true then the output is assigned to
grad, replacing its previous contents. Otherwise the output is added to
grad.
- This function supports in-place operation, i.e. having - This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true is_same_object(grad, gradient_input)==true
!*/ !*/
...@@ -934,12 +943,13 @@ namespace dlib { namespace tt ...@@ -934,12 +943,13 @@ namespace dlib { namespace tt
requires requires
- have_same_dimensions(dest,gradient_input) == true - have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true - have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures ensures
- Recalling that dest is the output of relu(dest,SRC) for some SRC tensor, - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
let f(SRC) == dot(gradient_input,dest) let f(SRC) == dot(gradient_input,dest). Then this function computes the
- Then this function computes the gradient of f() with respect to SRC and gradient of f() with respect to SRC and stores it to grad. Moreover, if
assigns it to grad. is_same_object(grad,gradient_input)==true then the output is assigned to
grad, replacing its previous contents. Otherwise the output is added to
grad.
- This function supports in-place operation, i.e. having - This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true is_same_object(grad, gradient_input)==true
!*/ !*/
...@@ -978,6 +988,7 @@ namespace dlib { namespace tt ...@@ -978,6 +988,7 @@ namespace dlib { namespace tt
- have_same_dimensions(grad,gradient_input) == true - have_same_dimensions(grad,gradient_input) == true
- param.size() == 1 - param.size() == 1
- params_grad.size() == 1 - params_grad.size() == 1
- is_same_object(grad, gradient_input) == false
ensures ensures
- Recalling that dest is the output of prelu(dest,src,param) let - Recalling that dest is the output of prelu(dest,src,param) let
f(src,param) == dot(gradient_input,dest) f(src,param) == dot(gradient_input,dest)
...@@ -1011,12 +1022,13 @@ namespace dlib { namespace tt ...@@ -1011,12 +1022,13 @@ namespace dlib { namespace tt
requires requires
- have_same_dimensions(dest,gradient_input) == true - have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true - have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures ensures
- Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor, - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
let f(SRC) == dot(gradient_input,dest) let f(SRC) == dot(gradient_input,dest). Then this function computes the
- Then this function computes the gradient of f() with respect to SRC and gradient of f() with respect to SRC and stores it to grad. Moreover, if
assigns it to grad. is_same_object(grad,gradient_input)==true then the output is assigned to
grad, replacing its previous contents. Otherwise the output is added to
grad.
- This function supports in-place operation, i.e. having - This function supports in-place operation, i.e. having
is_same_object(grad, gradient_input)==true is_same_object(grad, gradient_input)==true
!*/ !*/
......
...@@ -362,7 +362,10 @@ namespace ...@@ -362,7 +362,10 @@ namespace
DLIB_TEST(max(abs(truth3-mat(dest))) < 1e-5); DLIB_TEST(max(abs(truth3-mat(dest))) < 1e-5);
matrix<float> truth4 = pointwise_multiply(mat(A), mat(B)); matrix<float> truth4 = pointwise_multiply(mat(A), mat(B));
tt::multiply(A, A, B); tt::multiply(false, A, A, B);
DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
truth4 = pointwise_multiply(mat(A), mat(B)) + mat(A);
tt::multiply(true, A, A, B);
DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5); DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
matrix<float> truth5 = mat(B) > 0.1; matrix<float> truth5 = mat(B) > 0.1;
...@@ -418,25 +421,34 @@ namespace ...@@ -418,25 +421,34 @@ namespace
dest.set_size(1,4); dest.set_size(1,4);
tt::multiply(dest, A, B); tt::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
A.set_size(1,4); A.set_size(1,4);
rnd.fill_uniform(A); rnd.fill_uniform(A);
matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA); matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
tt::multiply(dest, A, B); tt::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
tt::multiply(dest, B, A); tt::multiply(false, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
matrix<float> prevdest = mat(dest);
tt::multiply(true, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
dest.set_size(3,4); dest.set_size(3,4);
tt::multiply(dest, B, A); tt::multiply(false, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
prevdest = mat(dest);
tt::multiply(true, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
tt::multiply(dest, A, B); tt::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
prevdest = mat(dest);
tt::multiply(true, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
} }
{ {
...@@ -731,8 +743,11 @@ namespace ...@@ -731,8 +743,11 @@ namespace
rnd.fill_uniform(dest); rnd.fill_uniform(dest);
rnd.fill_uniform(src); rnd.fill_uniform(src);
dest2 = dest; src2 = src; dest2 = dest; src2 = src;
cuda::multiply(dest, dest, src); cuda::multiply(false, dest, dest, src);
cpu::multiply(dest2, dest2, src2); cpu::multiply(false, dest2, dest2, src2);
DLIB_TEST(equal(mat(dest),mat(dest2)));
cuda::multiply(true, dest, dest, src);
cpu::multiply(true, dest2, dest2, src2);
DLIB_TEST(equal(mat(dest),mat(dest2))); DLIB_TEST(equal(mat(dest),mat(dest2)));
...@@ -801,24 +816,30 @@ namespace ...@@ -801,24 +816,30 @@ namespace
dest.set_size(1,4); dest.set_size(1,4);
cuda::multiply(dest, A, B); cuda::multiply(false, dest, A, B);
DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B)))))); DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));
A.set_size(1,4); A.set_size(1,4);
rnd.fill_uniform(A); rnd.fill_uniform(A);
matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA); matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
cuda::multiply(dest, A, B); cuda::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
cuda::multiply(dest, B, A); cuda::multiply(false, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
matrix<float> prevdest = mat(dest);
cuda::multiply(true, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
dest.set_size(3,4); dest.set_size(3,4);
cuda::multiply(dest, B, A); cuda::multiply(false, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
prevdest = mat(dest);
cuda::multiply(true, dest, B, A);
DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
cuda::multiply(dest, A, B); cuda::multiply(false, dest, A, B);
DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6); DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
} }
} }
...@@ -955,8 +976,11 @@ namespace ...@@ -955,8 +976,11 @@ namespace
trand.fill_uniform(src1); trand.fill_uniform(src1);
trand.fill_uniform(src2); trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2); cpu::multiply_conv(false, dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2); cuda::multiply_conv(false, dest2, src1, src2);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
cpu::multiply_conv(true, dest1, src1, src2);
cuda::multiply_conv(true, dest2, src1, src2);
DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5); DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
...@@ -968,12 +992,19 @@ namespace ...@@ -968,12 +992,19 @@ namespace
trand.fill_uniform(dest2); trand.fill_uniform(dest2);
trand.fill_uniform(src1); trand.fill_uniform(src1);
trand.fill_uniform(src2); trand.fill_uniform(src2);
cpu::multiply_conv(dest1, src1, src2); cpu::multiply_conv(false, dest1, src1, src2);
cuda::multiply_conv(dest2, src1, src2); cuda::multiply_conv(false, dest2, src1, src2);
const float scale = max(abs(mat(dest1))); float scale = max(abs(mat(dest1)));
const float scalem = mean(abs(mat(dest1))); float scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale); DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem); DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
matrix<float> prevd2 = mat(dest2);
cpu::multiply_conv(false, dest1, src1, src2);
cuda::multiply_conv(true, dest2, src1, src2);
scale = max(abs(mat(dest1)));
scalem = mean(abs(mat(dest1)));
DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)+prevd2))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)+prevd2))/scale);
DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem);
} }
for (int iter = 0; iter < 100; ++iter) for (int iter = 0; iter < 100; ++iter)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment