Commit 8421f213 authored by Davis King

Fixed the in-place layers so that they don't interfere with the operation of
skip layers and add_prev style layers.  In particular, in-place layers now only
overwrite the gradient information in their child layer if they are operating
in in-place mode.  Otherwise, they add their gradients to their child layers.

It should also be noted that it's safe for in-place layers to overwrite
gradients when in in-place mode, since their child layers are inaccessible while
in-place layers operate in in-place mode.  This prevents any other layers from
trying to add to the child layer, thereby avoiding the possibility of layer
interference.  So the bug this change fixes is that, when not in in-place mode,
the child layers are still accessible, but in-place layers were *still*
overwriting child gradients.
parent 81eb18a4
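To make the convention concrete, here is a minimal sketch of the rule the commit message describes, written with plain std::vector rather than dlib's tensor type (the name backward_sketch and the 2*x derivative are illustrative placeholders, not code from this commit):

```cpp
#include <vector>
#include <cstddef>

// Sketch of the gradient-output rule: overwrite the child's gradient only when
// operating in-place (the two buffers alias), otherwise accumulate into it so
// that skip/add_prev style layers can also contribute their share.
void backward_sketch(
    const std::vector<float>& gradient_input,
    std::vector<float>& data_grad
)
{
    const bool running_in_place = (&gradient_input == &data_grad);
    for (std::size_t i = 0; i < data_grad.size(); ++i)
    {
        const float g = 2*gradient_input[i]; // stand-in for the layer's real derivative
        if (running_in_place)
            data_grad[i] = g;   // safe: no other layer can see this buffer
        else
            data_grad[i] += g;  // others may have added here already; don't clobber
    }
}
```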
@@ -2849,8 +2849,11 @@ namespace dlib
         resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
         params_grad1 = params_grad;
         params_grad2 = params_grad;
-        // Now call backward() and make sure it works as well.
-        subnetwork2.get_gradient_input() = 9999;
+        // Now call backward() and make sure it works as well.  Recall that when an
+        // in-place layer works in-place it assigns to its outputs, but when it's
+        // not running in-place it adds.  So we initialize to a non-zero value to
+        // check that this is the behavior that really executes.
+        subnetwork2.get_gradient_input() = 9;
         impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
         data_grad1 = subnetwork2.get_gradient_input();
@@ -2868,7 +2871,7 @@ namespace dlib
                 return layer_test_results(sout.str());
             }
         }
-        const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
+        const auto backward_data_error = max(abs(mat(data_grad1)-9 - mat(data_grad2)));
         if (backward_data_error > 0.00001)
         {
             using namespace std;
@@ -2934,8 +2937,7 @@ namespace dlib
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
             double output_derivative = subnetwork.get_gradient_input_element(i);
-            if (!impl::is_inplace_layer(l,subnetwork))
-                output_derivative -= initial_gradient_input[i];
+            output_derivative -= initial_gradient_input[i];
             double relative_error;
             if (reference_derivative != 0)
                 relative_error = (reference_derivative - output_derivative)/(reference_derivative);
......
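The test changes above hinge on a small piece of arithmetic: the child's gradient buffer is seeded with 9, so a layer that correctly *adds* (because it is not running in-place) leaves 9 plus the gradient in the buffer, and the true gradient is recovered by subtracting 9, which is exactly what the updated `mat(data_grad1)-9` comparison does. A toy version of that check (plain C++, hypothetical values):

```cpp
#include <cassert>

int main()
{
    float child_grad = 9;            // the test seeds the buffer with a non-zero value
    const float layer_grad = 0.25f;  // hypothetical gradient produced by the layer
    child_grad += layer_grad;        // convention: add when not running in-place
    assert(child_grad - 9 == layer_grad); // recovered exactly, as the test expects
    return 0;
}
```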
@@ -16,6 +16,7 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -38,24 +39,44 @@
         const auto s2 = src2.host();
         if (dest.size() == src1.size() && src1.size() == src2.size())
         {
-            for (size_t i = 0; i < src1.size(); ++i)
-                d[i] = s1[i]*s2[i];
+            if (add_to)
+            {
+                for (size_t i = 0; i < src1.size(); ++i)
+                    d[i] += s1[i]*s2[i];
+            }
+            else
+            {
+                for (size_t i = 0; i < src1.size(); ++i)
+                    d[i] = s1[i]*s2[i];
+            }
         }
         else if (dest.num_samples() == 1)
         {
-            for (size_t i = 0; i < dest.size(); ++i)
-                d[i] = 0;
+            if (!add_to)
+            {
+                for (size_t i = 0; i < dest.size(); ++i)
+                    d[i] = 0;
+            }
             for (size_t i = 0; i < max_size; ++i)
                 d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()];
         }
         else
         {
-            for (size_t i = 0; i < max_size; ++i)
-                d[i] = s1[i%src1.size()]*s2[i%src2.size()];
+            if (add_to)
+            {
+                for (size_t i = 0; i < max_size; ++i)
+                    d[i] += s1[i%src1.size()]*s2[i%src2.size()];
+            }
+            else
+            {
+                for (size_t i = 0; i < max_size; ++i)
+                    d[i] = s1[i%src1.size()]*s2[i%src2.size()];
+            }
         }
     }
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -68,15 +89,34 @@
         {
             DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k(),"");
-            for (long n = 0; n < dest.num_samples(); ++n)
+            if (add_to)
             {
-                for (long k = 0; k < dest.k(); ++k)
+                for (long n = 0; n < dest.num_samples(); ++n)
                 {
-                    for (long r = 0; r < dest.nr(); ++r)
+                    for (long k = 0; k < dest.k(); ++k)
                     {
-                        for (long c = 0; c < dest.nc(); ++c)
+                        for (long r = 0; r < dest.nr(); ++r)
                         {
-                            *d++ = (*s1++)*s2[k];
+                            for (long c = 0; c < dest.nc(); ++c)
+                            {
+                                *d++ += (*s1++)*s2[k];
+                            }
                         }
                     }
                 }
            }
+            else
+            {
+                for (long n = 0; n < dest.num_samples(); ++n)
+                {
+                    for (long k = 0; k < dest.k(); ++k)
+                    {
+                        for (long r = 0; r < dest.nr(); ++r)
+                        {
+                            for (long c = 0; c < dest.nc(); ++c)
+                            {
+                                *d++ = (*s1++)*s2[k];
+                            }
+                        }
+                    }
+                }
+            }
@@ -87,8 +127,11 @@ namespace dlib
             DLIB_CASSERT(have_same_dimensions(src1,src2),"");
             DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k(),"");
-            for (long k = 0; k < src1.k(); ++k)
-                d[k] = 0;
+            if (!add_to)
+            {
+                for (long k = 0; k < src1.k(); ++k)
+                    d[k] = 0;
+            }
             for (long n = 0; n < src1.num_samples(); ++n)
             {
@@ -1105,8 +1148,16 @@ namespace dlib
                     float temp = 0;
                     for (long k = 0; k < grad.k(); ++k)
                         temp += -d3[k*num]*in3[k*num];
-                    for (long k = 0; k < grad.k(); ++k)
-                        g3[k*num] = d3[k*num]*(temp+in3[k*num]);
+                    if (is_same_object(gradient_input, grad))
+                    {
+                        for (long k = 0; k < grad.k(); ++k)
+                            g3[k*num] = d3[k*num]*(temp+in3[k*num]);
+                    }
+                    else
+                    {
+                        for (long k = 0; k < grad.k(); ++k)
+                            g3[k*num] += d3[k*num]*(temp+in3[k*num]);
+                    }
                 }
             }
         }
@@ -1134,8 +1185,16 @@ namespace dlib
         const auto g = grad.host();
         const auto d = dest.host();
         const auto in = gradient_input.host();
-        for (size_t i = 0; i < dest.size(); ++i)
-            g[i] = in[i]*d[i]*(1-d[i]);
+        if (is_same_object(gradient_input, grad))
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] = in[i]*d[i]*(1-d[i]);
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] += in[i]*d[i]*(1-d[i]);
+        }
     }
 // ------------------------------------------------------------------------------------
@@ -1157,12 +1216,23 @@ namespace dlib
         const float* gi = gradient_input.host();
         const float* in = dest.host();
         float* out = grad.host();
-        for (size_t i = 0; i < dest.size(); ++i)
+        if (is_same_object(grad, gradient_input))
         {
-            if (in[i] > 0)
-                out[i] = gi[i];
-            else
-                out[i] = 0;
+            for (size_t i = 0; i < dest.size(); ++i)
+            {
+                if (in[i] > 0)
+                    out[i] = gi[i];
+                else
+                    out[i] = 0;
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+            {
+                if (in[i] > 0)
+                    out[i] += gi[i];
+            }
         }
     }
@@ -1194,6 +1264,7 @@ namespace dlib
         tensor& params_grad
     )
     {
+        DLIB_CASSERT(is_same_object(grad, gradient_input) == false,"");
         const float p = param.host()[0];
         const float* gi = gradient_input.host();
         const float* s = src.host();
@@ -1236,8 +1307,16 @@ namespace dlib
         const auto g = grad.host();
         const auto d = dest.host();
         const auto in = gradient_input.host();
-        for (size_t i = 0; i < dest.size(); ++i)
-            g[i] = in[i]*(1-d[i]*d[i]);
+        if (is_same_object(grad, gradient_input))
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] = in[i]*(1-d[i]*d[i]);
+        }
+        else
+        {
+            for (size_t i = 0; i < dest.size(); ++i)
+                g[i] += in[i]*(1-d[i]*d[i]);
+        }
     }
 // ------------------------------------------------------------------------------------
......
@@ -16,12 +16,14 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     );
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
......
@@ -122,7 +122,34 @@ namespace dlib
             }
         }
+        __global__ void _cuda_multiply1_add_to(float* d, const float* s1, const float* s2, size_t n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] += s1[i]*s2[i];
+            }
+        }
+        __global__ void _cuda_multiply2_add_to(float* d, const float* s1, const float* s2,
+                                               size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                for (size_t j = i; j < max_size; j += n)
+                    d[i] += s1[j%s1_n]*s2[j%s2_n];
+            }
+        }
+        __global__ void _cuda_multiply3_add_to(float* d, const float* s1, const float* s2,
+                                               size_t n, size_t s1_n, size_t s2_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] += s1[i%s1_n]*s2[i%s2_n];
+            }
+        }
         void multiply (
+            bool add_to,
             tensor& dest,
             const tensor& src1,
             const tensor& src2
@@ -146,17 +173,28 @@
         const auto s2 = src2.host();
         if (dest.size() == src1.size() && src1.size() == src2.size())
         {
-            launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+            if (add_to)
+                launch_kernel(_cuda_multiply1_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
+            else
+                launch_kernel(_cuda_multiply1,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), src1.size());
         }
         else if (dest.num_samples() == 1)
         {
-            launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
-                dest.size(), src1.size(), src2.size(), max_size);
+            if (add_to)
+                launch_kernel(_cuda_multiply2_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size(), max_size);
+            else
+                launch_kernel(_cuda_multiply2,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size(), max_size);
         }
         else
         {
-            launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
-                dest.size(), src1.size(), src2.size());
+            if (add_to)
+                launch_kernel(_cuda_multiply3_add_to,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size());
+            else
+                launch_kernel(_cuda_multiply3,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(),
+                    dest.size(), src1.size(), src2.size());
         }
     }
@@ -191,8 +229,33 @@ namespace dlib
             }
         }
+        __global__ void _cuda_multiply_conv_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                auto k = (i/bs)%ks;
+                d[i] += s1[i]*s2[k];
+            }
+        }
+        __global__ void _cuda_multiply_conv2_add_to(float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks)
+        {
+            // loop over all the image planes
+            for (auto i : grid_stride_range_y(0, n))
+            {
+                // sum all the elements in the i-th image plane
+                float temp = 0;
+                for (auto j : grid_stride_range(i*bs, (i+1)*bs))
+                    temp += s1[j]*s2[j];
+                auto k = i%ks;
+                // and store the sum into d[k]
+                warp_reduce_atomic_add(d[k], temp);
+            }
+        }
         void multiply_conv (
+            bool add_to,
             tensor& dest,
             const tensor& src1,
             const tensor& src2
@@ -204,8 +267,12 @@ namespace dlib
             if (dest.size() == 0)
                 return;
-            launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
-                dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            if (add_to)
+                launch_kernel(_cuda_multiply_conv_add_to,max_jobs(dest.size()),
+                    dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            else
+                launch_kernel(_cuda_multiply_conv,max_jobs(dest.size()),
+                    dest.device(), src1.device(), src1.size(), src2.device(), src1.nr()*src1.nc(), src1.k());
         }
         else
         {
@@ -216,8 +283,12 @@ namespace dlib
             dim3 blocks(10,1);
             dim3 threads(32,32); // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
-            _cuda_multiply_conv2<<<blocks,threads>>>(
-                dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            if (add_to)
+                _cuda_multiply_conv2_add_to<<<blocks,threads>>>(
+                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
+            else
+                _cuda_multiply_conv2<<<blocks,threads>>>(
+                    dest.device(), src1.device(), src1.num_samples()*src1.k(), src2.device(), src1.nr()*src1.nc(), src1.k());
         }
     }
......
@@ -102,12 +102,14 @@ namespace dlib
 // -----------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     );
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
......
@@ -1265,7 +1265,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                                          CUDNN_SOFTMAX_ACCURATE,
                                          CUDNN_SOFTMAX_MODE_CHANNEL,
@@ -1316,7 +1316,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             sigmoid_activation_descriptor(),
                                             &alpha,
@@ -1367,7 +1367,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             relu_activation_descriptor(),
                                             &alpha,
@@ -1418,7 +1418,7 @@ namespace dlib
             return;
         const float alpha = 1;
-        const float beta = 0;
+        const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
         CHECK_CUDNN(cudnnActivationBackward(context(),
                                             tanh_activation_descriptor(),
                                             &alpha,
......
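The one-line cuDNN changes above work because the cudnn*Backward routines blend their result into the output as grad = alpha*result + beta*grad, so beta selects between assignment and accumulation. A scalar illustration (not dlib code):

```cpp
// cuDNN blending: beta = 0 overwrites grad, beta = 1 adds to it.
float blend(float result, float grad, float alpha, float beta)
{
    return alpha*result + beta*grad;
}
// blend(r, g, 1, 0) == r      -> in-place case: assign
// blend(r, g, 1, 1) == r + g  -> non-in-place case: accumulate
```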
@@ -966,7 +966,7 @@ namespace dlib
             mask.copy_size(input);
             rnd.fill_uniform(mask);
             tt::threshold(mask, drop_rate);
-            tt::multiply(output, input, mask);
+            tt::multiply(false, output, input, mask);
         }
         void backward_inplace(
@@ -975,7 +975,10 @@ namespace dlib
             tensor& /*params_grad*/
         )
         {
-            tt::multiply(data_grad, mask, gradient_input);
+            if (is_same_object(gradient_input, data_grad))
+                tt::multiply(false, data_grad, mask, gradient_input);
+            else
+                tt::multiply(true, data_grad, mask, gradient_input);
         }
         const tensor& get_layer_params() const { return params; }
@@ -1044,7 +1047,7 @@ namespace dlib
         void forward_inplace(const tensor& input, tensor& output)
         {
-            tt::affine_transform(output, input, val, 0);
+            tt::affine_transform(output, input, val);
         }
         void backward_inplace(
@@ -1053,7 +1056,10 @@ namespace dlib
             tensor& /*params_grad*/
         )
         {
-            tt::affine_transform(data_grad, gradient_input, val, 0);
+            if (is_same_object(gradient_input, data_grad))
+                tt::affine_transform(data_grad, gradient_input, val);
+            else
+                tt::affine_transform(data_grad, data_grad, gradient_input, 1, val);
         }
         const tensor& get_layer_params() const { return params; }
@@ -1187,11 +1193,17 @@ namespace dlib
             // We are computing the gradient of dot(gradient_input, computed_output*g + b)
             if (mode == FC_MODE)
             {
-                tt::multiply(data_grad, gradient_input, g);
+                if (is_same_object(gradient_input, data_grad))
+                    tt::multiply(false, data_grad, gradient_input, g);
+                else
+                    tt::multiply(true, data_grad, gradient_input, g);
             }
             else
             {
-                tt::multiply_conv(data_grad, gradient_input, g);
+                if (is_same_object(gradient_input, data_grad))
+                    tt::multiply_conv(false, data_grad, gradient_input, g);
+                else
+                    tt::multiply_conv(true, data_grad, gradient_input, g);
             }
         }
......
@@ -287,7 +287,10 @@ namespace dlib
                 to data_input.
                 Finally, backward_inplace() outputs these gradients by performing:
                     - params_grad = PARAMETER_GRADIENT
-                    - data_grad = DATA_GRADIENT
+                    - if (is_same_object(gradient_input, data_grad)) then
+                        - data_grad = DATA_GRADIENT
+                    - else
+                        - data_grad += DATA_GRADIENT
         !*/
         const tensor& get_layer_params(
......
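The `data_grad += DATA_GRADIENT` branch in the spec above is what keeps multi-branch architectures correct: when one tensor feeds several branches, its total gradient is the sum of the gradients flowing back along each branch, so each branch must add its contribution. A toy model of that accumulation (plain C++, illustrative names):

```cpp
#include <vector>
#include <cstddef>

// Each branch adds its share of the gradient into the shared child buffer.
// If any branch assigned instead, the other branches' contributions would be lost.
void accumulate_branch_gradient(
    const std::vector<float>& branch_grad,
    std::vector<float>& child_grad
)
{
    for (std::size_t i = 0; i < child_grad.size(); ++i)
        child_grad[i] += branch_grad[i];
}
```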
@@ -127,6 +127,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -140,23 +141,24 @@ namespace dlib { namespace tt
             (src1.num_samples()==1 || src1.num_samples()==MD) &&
             (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
 #ifdef DLIB_USE_CUDA
-        cuda::multiply(dest, src1, src2);
+        cuda::multiply(add_to, dest, src1, src2);
 #else
-        cpu::multiply(dest, src1, src2);
+        cpu::multiply(add_to, dest, src1, src2);
 #endif
     }
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::multiply_conv(dest, src1, src2);
+        cuda::multiply_conv(add_to, dest, src1, src2);
 #else
-        cpu::multiply_conv(dest, src1, src2);
+        cpu::multiply_conv(add_to, dest, src1, src2);
 #endif
     }
......
@@ -101,6 +101,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
     void multiply (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -124,9 +125,12 @@ namespace dlib { namespace tt
             Second, if dest.num_samples()==1, then after the pointwise multiplication of
             src1 with src2, the result has its samples summed to produce an output tensor
             with num_samples()==1 which is then assigned to #dest.
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
     !*/
     void multiply_conv (
+        bool add_to,
         tensor& dest,
         const tensor& src1,
         const tensor& src2
@@ -152,6 +156,8 @@ namespace dlib { namespace tt
                     #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
             - else
                     #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
+            - if (add_to) then
+                - Instead of assigning the result to dest, this function adds the result to dest.
     !*/
 // ----------------------------------------------------------------------------------------
@@ -865,11 +871,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad, dest)==false
         ensures
             - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
-              Then let f(SRC) == dot(gradient_input,dest)  Then this function computes the
-              gradient of f() with respect to SRC and adds it to grad.
+              Then let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -899,12 +907,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -934,12 +943,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
            - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
@@ -978,6 +988,7 @@ namespace dlib { namespace tt
             - have_same_dimensions(grad,gradient_input) == true
             - param.size() == 1
             - params_grad.size() == 1
+            - is_same_object(grad, gradient_input) == false
         ensures
             - Recalling that dest is the output of prelu(dest,src,param) let
               f(src,param) == dot(gradient_input,dest)
@@ -1011,12 +1022,13 @@ namespace dlib { namespace tt
         requires
             - have_same_dimensions(dest,gradient_input) == true
             - have_same_dimensions(dest,grad) == true
+            - is_same_object(grad,dest) == false
         ensures
             - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
-              let f(SRC) == dot(gradient_input,dest)
-            - Then this function computes the gradient of f() with respect to SRC and
-              assigns it to grad.
+              let f(SRC) == dot(gradient_input,dest).  Then this function computes the
+              gradient of f() with respect to SRC and stores it to grad.  Moreover, if
+              is_same_object(grad,gradient_input)==true then the output is assigned to
+              grad, replacing its previous contents.  Otherwise the output is added to
+              grad.
             - This function supports in-place operation, i.e. having
              is_same_object(grad, gradient_input)==true
     !*/
......
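For reference, a minimal usage sketch of the new add_to flag on tt::multiply(), following the spec above (assumes dlib's headers as of this commit; the sizes and values are arbitrary):

```cpp
#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor dest(2,3), src1(2,3), src2(2,3);
    src1 = 1; src2 = 3; dest = 0;                 // scalar assignment fills a tensor
    dlib::tt::multiply(false, dest, src1, src2);  // dest  = src1 .* src2  (all 3s)
    dlib::tt::multiply(true,  dest, src1, src2);  // dest += src1 .* src2  (all 6s)
    return 0;
}
```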
@@ -362,7 +362,10 @@ namespace
         DLIB_TEST(max(abs(truth3-mat(dest))) < 1e-5);
         matrix<float> truth4 = pointwise_multiply(mat(A), mat(B));
-        tt::multiply(A, A, B);
+        tt::multiply(false, A, A, B);
         DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
+        truth4 = pointwise_multiply(mat(A), mat(B)) + mat(A);
+        tt::multiply(true, A, A, B);
+        DLIB_TEST(max(abs(truth4-mat(A))) < 1e-5);
         matrix<float> truth5 = mat(B) > 0.1;
@@ -418,25 +421,34 @@ namespace
             dest.set_size(1,4);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
             A.set_size(1,4);
             rnd.fill_uniform(A);
             matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
-            tt::multiply(dest, B, A);
+            tt::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
+            matrix<float> prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
             dest.set_size(3,4);
-            tt::multiply(dest, B, A);
+            tt::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
-            tt::multiply(dest, A, B);
+            tt::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            tt::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
         }
         {
@@ -731,8 +743,11 @@ namespace
             rnd.fill_uniform(dest);
             rnd.fill_uniform(src);
             dest2 = dest; src2 = src;
-            cuda::multiply(dest, dest, src);
-            cpu::multiply(dest2, dest2, src2);
+            cuda::multiply(false, dest, dest, src);
+            cpu::multiply(false, dest2, dest2, src2);
+            DLIB_TEST(equal(mat(dest),mat(dest2)));
+            cuda::multiply(true, dest, dest, src);
+            cpu::multiply(true, dest2, dest2, src2);
             DLIB_TEST(equal(mat(dest),mat(dest2)));
@@ -801,24 +816,30 @@ namespace
             dest.set_size(1,4);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6, max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));
             A.set_size(1,4);
             rnd.fill_uniform(A);
             matrix<float> AA = join_cols(mat(A),mat(A)); AA = join_cols(mat(A),AA);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
-            cuda::multiply(dest, B, A);
+            cuda::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
+            matrix<float> prevdest = mat(dest);
+            cuda::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-sum_rows(pointwise_multiply(AA,mat(B))))) < 1e-6);
             dest.set_size(3,4);
-            cuda::multiply(dest, B, A);
+            cuda::multiply(false, dest, B, A);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
+            prevdest = mat(dest);
+            cuda::multiply(true, dest, B, A);
+            DLIB_TEST(max(abs(mat(dest)-prevdest-pointwise_multiply(AA,mat(B)))) < 1e-6);
-            cuda::multiply(dest, A, B);
+            cuda::multiply(false, dest, A, B);
             DLIB_TEST(max(abs(mat(dest)-pointwise_multiply(AA,mat(B)))) < 1e-6);
         }
     }
@@ -955,8 +976,11 @@ namespace
             trand.fill_uniform(src1);
             trand.fill_uniform(src2);
-            cpu::multiply_conv(dest1, src1, src2);
-            cuda::multiply_conv(dest2, src1, src2);
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(false, dest2, src1, src2);
+            DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
+            cpu::multiply_conv(true, dest1, src1, src2);
+            cuda::multiply_conv(true, dest2, src1, src2);
             DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
@@ -968,12 +992,19 @@ namespace
             trand.fill_uniform(dest2);
             trand.fill_uniform(src1);
             trand.fill_uniform(src2);
-            cpu::multiply_conv(dest1, src1, src2);
-            cuda::multiply_conv(dest2, src1, src2);
-            const float scale = max(abs(mat(dest1)));
-            const float scalem = mean(abs(mat(dest1)));
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(false, dest2, src1, src2);
+            float scale = max(abs(mat(dest1)));
+            float scalem = mean(abs(mat(dest1)));
             DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)))/scale);
             DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)))/scalem);
+            matrix<float> prevd2 = mat(dest2);
+            cpu::multiply_conv(false, dest1, src1, src2);
+            cuda::multiply_conv(true, dest2, src1, src2);
+            scale = max(abs(mat(dest1)));
+            scalem = mean(abs(mat(dest1)));
+            DLIB_TEST_MSG(max(abs(mat(dest1)-mat(dest2)+prevd2))/scale < 1e-4 , max(abs(mat(dest1)-mat(dest2)+prevd2))/scale);
+            DLIB_TEST_MSG(mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem < 1e-5 , mean(abs(mat(dest1)-mat(dest2)+prevd2))/scalem);
         }
         for (int iter = 0; iter < 100; ++iter)
......