Commit f7310f4b authored Aug 11, 2017 by Davis King
Added multiply_zero_padded()
parent 46a02d94
Showing 7 changed files with 305 additions and 0 deletions
dlib/dnn/cpu_dlib.cpp       +73  -0
dlib/dnn/cpu_dlib.h          +7  -0
dlib/dnn/cuda_dlib.cu      +128  -0
dlib/dnn/cuda_dlib.h         +7  -0
dlib/dnn/tensor_tools.cpp   +14  -0
dlib/dnn/tensor_tools.h     +17  -0
dlib/test/dnn.cpp           +59  -0
dlib/dnn/cpu_dlib.cpp
...
...
@@ -265,6 +265,79 @@ namespace dlib
}
}
// ----------------------------------------------------------------------------------------
void multiply_zero_padded (
    bool add_to,
    tensor& dest,
    const tensor& src1,
    const tensor& src2
)
{
    auto d = dest.host();
    auto s1 = src1.host();
    auto s2 = src2.host();

    // Do the simple and fast version if everything has the same dimensions
    if (have_same_dimensions(dest, src1) &&
        have_same_dimensions(dest, src2))
    {
        if (add_to)
        {
            for (size_t i = 0; i < dest.size(); ++i)
                d[i] += s1[i]*s2[i];
        }
        else
        {
            for (size_t i = 0; i < dest.size(); ++i)
                d[i] = s1[i]*s2[i];
        }
        return;
    }

    // Otherwise, do the more complex version with bounds checking.
    for (long n = 0; n < dest.num_samples(); ++n)
    {
        for (long k = 0; k < dest.k(); ++k)
        {
            for (long r = 0; r < dest.nr(); ++r)
            {
                for (long c = 0; c < dest.nc(); ++c)
                {
                    float v1 = 0;
                    float v2 = 0;

                    // if this index is inside src1
                    if (n < src1.num_samples() && k < src1.k() && r < src1.nr() && c < src1.nc())
                    {
                        const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
                        v1 = s1[s_idx];
                    }

                    // if this index is inside src2
                    if (n < src2.num_samples() && k < src2.k() && r < src2.nr() && c < src2.nc())
                    {
                        const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
                        v2 = s2[s_idx];
                    }

                    if (add_to)
                        *d += v1*v2;
                    else
                        *d = v1*v2;
                    ++d;
                }
            }
        }
    }
}

// ----------------------------------------------------------------------------------------

void assign_bias_gradient (
...
...
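The bounds-checked path above advances d linearly through dest while recomputing a flat offset into each source as ((n*k() + k)*nr() + r)*nc() + c, i.e. the ordinary row-major offset for a 4-D tensor laid out as (num_samples, k, nr, nc). A minimal standalone sketch of that arithmetic (illustrative shapes only, not part of this commit):

#include <cassert>
#include <cstddef>

// Row-major offset for a 4-D tensor of shape (N, K, R, C), matching the
// s_idx computation in multiply_zero_padded's bounds-checked loop.
inline std::size_t flat_index(long k_dim, long r_dim, long c_dim,
                              long n, long k, long r, long c)
{
    return std::size_t(((n*k_dim + k)*r_dim + r)*c_dim + c);
}

int main()
{
    // Walking a 2x3x4x5 tensor in (n,k,r,c) order visits offsets 0,1,2,...
    // which is exactly how the loop above advances the d pointer.
    std::size_t expected = 0;
    for (long n = 0; n < 2; ++n)
        for (long k = 0; k < 3; ++k)
            for (long r = 0; r < 4; ++r)
                for (long c = 0; c < 5; ++c)
                    assert(flat_index(3, 4, 5, n, k, r, c) == expected++);
    return 0;
}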
dlib/dnn/cpu_dlib.h
...
...
@@ -30,6 +30,13 @@ namespace dlib
    const tensor& src2
);

void multiply_zero_padded (
    bool add_to,
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);

void add (
    float beta,
    tensor& dest,
...
dlib/dnn/cuda_dlib.cu
...
...
@@ -513,6 +513,134 @@ namespace dlib
}
// ------------------------------------------------------------------------------------
__global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
{
for (auto i : grid_stride_range(0, n))
{
d[i] = s1[i]*s2[i];
}
}
__global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
{
for (auto i : grid_stride_range(0, n))
{
d[i] += s1[i]*s2[i];
}
}
__global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
size_t dn, size_t dk, size_t dr, size_t dc,
size_t s1n, size_t s1k, size_t s1r, size_t s1c,
size_t s2n, size_t s2k, size_t s2r, size_t s2c)
{
for (auto i : grid_stride_range(0, dn*dk*dr*dc))
{
size_t n,k,r,c;
unpack_idx(i, dk,dr,dc, n,k,r,c);
float v1 = 0;
float v2 = 0;
if (n < s1n &&
k < s1k &&
r < s1r &&
c < s1c )
{
v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
}
if (n < s2n &&
k < s2k &&
r < s2r &&
c < s2c )
{
v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
}
d[i] = v1*v2;
}
}
__global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
size_t dn, size_t dk, size_t dr, size_t dc,
size_t s1n, size_t s1k, size_t s1r, size_t s1c,
size_t s2n, size_t s2k, size_t s2r, size_t s2c)
{
for (auto i : grid_stride_range(0, dn*dk*dr*dc))
{
size_t n,k,r,c;
unpack_idx(i, dk,dr,dc, n,k,r,c);
float v1 = 0;
float v2 = 0;
if (n < s1n &&
k < s1k &&
r < s1r &&
c < s1c )
{
v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
}
if (n < s2n &&
k < s2k &&
r < s2r &&
c < s2c )
{
v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
}
d[i] += v1*v2;
}
}
void multiply_zero_padded (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
if (dest.size() == 0)
return;
// Do the simple and fast version if everything has the same dimensions
if (have_same_dimensions(dest, src1) &&
have_same_dimensions(dest, src2))
{
if (add_to)
launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
else
launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
}
else
{
if (add_to)
{
// Otherwise, do the more complex version with bounds checking.
launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()),
dest.device(), src1.device(), src2.device(),
dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
src2.num_samples(), src2.k(), src2.nr(), src2.nc()
);
}
else
{
// Otherwise, do the more complex version with bounds checking.
launch_kernel(_cuda_mult2,max_jobs(dest.size()),
dest.device(), src1.device(), src2.device(),
dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
src2.num_samples(), src2.k(), src2.nr(), src2.nc()
);
}
}
}
// ------------------------------------------------------------------------------------
__global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
...
...
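The _cuda_mult2 kernels depend on unpack_idx() and pack_idx() helpers that live elsewhere in dlib's CUDA code. The host-side sketch below only illustrates the behavior those helpers are assumed to have here, converting between a flat row-major index and (n, k, r, c) coordinates; it is not the commit's actual device code:

#include <cstddef>

// Assumed behavior of the helpers used by _cuda_mult2/_cuda_mult2_add_to:
// pack_idx builds a flat row-major index from (n,k,r,c) given the k/nr/nc
// dimensions, and unpack_idx inverts it.  Illustrative only.
inline std::size_t pack_idx(std::size_t k_dim, std::size_t r_dim, std::size_t c_dim,
                            std::size_t n, std::size_t k, std::size_t r, std::size_t c)
{
    return ((n*k_dim + k)*r_dim + r)*c_dim + c;
}

inline void unpack_idx(std::size_t idx,
                       std::size_t k_dim, std::size_t r_dim, std::size_t c_dim,
                       std::size_t& n, std::size_t& k, std::size_t& r, std::size_t& c)
{
    c = idx % c_dim;  idx /= c_dim;
    r = idx % r_dim;  idx /= r_dim;
    k = idx % k_dim;  idx /= k_dim;
    n = idx;
}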
dlib/dnn/cuda_dlib.h
...
...
@@ -185,6 +185,13 @@ namespace dlib
    const tensor& src2
);

void multiply_zero_padded (
    bool add_to,
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);

void add (
    tensor& dest,
    const tensor& src1,
...
...
dlib/dnn/tensor_tools.cpp
...
...
@@ -304,6 +304,20 @@ namespace dlib { namespace tt
#endif
}
void multiply_zero_padded (
    bool add_to,
    tensor& dest,
    const tensor& src1,
    const tensor& src2
)
{
#ifdef DLIB_USE_CUDA
    cuda::multiply_zero_padded(add_to, dest, src1, src2);
#else
    cpu::multiply_zero_padded(add_to, dest, src1, src2);
#endif
}

// ----------------------------------------------------------------------------------------

void affine_transform (
...
...
dlib/dnn/tensor_tools.h
...
...
@@ -306,6 +306,23 @@ namespace dlib { namespace tt
- Instead of assigning the result to dest, this function adds the result to dest.
!*/
void multiply_zero_padded (
    bool add_to,
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);
/*!
    ensures
        - if (add_to) then
            - performs: dest += src1 * src2
        - else
            - performs: dest = src1 * src2
        - In either case, the multiplication happens pointwise according to 4D tensor
          arithmetic.  If the dimensions don't match then missing elements are presumed
          to be equal to 0.
!*/
// ----------------------------------------------------------------------------------------
void affine_transform (
...
...
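Given the spec above, a typical call through the tt:: dispatch layer looks like the sketch below (a usage example, not code from this commit; the shapes are made up, with src2 deliberately smaller than dest so the zero-padding behavior is visible):

#include <dlib/dnn.h>

using namespace dlib;

int main()
{
    // dest and src1 are 2x3x4x4; src2 only covers a 2x3x2x2 corner, so every
    // element of dest outside that corner is multiplied by an implicit 0.
    resizable_tensor dest(2,3,4,4), src1(2,3,4,4), src2(2,3,2,2);
    tt::tensor_rand rnd;
    rnd.fill_uniform(src1);
    rnd.fill_uniform(src2);

    // dest = src1 .* zero_padded(src2)
    tt::multiply_zero_padded(false, dest, src1, src2);

    // dest += src1 .* zero_padded(src2)
    tt::multiply_zero_padded(true, dest, src1, src2);
    return 0;
}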
dlib/test/dnn.cpp
...
...
@@ -904,6 +904,64 @@ namespace
    DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
}
void test_multiply_zero_padded()
{
    print_spinner();
    dlib::rand rnd;
    tt::tensor_rand trnd;
    for (int iter = 0; iter < 300; ++iter)
    {
        resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                               rnd.get_random_32bit_number()%4+1,
                               rnd.get_random_32bit_number()%4+1,
                               rnd.get_random_32bit_number()%4+1);
        resizable_tensor dest2;
        dest2.copy_size(dest1);
        resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1);
        resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1,
                              rnd.get_random_32bit_number()%4+1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);
        cpu::multiply_zero_padded(false, dest1, src1, src2);
        cuda::multiply_zero_padded(false, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
        cpu::multiply_zero_padded(true, dest1, src1, src2);
        cuda::multiply_zero_padded(true, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
    }

    // make sure we have a test for the case where all tensors have the same
    // dimensions.
    resizable_tensor dest1(3,4,5,6);
    resizable_tensor dest2;
    resizable_tensor src1;
    resizable_tensor src2;
    dest2.copy_size(dest1);
    src1.copy_size(dest1);
    src2.copy_size(dest1);

    trnd.fill_uniform(dest1);
    trnd.fill_uniform(dest2);
    trnd.fill_uniform(src1);
    trnd.fill_uniform(src2);
    cpu::multiply_zero_padded(false, dest1, src1, src2);
    cuda::multiply_zero_padded(false, dest2, src1, src2);
    DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
    cpu::multiply_zero_padded(true, dest1, src1, src2);
    cuda::multiply_zero_padded(true, dest2, src1, src2);
    DLIB_TEST(max(abs(mat(dest1)-mat(dest2))) < 1e-5);
}
void test_add()
{
    print_spinner();
...
...
@@ -2606,6 +2664,7 @@ namespace
        compare_bn_gpu_and_cpu();
        compare_bn_conv_gpu_and_cpu();
        test_add();
        test_multiply_zero_padded();
        compare_adam();
        test_copy_tensor_gpu();
#endif
...
...