Commit 2ba29f65 authored Dec 09, 2015 by Davis King
Updated multiply()'s CUDA implementation to reflect its new features. Also added
a CUDA version of add_bias_gradient().
parent 2f34414e
Showing 6 changed files with 130 additions and 39 deletions
dlib/dnn/cuda_dlib.cu        +77  -35
dlib/dnn/cuda_dlib.h          +7   -0
dlib/dnn/cuda_utils.h        +35   -0
dlib/dnn/layers_abstract.h    +4   -0
dlib/dnn/tensor_tools.cpp     +1   -3
dlib/test/dnn.cpp             +6   -1
dlib/dnn/cuda_dlib.cu
@@ -48,13 +48,32 @@ namespace dlib
 // -----------------------------------------------------------------------------------

-        __global__ void _cuda_multiply(float* d, const float* s1, const float* s2, size_t n)
+        __global__ void _cuda_multiply1(float* d, const float* s1, const float* s2, size_t n)
         {
             for (auto i : grid_stride_range(0, n))
             {
                 d[i] = s1[i]*s2[i];
             }
         }
+
+        __global__ void _cuda_multiply2(float* d, const float* s1, const float* s2,
+                                        size_t n, size_t s1_n, size_t s2_n, size_t max_size)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] = 0;
+                for (size_t j = i; j < max_size; j += n)
+                    d[i] += s1[j%s1_n]*s2[j%s2_n];
+            }
+        }
+
+        __global__ void _cuda_multiply3(float* d, const float* s1, const float* s2,
+                                        size_t n, size_t s1_n, size_t s2_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] = s1[i%s1_n]*s2[i%s2_n];
+            }
+        }

         void multiply (
             tensor& dest,
@@ -62,9 +81,36 @@ namespace dlib
             const tensor& src2
         )
         {
-            DLIB_CASSERT(dest.size()==src1.size(),"");
-            DLIB_CASSERT(dest.size()==src2.size(),"");
-            _cuda_multiply<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+            DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
+                         dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
+                         dest.nc() == src1.nc() && src1.nc() == src2.nc() ,"");
+            const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
+            DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
+                         (src1.num_samples()==1 || src1.num_samples()==MD) &&
+                         (src2.num_samples()==1 || src2.num_samples()==MD) ,"");
+
+            if (dest.size() == 0)
+                return;
+
+            const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
+            const auto d = dest.host();
+            const auto s1 = src1.host();
+            const auto s2 = src2.host();
+            if (dest.size() == src1.size() && src1.size() == src2.size())
+            {
+                _cuda_multiply1<<<512,512>>>(dest.device(), src1.device(), src2.device(), src1.size());
+            }
+            else if (dest.num_samples() == 1)
+            {
+                _cuda_multiply2<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                             dest.size(), src1.size(), src2.size(), max_size);
+            }
+            else
+            {
+                _cuda_multiply3<<<512,512>>>(dest.device(), src1.device(), src2.device(),
+                                             dest.size(), src1.size(), src2.size());
+            }
         }

 // -----------------------------------------------------------------------------------
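The three kernels map onto the three shape cases multiply() now accepts: equal-sized tensors (plain elementwise product), a dest with num_samples()==1 (the products of the cyclically repeated inputs are summed into it), and a dest as large as the biggest input (the smaller inputs repeat). A host-side sketch of the same modular-indexing logic, for reference only; the function name reference_multiply and the raw-pointer interface are illustrative, not part of the commit:

    // Hedged sketch: the indexing scheme of _cuda_multiply1/2/3, run on the host.
    // d_n, s1_n, s2_n are the element counts of dest, src1, and src2.
    #include <algorithm>
    #include <cstddef>

    void reference_multiply(float* d, const float* s1, const float* s2,
                            std::size_t d_n, std::size_t s1_n, std::size_t s2_n)
    {
        const std::size_t max_size = std::max(std::max(d_n, s1_n), s2_n);
        if (d_n == s1_n && s1_n == s2_n)      // case 1: plain elementwise product
        {
            for (std::size_t i = 0; i < d_n; ++i)
                d[i] = s1[i]*s2[i];
        }
        else if (d_n < max_size)              // case 2: dest is smallest; sum the
        {                                     // products of the repeated inputs
            for (std::size_t i = 0; i < d_n; ++i)
            {
                d[i] = 0;
                for (std::size_t j = i; j < max_size; j += d_n)
                    d[i] += s1[j%s1_n]*s2[j%s2_n];
            }
        }
        else                                  // case 3: dest is biggest; inputs repeat
        {
            for (std::size_t i = 0; i < d_n; ++i)
                d[i] = s1[i%s1_n]*s2[i%s2_n];
        }
    }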
@@ -184,6 +230,33 @@ namespace dlib
             }
         }

 // -----------------------------------------------------------------------------------

+        __global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                out[i] = in[i];
+                for (size_t j = i+n; j < total_n; j+=n)
+                    out[i] += in[j];
+            }
+        }
+
+        void add_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        )
+        {
+            DLIB_CASSERT(
+                  grad.num_samples() == 1 &&
+                  gradient_input.k() == grad.k() &&
+                  gradient_input.nr() == grad.nr() &&
+                  gradient_input.nc() == grad.nc() &&
+                  gradient_input.size() > 0,"");
+
+            _add_bias_gradient<<<512,512>>>(grad.device(), gradient_input.device(), grad.size(), gradient_input.size());
+        }
+
 // -----------------------------------------------------------------------------------
 // -----------------------------------------------------------------------------------
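Each grid-stride index i owns one element of the single-sample grad tensor and walks gradient_input in strides of n, so out[i] ends up as the sum of that element over all samples. The same computation as a plain host loop (reference_add_bias_gradient is an illustrative name, not part of the commit):

    // Hedged sketch of what _add_bias_gradient computes:
    // out has n elements (one sample); in has total_n = num_samples*n elements.
    #include <cstddef>

    void reference_add_bias_gradient(float* out, const float* in,
                                     std::size_t n, std::size_t total_n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            out[i] = in[i];                                  // sample 0
            for (std::size_t j = i + n; j < total_n; j += n)
                out[i] += in[j];                             // same element, later samples
        }
    }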
@@ -363,37 +436,6 @@ namespace dlib
 // ----------------------------------------------------------------------------------------

-        // This function is from the article:
-        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-        __inline__ __device__ float warp_reduce_sum(float val)
-        {
-            for (int offset = warpSize/2; offset > 0; offset /= 2)
-                val += __shfl_down(val, offset);
-            return val;
-        }
-
-        __inline__ __device__ bool is_first_thread_in_warp()
-        {
-            return (threadIdx.x & (warpSize - 1)) == 0;
-        }
-
-        __inline__ __device__ void warp_reduce_atomic_add(
-            float& out,
-            float val
-        )
-        /*!
-            ensures
-                - Atomically adds all the val variables in the current warp to out.
-                  See this page for an extended discussion:
-                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-        !*/
-        {
-            val = warp_reduce_sum(val);
-            if (is_first_thread_in_warp())
-                atomicAdd(&out, val);
-        }
-
         __global__ void _cuda_batch_normalize_conv1(
             float* dest,
             float* means,
             ...
dlib/dnn/cuda_dlib.h

@@ -74,6 +74,13 @@ namespace dlib
             const tensor& B
         );

 // -----------------------------------------------------------------------------------

+        void add_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        );
+
+ // -----------------------------------------------------------------------------------
+
         void batch_normalize (
         ...
dlib/dnn/cuda_utils.h

@@ -34,6 +34,41 @@ namespace dlib
 {
     namespace cuda
     {

 // ------------------------------------------------------------------------------------

+        // This function is from the article:
+        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        __inline__ __device__ float warp_reduce_sum(float val)
+        {
+            for (int offset = warpSize/2; offset > 0; offset /= 2)
+                val += __shfl_down(val, offset);
+            return val;
+        }
+
+        __inline__ __device__ bool is_first_thread_in_warp()
+        {
+            return (threadIdx.x & (warpSize - 1)) == 0;
+        }
+
+        __inline__ __device__ void warp_reduce_atomic_add(
+            float& out,
+            float val
+        )
+        /*!
+            ensures
+                - Atomically adds all the val variables in the current warp to out.
+                  See this page for an extended discussion:
+                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        !*/
+        {
+            val = warp_reduce_sum(val);
+            if (is_first_thread_in_warp())
+                atomicAdd(&out, val);
+        }
+
+ // ------------------------------------------------------------------------------------
+
         class grid_stride_range
         {
             /*!
             ...
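These helpers implement the standard shuffle-based reduction: warp_reduce_sum folds all 32 lane values together with __shfl_down, and warp_reduce_atomic_add lets only the first lane issue the atomicAdd, so each warp costs one atomic instead of 32. A minimal sketch of a kernel built on them; the kernel name _example_sum is illustrative, not part of this commit, and assumes *out was zeroed before launch:

    // Hedged example: grid-stride partial sums, then one atomicAdd per warp.
    __global__ void _example_sum(float* out, const float* in, size_t n)
    {
        float v = 0;
        for (auto i : grid_stride_range(0, n))
            v += in[i];
        // Fold the per-thread partial sums within the warp; the first lane
        // of each warp then adds the warp's total into *out.
        warp_reduce_atomic_add(*out, v);
    }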
dlib/dnn/layers_abstract.h

@@ -520,6 +520,10 @@ namespace dlib
     template <typename SUBNET>
     using dropout = add_layer<dropout_, SUBNET>;

 // ----------------------------------------------------------------------------------------

+// TODO, add spec for bn_ and affine_ layers.
+
+// ----------------------------------------------------------------------------------------
+
     class relu_
     ...
dlib/dnn/tensor_tools.cpp

@@ -286,9 +286,7 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        // TODO
-        DLIB_CASSERT(false,"");
-        //cuda::add_bias_gradient(grad,gradient_input);
+        cuda::add_bias_gradient(grad,gradient_input);
 #else
         cpu::add_bias_gradient(grad,gradient_input);
 #endif
 ...
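With the CUDA branch wired up, callers reach whichever backend the build enables through the tt:: wrapper. A usage sketch (tensor names are illustrative; per the assertions above, the output must hold exactly one sample):

    // Hedged sketch: reduce per-sample gradients into a one-sample bias gradient.
    resizable_tensor params_grad;
    params_grad.set_size(1, gradient_input.k(), gradient_input.nr(), gradient_input.nc());
    tt::add_bias_gradient(params_grad, gradient_input);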
dlib/test/dnn.cpp

@@ -505,7 +505,7 @@ namespace
         dest.set_size(1,4);
         cuda::multiply(dest, A, B);
-        DLIB_TEST(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6);
+        DLIB_TEST_MSG(max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))) < 1e-6,
+                      max(abs(mat(dest)-sum_rows(pointwise_multiply(mat(A),mat(B))))));

         A.set_size(1,4);
         rnd.fill_uniform(A);
         ...
@@ -633,6 +633,11 @@ namespace
     void test_layers()
     {
+        {
+            print_spinner();
+            affine_ l;
+            DLIB_TEST_MSG(test_layer(l), test_layer(l));
+        }
         {
             print_spinner();
             bn_ l;
             ...