Commit b92b226c, authored May 22, 2016 by Davis King
Added learning rate and weight decay multipliers to the con_, fc_, and bn_
layers. Updated the solvers to support this.
parent 40f04beb
Showing 10 changed files with 562 additions and 32 deletions.
dlib/dnn/cpu_dlib.cpp          +4    -1
dlib/dnn/cpu_dlib.h            +2    -0
dlib/dnn/cuda_dlib.cu          +8    -4
dlib/dnn/cuda_dlib.h           +2    -0
dlib/dnn/layers.h              +149  -19
dlib/dnn/layers_abstract.h     +214  -2
dlib/dnn/solvers.h             +155  -4
dlib/dnn/solvers_abstract.h    +18   -0
dlib/dnn/tensor_tools.cpp      +4    -2
dlib/dnn/tensor_tools.h        +6    -0
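
Before the per-file diffs, a minimal usage sketch of the new API: the multipliers can be set and read directly on layer objects. This is an illustration written against the accessors added by this commit; the specific layer template arguments (a 10-output fc_ with a bias, a CONV_MODE bn_) are arbitrary choices for the example, not anything the commit prescribes.

    #include <dlib/dnn.h>
    #include <iostream>

    int main()
    {
        // fc_ starts with learning_rate/weight_decay multipliers of 1/1 for its weights
        // and 1/0 for its biases (see the new constructors in layers.h below).
        dlib::fc_<10, dlib::FC_HAS_BIAS> f;
        f.set_learning_rate_multiplier(0.1);      // ask the solver to train this layer 10x slower
        f.set_bias_learning_rate_multiplier(2);   // but let its biases move 2x faster than its weights

        // bn_ only gets the non-bias pair, and its weight decay multiplier defaults to 0.
        dlib::bn_<dlib::CONV_MODE> b;
        b.set_weight_decay_multiplier(0.001);

        std::cout << f.get_learning_rate_multiplier() << " "
                  << f.get_bias_learning_rate_multiplier() << " "
                  << b.get_weight_decay_multiplier() << "\n";   // prints: 0.1 2 0.001
    }

Setting a multiplier to 0 is also meaningful: the solvers then leave those parameters alone, and for con_ and fc_ the backward() pass skips the parameter-gradient computation entirely, as the layers.h diff below shows.
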
dlib/dnn/cpu_dlib.cpp

@@ -488,6 +488,8 @@ namespace dlib
 // -----------------------------------------------------------------------------------

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,
@@ -504,6 +506,7 @@ namespace dlib
             s.size() == v.size() &&
             s.size() == params.size() &&
             s.size() == params_grad.size(),"");
+        DLIB_CASSERT(begin <= end && end <= params.size(),"");
         const float eps = 1e-8;
         const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
@@ -516,7 +519,7 @@ namespace dlib
         auto ps = s.host_write_only();
         auto pparams = params.host();
         auto ppgrad = params_grad.host();
-        for (size_t i = 0; i < params.size(); ++i)
+        for (size_t i = begin; i < end; ++i)
         {
             float g = weight_decay*pparams[i] + ppgrad[i];
             pm[i] = momentum1*pm[i] + (1-momentum1)*g;

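Restated outside the diff, the loop above applies the standard Adam rule to each element in the half-open range [begin,end), with L2 weight decay folded into the gradient. A scalar sketch of one iteration, mirroring the code above (same symbols, not a separate algorithm):

    #include <cmath>

    // One element of the update computed by compute_adam_update(): returns s[i], the step
    // that later gets added to the parameter.  m and v are the per-element first/second
    // moment state, and t is the update counter used for bias correction.
    float adam_element(float param, float grad, float& m, float& v, float t,
                       float learning_rate, float weight_decay,
                       float momentum1, float momentum2)
    {
        const float eps = 1e-8;
        // bias-corrected step size, identical to the alpha computed before the loop
        const float alpha = learning_rate*std::sqrt(1 - std::pow(momentum2, t))
                                         /(1 - std::pow(momentum1, t));
        const float g = weight_decay*param + grad;   // L2 decay folded into the gradient
        m = momentum1*m + (1 - momentum1)*g;         // first moment estimate
        v = momentum2*v + (1 - momentum2)*g*g;       // second moment estimate
        return -alpha*m/(std::sqrt(v) + eps);        // s[i]
    }
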
dlib/dnn/cpu_dlib.h

@@ -114,6 +114,8 @@ namespace dlib
 // -----------------------------------------------------------------------------------

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,

dlib/dnn/cuda_dlib.cu

@@ -583,7 +583,8 @@ namespace dlib
 // ----------------------------------------------------------------------------------------

     __global__ void _cuda_compute_adam_update(
-        size_t n,
+        size_t begin,
+        size_t end,
         float* s,
         float* m,
         float* v,
@@ -600,7 +601,7 @@ namespace dlib
         // m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
         // v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
         // s = -alpha*m/(sqrt(v) + eps);
-        for (auto i : grid_stride_range(0, n))
+        for (auto i : grid_stride_range(begin, end))
         {
             float g = (weight_decay*params[i] + params_grad[i]);
             m[i] = momentum1*m[i] + (1-momentum1)*g;
@@ -610,6 +611,8 @@ namespace dlib
     }

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,
@@ -626,10 +629,11 @@ namespace dlib
             s.size() == v.size() &&
             s.size() == params.size() &&
             s.size() == params_grad.size(),"");
+        DLIB_CASSERT(begin <= end && end <= params.size(),"");
         const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
-        launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
-            s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
+        launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
+            begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
             momentum1, momentum2, params.device(), params_grad.device());
     }

dlib/dnn/cuda_dlib.h

@@ -205,6 +205,8 @@ namespace dlib
 // ----------------------------------------------------------------------------------------

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,

dlib/dnn/layers.h

@@ -42,6 +42,10 @@ namespace dlib
         con_(
         ) :
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(1),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(0),
             padding_y_(_padding_y),
             padding_x_(_padding_x)
         {}
@@ -54,12 +58,27 @@ namespace dlib
         long padding_y() const { return padding_y_; }
         long padding_x() const { return padding_x_; }

+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier = val; }
+
+        double get_bias_learning_rate_multiplier () const  { return bias_learning_rate_multiplier; }
+        double get_bias_weight_decay_multiplier () const   { return bias_weight_decay_multiplier; }
+        void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+        void set_bias_weight_decay_multiplier(double val)  { bias_weight_decay_multiplier = val; }
+
         con_ (
             const con_& item
         ) :
             params(item.params),
             filters(item.filters),
             biases(item.biases),
+            learning_rate_multiplier(item.learning_rate_multiplier),
+            weight_decay_multiplier(item.weight_decay_multiplier),
+            bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
+            bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
             padding_y_(item.padding_y_),
             padding_x_(item.padding_x_)
         {
@@ -81,6 +100,10 @@ namespace dlib
             biases = item.biases;
             padding_y_ = item.padding_y_;
             padding_x_ = item.padding_x_;
+            learning_rate_multiplier = item.learning_rate_multiplier;
+            weight_decay_multiplier = item.weight_decay_multiplier;
+            bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
+            bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
             return *this;
         }
@@ -121,18 +144,22 @@ namespace dlib
         void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
         {
             conv.get_gradient_for_data (gradient_input, filters(params,0), sub.get_gradient_input());
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
+            {
                 auto filt = filters(params_grad,0);
                 conv.get_gradient_for_filters (gradient_input, sub.get_output(), filt);
                 auto b = biases(params_grad, filters.size());
                 tt::assign_conv_bias_gradient(b, gradient_input);
+            }
         }

         const tensor& get_layer_params() const { return params; }
         tensor& get_layer_params() { return params; }

         friend void serialize(const con_& item, std::ostream& out)
         {
-            serialize("con_2", out);
+            serialize("con_3", out);
             serialize(item.params, out);
             serialize(_num_filters, out);
             serialize(_nr, out);
@@ -143,6 +170,10 @@ namespace dlib
             serialize(item.padding_x_, out);
             serialize(item.filters, out);
             serialize(item.biases, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
+            serialize(item.bias_learning_rate_multiplier, out);
+            serialize(item.bias_weight_decay_multiplier, out);
         }

         friend void deserialize(con_& item, std::istream& in)
@@ -167,7 +198,7 @@ namespace dlib
                 item.padding_y_ = nr/2;
                 item.padding_x_ = nc/2;
             }
-            else if (version == "con_2")
+            else if (version == "con_2" || version == "con_3")
             {
                 deserialize(item.params, in);
                 deserialize(num_filters, in);
@@ -180,6 +211,23 @@ namespace dlib
                 deserialize(item.filters, in);
                 deserialize(item.biases, in);
+                if (version == "con_3")
+                {
+                    deserialize(item.learning_rate_multiplier, in);
+                    deserialize(item.weight_decay_multiplier, in);
+                    deserialize(item.bias_learning_rate_multiplier, in);
+                    deserialize(item.bias_weight_decay_multiplier, in);
+                }
+                else
+                {
+                    // Previous versions didn't have these parameters, so they were
+                    // implicitly 1.
+                    item.learning_rate_multiplier = 1;
+                    item.weight_decay_multiplier = 1;
+                    item.bias_learning_rate_multiplier = 1;
+                    item.bias_weight_decay_multiplier = 1;
+                }
                 if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_");
                 if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_");
             }
@@ -207,6 +255,10 @@ namespace dlib
                 << ", padding_y="<<item.padding_y_
                 << ", padding_x="<<item.padding_x_
                 << ")";
+            out << " learning_rate_mult="<<item.learning_rate_multiplier;
+            out << " weight_decay_mult="<<item.weight_decay_multiplier;
+            out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+            out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
             return out;
         }
@@ -217,6 +269,10 @@ namespace dlib
         alias_tensor filters, biases;
         tt::tensor_conv conv;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
+        double bias_learning_rate_multiplier;
+        double bias_weight_decay_multiplier;

         // These are here only because older versions of con (which you might encounter
         // serialized to disk) used different padding settings.
@@ -600,15 +656,24 @@ namespace dlib
     class bn_
     {
     public:
-        bn_() : num_updates(0), running_stats_window_size(1000)
-        {}
-
         explicit bn_(
             unsigned long window_size
-        ) : num_updates(0), running_stats_window_size(window_size)
+        ) :
+            num_updates(0),
+            running_stats_window_size(window_size),
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(0)
         {}

+        bn_() : bn_(1000) {}
+
         layer_mode get_mode() const { return mode; }
         unsigned long get_running_stats_window_size () const { return running_stats_window_size; }

+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier = val; }
+
         template <typename SUBNET>
         void setup (const SUBNET& sub)
         {
@@ -679,9 +744,9 @@ namespace dlib
         friend void serialize(const bn_& item, std::ostream& out)
         {
             if (mode == CONV_MODE)
-                serialize("bn_con", out);
+                serialize("bn_con2", out);
             else // if FC_MODE
-                serialize("bn_fc", out);
+                serialize("bn_fc2", out);
             serialize(item.params, out);
             serialize(item.gamma, out);
             serialize(item.beta, out);
@@ -691,6 +756,8 @@ namespace dlib
             serialize(item.running_variances, out);
             serialize(item.num_updates, out);
             serialize(item.running_stats_window_size, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
         }

         friend void deserialize(bn_& item, std::istream& in)
@@ -701,12 +768,12 @@ namespace dlib
         {
             if (mode == CONV_MODE)
             {
-                if (version != "bn_con")
+                if (version != "bn_con" && version != "bn_con2")
                     throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
             }
             else // must be in FC_MODE
             {
-                if (version != "bn_fc")
+                if (version != "bn_fc" && version != "bn_fc2")
                     throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
             }
         }
@@ -733,14 +800,28 @@ namespace dlib
                 // format saved the inverse standard deviations instead of variances.
                 item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
             }
+
+            if (version == "bn_con2" || version == "bn_fc2")
+            {
+                deserialize(item.learning_rate_multiplier, in);
+                deserialize(item.weight_decay_multiplier, in);
+            }
+            else
+            {
+                // Previous versions didn't have these parameters, so they were
+                // implicitly 1.
+                item.learning_rate_multiplier = 1;
+                item.weight_decay_multiplier = 1;
+            }
         }

         friend std::ostream& operator<<(std::ostream& out, const bn_& item)
         {
             if (mode==CONV_MODE)
-                out << "bn_con";
+                out << "bn_con ";
             else
-                out << "bn_fc";
+                out << "bn_fc ";
+            out << " learning_rate_mult="<<item.learning_rate_multiplier;
+            out << " weight_decay_mult="<<item.weight_decay_multiplier;
             return out;
         }
@@ -754,6 +835,8 @@ namespace dlib
         resizable_tensor invstds, running_variances;
         unsigned long num_updates;
         unsigned long running_stats_window_size;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
     };

     template <typename SUBNET>
@@ -784,11 +867,24 @@ namespace dlib
         static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0");

     public:
-        fc_() : num_outputs(num_outputs_), num_inputs(0)
-        {
-        }
-
-        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0)
+        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0),
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(1),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(0)
         {}

+        fc_() : fc_(num_fc_outputs(num_outputs_)) {}
+
+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier = val; }
+
+        double get_bias_learning_rate_multiplier () const  { return bias_learning_rate_multiplier; }
+        double get_bias_weight_decay_multiplier () const   { return bias_weight_decay_multiplier; }
+        void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+        void set_bias_weight_decay_multiplier(double val)  { bias_weight_decay_multiplier = val; }
+
         unsigned long get_num_outputs (
         ) const { return num_outputs; }
@@ -834,6 +930,9 @@ namespace dlib
         template <typename SUBNET>
         void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
         {
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
+            {
                 // compute the gradient of the weight parameters.
                 auto pw = weights(params_grad, 0);
@@ -845,6 +944,7 @@ namespace dlib
                 auto pb = biases(params_grad, weights.size());
                 tt::assign_bias_gradient(pb, gradient_input);
             }
+            }

             // compute the gradient for the data
             auto w = weights(params, 0);
@@ -856,20 +956,24 @@ namespace dlib
         friend void serialize(const fc_& item, std::ostream& out)
         {
-            serialize("fc_", out);
+            serialize("fc_2", out);
             serialize(item.num_outputs, out);
             serialize(item.num_inputs, out);
             serialize(item.params, out);
             serialize(item.weights, out);
             serialize(item.biases, out);
             serialize((int)bias_mode, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
+            serialize(item.bias_learning_rate_multiplier, out);
+            serialize(item.bias_weight_decay_multiplier, out);
         }

         friend void deserialize(fc_& item, std::istream& in)
         {
             std::string version;
             deserialize(version, in);
-            if (version != "fc_")
+            if (version != "fc_" && version != "fc_2")
                 throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_.");
             deserialize(item.num_outputs, in);
@@ -880,6 +984,22 @@ namespace dlib
             int bmode = 0;
             deserialize(bmode, in);
             if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_");
+
+            if (version == "fc_2")
+            {
+                deserialize(item.learning_rate_multiplier, in);
+                deserialize(item.weight_decay_multiplier, in);
+                deserialize(item.bias_learning_rate_multiplier, in);
+                deserialize(item.bias_weight_decay_multiplier, in);
+            }
+            else
+            {
+                // Previous versions didn't have these parameters, so they were
+                // implicitly 1.
+                item.learning_rate_multiplier = 1;
+                item.weight_decay_multiplier = 1;
+                item.bias_learning_rate_multiplier = 1;
+                item.bias_weight_decay_multiplier = 1;
+            }
         }

         friend std::ostream& operator<<(std::ostream& out, const fc_& item)
@@ -889,12 +1009,18 @@ namespace dlib
                 out << "fc\t ("
                     << "num_outputs="<<item.num_outputs
                     << ")";
+                out << " learning_rate_mult="<<item.learning_rate_multiplier;
+                out << " weight_decay_mult="<<item.weight_decay_multiplier;
+                out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+                out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
             }
             else
             {
                 out << "fc_no_bias ("
                     << "num_outputs="<<item.num_outputs
                     << ")";
+                out << " learning_rate_mult="<<item.learning_rate_multiplier;
+                out << " weight_decay_mult="<<item.weight_decay_multiplier;
             }
             return out;
         }
@@ -905,6 +1031,10 @@ namespace dlib
         unsigned long num_inputs;
         resizable_tensor params;
         alias_tensor weights, biases;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
+        double bias_learning_rate_multiplier;
+        double bias_weight_decay_multiplier;
     };

     template <
@@ -1223,7 +1353,7 @@ namespace dlib
         {
             std::string version;
             deserialize(version, in);
-            if (version == "bn_con")
+            if (version == "bn_con" || version == "bn_con2")
             {
                 // Since we can build an affine_ from a bn_ we check if that's what is in
                 // the stream and if so then just convert it right here.
@@ -1233,7 +1363,7 @@ namespace dlib
                 item = temp;
                 return;
             }
-            else if (version == "bn_fc")
+            else if (version == "bn_fc" || version == "bn_fc2")
             {
                 // Since we can build an affine_ from a bn_ we check if that's what is in
                 // the stream and if so then just convert it right here.

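One practical consequence of the layers.h changes above: because backward() now skips the parameter-gradient work when learning_rate_multiplier is 0, setting that multiplier to 0 is a cheap way to freeze a layer inside a network. The sketch below assumes the usual dlib::layer<index>(net).layer_details() accessors from the dnn core (they are not part of this diff) and an arbitrary toy architecture:

    #include <dlib/dnn.h>

    using namespace dlib;

    // Toy network purely for illustration.
    using net_type = loss_multiclass_log<fc<10, relu<fc<84, input<matrix<float>>>>>>;

    int main()
    {
        net_type net;
        // layer<0> is the top fc_<10>, layer<2> is the fc_<84> (indices count from the output).
        layer<0>(net).layer_details().set_learning_rate_multiplier(0.1);  // fine-tune slowly
        layer<2>(net).layer_details().set_learning_rate_multiplier(0);    // freeze this layer
        // The bias learning rate is scaled by this multiplier too, so 0 freezes biases as well.
    }
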
dlib/dnn/layers_abstract.h

@@ -123,6 +123,16 @@ namespace dlib
             allow dlib to make some layers execute in-place and therefore run a
             little faster and use less memory. Do not implement forward() and
             backward().
+
+            It should also be noted that layers may define additional layer specific
+            fields and the solvers can use these fields as they see fit.  For example,
+            some layers define get_learning_rate_multiplier() and
+            get_weight_decay_multiplier() methods.  The solvers that come with dlib
+            look at these methods, if they exist, and adjust the learning rate or
+            weight decay for that layer according to the multiplier.  Therefore, you
+            can add these methods to your layer types if you want, or even define new
+            fields and new solvers that use those fields in some way.
         !*/

     public:
@@ -367,6 +377,10 @@ namespace dlib
             ensures
                 - #get_num_outputs() == num_outputs
                 - #get_bias_mode() == bias_mode
+                - #get_learning_rate_multiplier() == 1
+                - #get_weight_decay_multiplier() == 1
+                - #get_bias_learning_rate_multiplier() == 1
+                - #get_bias_weight_decay_multiplier() == 0
         !*/

         unsigned long get_num_outputs (
@@ -389,6 +403,82 @@ namespace dlib
               is added to each of the outputs of this layer.
         !*/

+        double get_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+
+        double get_weight_decay_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+
+        void set_weight_decay_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+
+        double get_bias_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its bias parameters be
+                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+        !*/
+
+        double get_bias_weight_decay_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its bias parameters be
+                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+        !*/
+
+        void set_bias_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_learning_rate_multiplier() == val
+        !*/
+
+        void set_bias_weight_decay_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_weight_decay_multiplier() == val
+        !*/
+
         template <typename SUBNET> void setup (const SUBNET& sub);
         template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
         template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
@@ -458,6 +548,10 @@ namespace dlib
                 - #stride_x() == _stride_x
                 - #padding_y() == _padding_y
                 - #padding_x() == _padding_x
+                - #get_learning_rate_multiplier() == 1
+                - #get_weight_decay_multiplier() == 1
+                - #get_bias_learning_rate_multiplier() == 1
+                - #get_bias_weight_decay_multiplier() == 0
         !*/

         long num_filters(
@@ -517,6 +611,82 @@ namespace dlib
               sides of the image.
         !*/

+        double get_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+
+        double get_weight_decay_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+
+        void set_weight_decay_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+
+        double get_bias_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its bias parameters be
+                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+        !*/
+
+        double get_bias_weight_decay_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its bias parameters be
+                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+        !*/
+
+        void set_bias_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_learning_rate_multiplier() == val
+        !*/
+
+        void set_bias_weight_decay_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_weight_decay_multiplier() == val
+        !*/
+
         template <typename SUBNET> void setup (const SUBNET& sub);
         template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
         template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
@@ -684,7 +854,9 @@ namespace dlib
         /*!
             ensures
                 - #get_mode() == mode
-                - get_running_stats_window_size() == 1000
+                - #get_running_stats_window_size() == 1000
+                - #get_learning_rate_multiplier() == 1
+                - #get_weight_decay_multiplier() == 0
         !*/

         explicit bn_(
@@ -693,7 +865,9 @@ namespace dlib
         /*!
             ensures
                 - #get_mode() == mode
-                - get_running_stats_window_size() == window_size
+                - #get_running_stats_window_size() == window_size
+                - #get_learning_rate_multiplier() == 1
+                - #get_weight_decay_multiplier() == 0
         !*/

         layer_mode get_mode(
@@ -725,6 +899,44 @@ namespace dlib
               the running average.
         !*/

+        double get_learning_rate_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+
+        double get_weight_decay_multiplier(
+        ) const;
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+
+        void set_weight_decay_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+
         template <typename SUBNET> void setup (const SUBNET& sub);
         template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
         template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);

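The paragraph added to the EXAMPLE_COMPUTATIONAL_LAYER_ spec above says the stock solvers consult get_learning_rate_multiplier() and get_weight_decay_multiplier() only when a layer defines them, falling back to 1 otherwise. For readers wondering how a solver can do that generically, here is one standard C++11 way to express "use the method if it exists"; it is a sketch of the idea, not a copy of dlib's own helper functions:

    #include <utility>

    // Returns l.get_learning_rate_multiplier() if the layer defines it, otherwise 1.
    // Classic SFINAE + tag dispatch: the int overload is preferred when the expression
    // in its return type compiles; otherwise the long overload is selected.
    template <typename T>
    auto learning_rate_multiplier_or_1(const T& l, int) -> decltype(l.get_learning_rate_multiplier())
    {
        return l.get_learning_rate_multiplier();
    }

    template <typename T>
    double learning_rate_multiplier_or_1(const T&, long)
    {
        return 1;   // layer has no such method; behave as if the multiplier were 1
    }

    template <typename T>
    double learning_rate_multiplier_or_1(const T& l)
    {
        return learning_rate_multiplier_or_1(l, 0);
    }
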
dlib/dnn/solvers.h

@@ -6,6 +6,7 @@
 #include "solvers_abstract.h"
 #include "tensor.h"
 #include <iostream>
+#include "layers.h"

 namespace dlib
 {
@@ -49,10 +50,42 @@ namespace dlib
                 v = 0;
             }

-            //perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
-            tt::affine_transform(v, v, params, params_grad,
-                momentum, -weight_decay*learning_rate, -learning_rate, 0);
+            const double lr = learning_rate*get_learning_rate_multiplier(l);
+            const double wd = weight_decay*get_weight_decay_multiplier(l);
+
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+            tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);

             return v;
         }

+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return v;
+        }
+
+        template < long _num_filters, long _nr, long _nc, int _stride_y, int _stride_x, int _padding_y, int _padding_x >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return v;
+        }
+
@@ -76,9 +109,49 @@ namespace dlib
         }

     private:

+        template <typename layer_type>
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                v.copy_size(params_grad);
+                v = 0;
+            }
+
+            double lr = learning_rate*get_learning_rate_multiplier(l);
+            double wd = weight_decay*get_weight_decay_multiplier(l);
+
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+            else
+            {
+                tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
+
+                // now update the biases but apply their multipliers
+                lr *= l.get_bias_learning_rate_multiplier();
+                wd *= l.get_bias_weight_decay_multiplier();
+                tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+        }
+
         resizable_tensor v;
         float weight_decay;
         float momentum;
     };

 // ----------------------------------------------------------------------------------------
@@ -132,11 +205,46 @@ namespace dlib
             ++t;

-            tt::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
-                momentum2, params, params_grad);
+            tt::compute_adam_update(0, params.size(), s, m, v, t,
+                learning_rate*get_learning_rate_multiplier(l),
+                weight_decay*get_weight_decay_multiplier(l),
+                momentum1, momentum2, params, params_grad);

             return s;
         }

+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return s;
+        }
+
+        template < long _num_filters, long _nr, long _nc, int _stride_y, int _stride_x, int _padding_y, int _padding_x >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return s;
+        }
+
         friend void serialize(const adam& item, std::ostream& out)
         {
             serialize("adam2", out);
@@ -165,6 +273,49 @@ namespace dlib
         }

     private:

+        template <typename layer_type>
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                m.copy_size(params_grad);
+                m = 0;
+                v.copy_size(params_grad);
+                v = 0;
+                s.copy_size(params_grad);
+            }
+
+            ++t;
+
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::compute_adam_update(0, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l),
+                    momentum1, momentum2, params, params_grad);
+            }
+            else
+            {
+                tt::compute_adam_update(0, bias_offset, s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l),
+                    momentum1, momentum2, params, params_grad);
+
+                tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
+                    weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(),
+                    momentum1, momentum2, params, params_grad);
+            }
+        }
+
         resizable_tensor m;
         resizable_tensor v;
         resizable_tensor s;

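Spelled out per element, the affine_transform calls in the sgd solver above compute the classic momentum update, with the per-layer multipliers already folded into lr and wd (and, for the bias range, further scaled by the bias multipliers). A scalar restatement for clarity, mirroring the code rather than adding anything new:

    // v = momentum*v - wd*lr*param - lr*grad, where
    //   lr = learning_rate*get_learning_rate_multiplier(l)
    //   wd = weight_decay*get_weight_decay_multiplier(l)
    // and for elements in the bias range lr and wd are additionally multiplied by the
    // layer's bias multipliers before this formula is applied.
    float sgd_element(float& v, float param, float grad, float lr, float wd, float momentum)
    {
        v = momentum*v - wd*lr*param - lr*grad;
        return v;   // the step the training code adds to param
    }
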
dlib/dnn/solvers_abstract.h

@@ -78,6 +78,15 @@ namespace dlib
                 V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
               Here V is a momentum term that is remembered by the solver from one
               invocation of operator() to the next.
+
+              Note that the actual learning rate and weight decay used by the solver are
+              multiplied by the per layer multipliers.  That is, the solver will call
+              get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+              multiply these values with the nominal learning rate and weight decay,
+              respectively, to determine the values it will use during each step.  It is
+              also overloaded to allow additional learning rate multipliers to be applied
+              to fc_ and con_ bias parameters.
         !*/

     public:
@@ -123,6 +132,15 @@ namespace dlib
               paper:
                 Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
                 optimization." International Conference on Learning Representation. 2015.
+
+              Note that the actual learning rate and weight decay used by the solver are
+              multiplied by the per layer multipliers.  That is, the solver will call
+              get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+              multiply these values with the nominal learning rate and weight decay,
+              respectively, to determine the values it will use during each step.  It is
+              also overloaded to allow additional learning rate multipliers to be applied
+              to fc_ and con_ bias parameters.
         !*/

     public:

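A short worked example of how the multipliers described above combine. Assume a nominal learning rate of 0.01 and weight decay of 0.0005, and a layer whose multipliers are 0.1 (learning rate), 1 (weight decay), 2 (bias learning rate), and 0 (bias weight decay); the numbers are illustrative only:

    const double learning_rate = 0.01, weight_decay = 0.0005;            // trainer settings
    const double lr_mult = 0.1, wd_mult = 1, bias_lr_mult = 2, bias_wd_mult = 0;

    const double weight_lr = learning_rate*lr_mult;    // 0.001
    const double weight_wd = weight_decay*wd_mult;     // 0.0005
    const double bias_lr   = weight_lr*bias_lr_mult;   // 0.002
    const double bias_wd   = weight_wd*bias_wd_mult;   // 0: the biases see no weight decay
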
dlib/dnn/tensor_tools.cpp

@@ -311,6 +311,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,
@@ -324,10 +326,10 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
             momentum2, params, params_grad);
 #else
-        cpu::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
             momentum2, params, params_grad);
 #endif
     }

dlib/dnn/tensor_tools.h

@@ -335,6 +335,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

     void compute_adam_update (
+        size_t begin,
+        size_t end,
         tensor& s,
         tensor& m,
         tensor& v,
@@ -354,12 +356,16 @@ namespace dlib { namespace tt
             - weight_decay >= 0
             - 0 <= momentum1 < 1
             - 0 <= momentum2 < 1
+            - begin <= end <= params.size()
         ensures
             - This function implements the ADAM parameter update method described in the paper:
                 Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
                 optimization." International Conference on Learning Representation. 2015.
              Specifically, it implements the method shown as Algorithm 1.
             - #s is the update vector that should be added to the parameters.
+            - The function only operates in the half open range [begin,end) of the memory
+              blocks of each tensor.  E.g. to make this function run on the entire tensor
+              set begin to 0 and end to params.size().
     !*/

 // ----------------------------------------------------------------------------------------

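The half-open [begin,end) contract documented above is what lets the adam solver in this commit apply different effective rates to a layer's weights and biases by calling the function twice over disjoint ranges. A sketch of that pattern, using the signature declared above; the multiplier arguments and function name are illustrative:

    #include <dlib/dnn.h>

    // Run Adam with separate rates on the weight block [0, bias_offset) and the bias
    // block [bias_offset, params.size()), mirroring the solver code in this commit.
    void adam_update_with_bias_rates(
        dlib::resizable_tensor& s, dlib::resizable_tensor& m, dlib::resizable_tensor& v,
        float t, float learning_rate, float weight_decay, float momentum1, float momentum2,
        const dlib::tensor& params, const dlib::tensor& params_grad,
        unsigned long bias_offset, double bias_lr_mult, double bias_wd_mult)
    {
        namespace tt = dlib::tt;
        // weights: nominal rates over the first part of the parameter tensor
        tt::compute_adam_update(0, bias_offset, s, m, v, t,
            learning_rate, weight_decay, momentum1, momentum2, params, params_grad);
        // biases: same state tensors, scaled rates, the remaining half-open range
        tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
            learning_rate*bias_lr_mult, weight_decay*bias_wd_mult,
            momentum1, momentum2, params, params_grad);
    }
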