dlib, commit 93bbe5ff

Commit 93bbe5ff, authored May 15, 2016 by Davis King.
Merge commit; parents 9763c471 and 66166c67.

Showing 8 changed files with 214 additions and 184 deletions (+214 / -184).
    dlib/dnn/core.h                      +19  -18
    dlib/dnn/core_abstract.h             +10   -8
    dlib/dnn/solvers.h                   +27  -27
    dlib/dnn/solvers_abstract.h          +39  -26
    dlib/dnn/trainer.h                    +0   -0
    dlib/dnn/trainer_abstract.h          +45  -42
    examples/dnn_mnist_advanced_ex.cpp   +66  -57
    examples/dnn_mnist_ex.cpp             +8   -6
dlib/dnn/core.h

@@ -825,16 +825,16 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
            {
-                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
-                tt::add(1, details.get_layer_params(), step_size, step);
+                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
+                tt::add(details.get_layer_params(), details.get_layer_params(), step);
            }
-            subnetwork->update_parameters(solvers.pop(), step_size);
+            subnetwork->update_parameters(solvers.pop(), learning_rate);
        }

        const tensor& get_parameter_gradient (

@@ -1175,13 +1175,14 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
            {
-                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
-                tt::add(1, details.get_layer_params(), step_size, step);
+                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
+                tt::add(details.get_layer_params(), details.get_layer_params(), step);
            }
        }

@@ -1401,9 +1402,9 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
-            subnetwork.update_parameters(solvers, step_size);
+            subnetwork.update_parameters(solvers, learning_rate);
        }

        const tensor& get_parameter_gradient (

@@ -1687,11 +1688,11 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            for (size_t i = 0; i < details.size(); ++i)
-                details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i), step_size);
-            subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()), step_size);
+                details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i), learning_rate);
+            subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()), learning_rate);
        }

        const subnet_type& subnet() const { return subnetwork; }

@@ -1905,7 +1906,7 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
+        void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/)
        {
            // nothing to do
        }

@@ -2248,10 +2249,10 @@ namespace dlib
        template <typename solver_type>
        void update_parameters(
            sstack<solver_type> solvers,
-            double step_size
+            double learning_rate
        )
        {
-            subnetwork.update_parameters(solvers, step_size);
+            subnetwork.update_parameters(solvers, learning_rate);
        }

        const subnet_type& subnet() const { return subnetwork; }

@@ -2542,9 +2543,9 @@ namespace dlib
        }

        template <typename solver_type>
-        void update_parameters(sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
-            subnetwork.update_parameters(solvers, step_size);
+            subnetwork.update_parameters(solvers, learning_rate);
        }

        const tensor& get_parameter_gradient (
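The substance of these core.h hunks: the caller-supplied step_size becomes a learning_rate, and the scaling moves into the solver. Before, the solver's output was scaled by the network (tt::add(1, params, step_size, step), i.e. params += step_size*step); now the solver receives the learning rate together with the layer object, and the returned step is added unscaled (params += step). Below is a rough sketch of driving the new signature by hand; the tiny network type, the solver settings, and the use of make_sstack() to wrap the solver vector are illustrative assumptions about surrounding dlib code, not part of this commit.

#include <dlib/dnn.h>
#include <vector>
using namespace dlib;

// Hypothetical caller of the new update_parameters() interface.  In practice
// dnn_trainer does this for you; the point here is only the changed argument.
void sketch_manual_update()
{
    using net_type = loss_multiclass_log<fc<10, input<matrix<unsigned char>>>>;
    net_type net;

    // One solver per computational layer, as update_parameters() requires.
    std::vector<sgd> solvers(net_type::num_computational_layers, sgd(0.0005, 0.9));

    // ... forward/backward passes would populate the parameter gradients here ...

    const double learning_rate = 0.01;   // this argument was called "step_size" before this commit
    net.update_parameters(make_sstack(solvers), learning_rate);
}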
dlib/dnn/core_abstract.h

@@ -506,7 +506,7 @@ namespace dlib
        template <typename solver_type>
        void update_parameters(
            sstack<solver_type> solvers,
-            double step_size
+            double learning_rate
        );
        /*!
            requires

@@ -517,13 +517,14 @@ namespace dlib
                  if you want to call update_parameters() on some other neural network
                  object then you must NOT reuse the same solvers object.
                - solvers.size() >= num_computational_layers
-               - 0 < step_size <= 1
+               - 0 < learning_rate <= 1
            ensures
                - Updates all the parameters in the network.  In particular, we pass each
                  layer's parameter gradient (i.e. the tensor returned by the layer's
                  get_parameter_gradient() member) through that layer's corresponding
-                 solver object.  This produces a parameter delta vector and we add
-                 step_size times that vector to the layer's parameters.
+                 solver object.  This produces a parameter delta vector which we add to
+                 the layer's parameters.
+               - The solvers use the given learning rate.
        !*/

        void clean (

@@ -944,7 +945,7 @@ namespace dlib
        template <typename solver_type>
        void update_parameters(
            sstack<solver_type> solvers,
-            double step_size
+            double learning_rate
        );
        /*!
            requires

@@ -955,13 +956,14 @@ namespace dlib
                  is, if you want to call update_parameters() on some other neural network
                  object then you must NOT reuse the same solvers object.
                - solvers.size() >= num_computational_layers
-               - 0 < step_size <= 1
+               - 0 < learning_rate <= 1
            ensures
                - Updates all the parameters in the network.  In particular, we pass each
                  layer's parameter gradient (i.e. the tensor returned by the layer's
                  get_parameter_gradient() member) through that layer's corresponding
-                 solver object.  This produces a parameter delta vector and we add
-                 step_size times that vector to the layer's parameters.
+                 solver object.  This produces a parameter delta vector which we add to
+                 the layer's parameters.
+               - The solvers use the given learning rate.
        !*/

        // -------------
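The contract above also stresses that a solvers object accumulates per-layer state, so each network needs its own set of solvers. A minimal illustration of that rule follows; the concrete net_type and the use of make_sstack() are illustrative assumptions, not part of this diff.

#include <dlib/dnn.h>
#include <vector>
using namespace dlib;

// "net_type" is just a placeholder for any dlib DNN type.
using net_type = loss_multiclass_log<fc<10, input<matrix<unsigned char>>>>;

void sketch_separate_solvers()
{
    net_type net_a, net_b;

    // Each network gets its own solver vector; handing solvers_a to net_b as well
    // would violate the "must NOT reuse the same solvers object" requirement above.
    std::vector<adam> solvers_a(net_type::num_computational_layers, adam(0.0005, 0.9, 0.999));
    std::vector<adam> solvers_b(net_type::num_computational_layers, adam(0.0005, 0.9, 0.999));

    net_a.update_parameters(make_sstack(solvers_a), 0.001);
    net_b.update_parameters(make_sstack(solvers_b), 0.001);
}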
dlib/dnn/solvers.h

@@ -14,30 +14,34 @@ namespace dlib
    public:

        sgd(
-            float learning_rate_ = 0.01,
-            float weight_decay_ = 0.0005,
-            float momentum_ = 0.9
+            float weight_decay_,
+            float momentum_
        )
        {
            weight_decay = weight_decay_;
-            learning_rate = learning_rate_;
            momentum = momentum_;
        }

+        sgd(
+        ) : sgd(0.0005, 0.9)
+        {
+        }
+
        float get_momentum (
        ) const { return momentum; }

        float get_weight_decay (
        ) const { return weight_decay; }

-        float get_learning_rate (
-        ) const { return learning_rate; }

+        template <typename layer_type>
        const tensor& operator() (
-            const tensor& params,
+            const float learning_rate,
+            const layer_type& l,
            const tensor& params_grad
        )
        {
+            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0,"");
            if (v.size() == 0)
            {

@@ -54,10 +58,9 @@ namespace dlib
        friend void serialize(const sgd& item, std::ostream& out)
        {
-            serialize("sgd", out);
+            serialize("sgd2", out);
            serialize(item.v, out);
            serialize(item.weight_decay, out);
-            serialize(item.learning_rate, out);
            serialize(item.momentum, out);
        }

@@ -65,18 +68,16 @@ namespace dlib
        {
            std::string version;
            deserialize(version, in);
-            if (version != "sgd")
+            if (version != "sgd2")
                throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
            deserialize(item.v, in);
            deserialize(item.weight_decay, in);
-            deserialize(item.learning_rate, in);
            deserialize(item.momentum, in);
        }

    private:

        resizable_tensor v;
        float weight_decay;
-        float learning_rate;
        float momentum;
    };

@@ -87,19 +88,21 @@ namespace dlib
    public:

        adam(
-            float learning_rate_ = 0.001,
-            float weight_decay_ = 0.0005,
-            float momentum1_ = 0.9,
-            float momentum2_ = 0.999
+            float weight_decay_,
+            float momentum1_,
+            float momentum2_
        )
        {
            weight_decay = weight_decay_;
-            learning_rate = learning_rate_;
            momentum1 = momentum1_;
            momentum2 = momentum2_;
            t = 0;
        }

+        adam(
+        ) : adam(0.0005, 0.9, 0.999)
+        {}
+
        float get_momentum1 (
        ) const { return momentum1; }

@@ -109,14 +112,14 @@ namespace dlib
        float get_weight_decay (
        ) const { return weight_decay; }

-        float get_learning_rate (
-        ) const { return learning_rate; }

+        template <typename layer_type>
        const tensor& operator() (
-            const tensor& params,
+            const float learning_rate,
+            const layer_type& l,
            const tensor& params_grad
        )
        {
+            const tensor& params = l.get_layer_params();
            DLIB_CASSERT(params.size() != 0,"");
            if (v.size() == 0)
            {

@@ -136,12 +139,11 @@ namespace dlib
        friend void serialize(const adam& item, std::ostream& out)
        {
-            serialize("adam", out);
+            serialize("adam2", out);
            serialize(item.m, out);
            serialize(item.v, out);
            serialize(item.s, out);
            serialize(item.weight_decay, out);
-            serialize(item.learning_rate, out);
            serialize(item.momentum1, out);
            serialize(item.momentum2, out);
            serialize(item.t, out);

@@ -151,13 +153,12 @@ namespace dlib
        {
            std::string version;
            deserialize(version, in);
-            if (version != "adam")
+            if (version != "adam2")
                throw serialization_error("Unexpected version found while deserializing dlib::adam.");
            deserialize(item.m, in);
            deserialize(item.v, in);
            deserialize(item.s, in);
            deserialize(item.weight_decay, in);
-            deserialize(item.learning_rate, in);
            deserialize(item.momentum1, in);
            deserialize(item.momentum2, in);
            deserialize(item.t, in);

@@ -168,7 +169,6 @@ namespace dlib
        resizable_tensor v;
        resizable_tensor s;
        float weight_decay;
-        float learning_rate;
        float momentum1;
        float momentum2;
        float t;
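With this change the built-in solvers no longer own a learning rate; they receive it per call, along with the layer whose parameters they are optimizing (so the solver can read l.get_layer_params() itself). The sketch below exercises the new operator() outside of a network; toy_layer is a stand-in invented for illustration, and the internals of sgd beyond the signature shown in the diff are assumed to only need the layer's parameter tensor.

#include <dlib/dnn.h>
using namespace dlib;

// Minimal stand-in exposing the one member the new solver interface reads.
struct toy_layer
{
    resizable_tensor params;
    const tensor& get_layer_params() const { return params; }
};

void sketch_solver_call()
{
    toy_layer l;
    l.params.set_size(1, 10);
    l.params = 0;

    resizable_tensor grad;
    grad.copy_size(l.params);
    grad = 1;

    sgd solver(0.0005, 0.9);            // the constructor now takes weight decay and momentum only
    const float learning_rate = 0.01;   // the learning rate is supplied per call instead
    const tensor& step = solver(learning_rate, l, grad);
    (void)step;  // the network (core.h above) adds this step to l.get_layer_params() unscaled
}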
dlib/dnn/solvers_abstract.h

@@ -33,22 +33,28 @@ namespace dlib
        EXAMPLE_SOLVER(
        );

+        template <typename layer_type>
        const tensor& operator() (
-            const tensor& params,
+            const float learning_rate,
+            const layer_type& l,
            const tensor& params_grad
        )
        /*!
            requires
-                - params.size() != 0
-                - have_same_dimensions(params, params_grad) == true.
+                - l.get_layer_params().size() != 0
+                - have_same_dimensions(l.get_layer_params(), params_grad) == true.
                - When this function is invoked on a particular solver instance, it is
-                  always supplied with parameters from the same layer instance.  That is,
-                  the solver is allowed to remember things from one invocation to another
-                  and to assume that it is being serially applied to optimize the same
-                  parameters.
+                  always supplied with the same layer instance, l.  That is, the solver is
+                  allowed to remember things from one invocation to another and to assume
+                  that it is being serially applied to optimize the same layer's
+                  parameters.
            ensures
                - Returns a step vector V that is intended to be used to update the
-                  parameters by adding V to params.
+                  parameters by adding V to l.get_layer_params().
+                - This function will use the given "learning rate" to compute V.  How the
+                  learning rate is used is solver dependent.  But in general the learning
+                  rate should be used to select the step size, i.e. to somehow determine
+                  the magnitude of V.
        !*/
    };

@@ -68,32 +74,34 @@ namespace dlib
            WHAT THIS OBJECT REPRESENTS
                This object implements the EXAMPLE_SOLVER interface defined above.  It is a
                basic stochastic gradient descent solver which uses momentum and weight
-                decay.  In particular, it performs the following update each time the
-                solver is invoked:
-                    v = momentum*v - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
-                    l.get_layer_params() += v;
-                Here v is a momentum term that is remembered by the solver from one
-                invocation of operator() to the next.
+                decay.  In particular, it computes the update vector V according to:
+                    V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
+                Here V is a momentum term that is remembered by the solver from one
+                invocation of operator() to the next.
        !*/

    public:

        sgd(
-            float learning_rate = 0.01,
-            float weight_decay = 0.0005,
-            float momentum = 0.9
        );
        /*!
+            ensures
+                - #get_weight_decay()  == 0.0005
+                - #get_momentum()      == 0.9
+        !*/
+
+        sgd(
+            float weight_decay,
+            float momentum
+        );
+        /*!
            requires
-                - learning_rate > 0
                - weight_decay >= 0
                - momentum >= 0
            ensures
-                - #get_learning_rate() == learning_rate
                - #get_weight_decay()  == weight_decay
                - #get_momentum()      == momentum
        !*/

-        float get_learning_rate () const;
        float get_weight_decay () const;
        float get_momentum () const;
    };

@@ -120,25 +128,30 @@ namespace dlib
    public:

        adam(
-            float learning_rate = 0.001,
-            float weight_decay = 0.0005,
-            float momentum1 = 0.9,
-            float momentum2 = 0.999
        );
        /*!
+            ensures
+                - #get_weight_decay()  == 0.0005
+                - #get_momentum1()     == 0.9
+                - #get_momentum2()     == 0.999
+        !*/
+
+        adam(
+            float weight_decay,
+            float momentum1,
+            float momentum2
+        );
+        /*!
            requires
-                - learning_rate > 0
                - weight_decay >= 0
                - 0 <= momentum1 < 1
                - 0 <= momentum2 < 1
            ensures
-                - #get_learning_rate() == learning_rate
                - #get_weight_decay()  == weight_decay
                - #get_momentum1()     == momentum1
                - #get_momentum2()     == momentum2
        !*/

-        float get_learning_rate () const;
        float get_weight_decay () const;
        float get_momentum1 () const;
        float get_momentum2 () const;
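As a worked example of the interface documented above, here is a rough sketch of a custom solver written against the new calling convention: it takes the learning rate and the layer per call, keeps no momentum state, and returns V = -learning_rate*params_grad. It follows the contract shown in this diff rather than being copied from dlib, so treat the details (in particular the serialize/deserialize stubs and the tensor arithmetic) as assumptions.

#include <dlib/dnn.h>
#include <string>
using namespace dlib;

// A no-frills solver satisfying the EXAMPLE_SOLVER contract above:
// plain gradient descent with neither momentum nor weight decay.
class plain_gd
{
public:
    plain_gd() = default;

    template <typename layer_type>
    const tensor& operator() (
        const float learning_rate,
        const layer_type& l,
        const tensor& params_grad
    )
    {
        DLIB_CASSERT(l.get_layer_params().size() != 0, "");
        DLIB_CASSERT(have_same_dimensions(l.get_layer_params(), params_grad), "");
        // V = -learning_rate*params_grad; the network adds V to l.get_layer_params().
        step = -learning_rate*mat(params_grad);
        return step;
    }

    friend void serialize(const plain_gd&, std::ostream& out)
    {
        serialize("plain_gd", out);
    }
    friend void deserialize(plain_gd&, std::istream& in)
    {
        std::string version;
        deserialize(version, in);
        if (version != "plain_gd")
            throw serialization_error("Unexpected version found while deserializing plain_gd.");
    }

private:
    resizable_tensor step;
};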
dlib/dnn/trainer.h

(The diff for this file is collapsed in the GitLab view and is not reproduced here.)
dlib/dnn/trainer_abstract.h

@@ -68,10 +68,10 @@ namespace dlib
                  provided solver instance.
                - #get_max_num_epochs() == 10000
                - #get_mini_batch_size() == 128
-               - #get_step_size() == 1
-               - #get_min_step_size() == 1e-3
+               - #get_learning_rate() == 1e-2
+               - #get_min_learning_rate() == 1e-5
                - #get_iterations_without_progress_threshold() == 2000
-               - #get_step_size_shrink() == 0.1
+               - #get_learning_rate_shrink() == 0.1
                - if (cuda_extra_devices.size() > 0) then
                    - This object will use multiple graphics cards to run the learning
                      algorithms.  In particular, it will always use whatever device is

@@ -102,6 +102,8 @@ namespace dlib
                  get_net().  In particular, the first layer's solver is
                  get_solvers()[0], the second layer's solver is
                  get_solvers()[1], and so on.
+               - This function blocks until all threads inside the dnn_trainer have
+                 stopped touching the net.
        !*/

        unsigned long get_mini_batch_size (

@@ -142,54 +144,51 @@ namespace dlib
                - #get_max_num_epochs() == num
        !*/

-        void set_step_size (
-            double ss
+        void set_learning_rate (
+            double lr
        );
        /*!
            requires
-                - ss > 0
+                - lr > 0
            ensures
-                - #get_step_size() == ss
+                - #get_learning_rate() == lr
+                - This function blocks until all threads inside the dnn_trainer have
+                  stopped touching the net.
        !*/

-        double get_step_size(
+        double get_learning_rate(
        ) const;
        /*!
            ensures
                - During each training step, a solver tells us how to modify the parameters
-                  of each layer in the network.  It does this by outputting a step vector,
-                  that when added to the parameters, will hopefully result in improved
-                  network performance.  In our case, at each step, we multiply the step
-                  vector from the solver by get_step_size() before adding it to the
-                  parameters.  Therefore, get_step_size() controls the "learning rate" used
-                  during training.
-                  It should be emphasized that this learning rate applied by dnn_trainer is
-                  independent from any learning rate scheduling a solver might itself apply
-                  to the step vector it outputs.  That is, the dnn_trainer doesn't know
-                  what the solver is doing.  It just takes the output from a solver and
-                  multiplies it by get_step_size() before applying the step vector.
+                  of each layer in the network.  It does this by outputting a step vector
+                  that, when added to the parameters, will hopefully result in improved
+                  network performance.  The learning rate is one of the inputs to the
+                  solver and influences the size of this step vector.
        !*/

-        void set_min_step_size (
-            double ss
+        void set_min_learning_rate (
+            double lr
        );
        /*!
            requires
-                - ss > 0
+                - lr > 0
            ensures
-                - #get_min_step_size() == ss
+                - #get_min_learning_rate() == lr
        !*/

-        double get_min_step_size (
+        double get_min_learning_rate (
        ) const;
        /*!
            ensures
-                - During training, this object will test if progress is still being made
-                  and if it isn't then it will reduce get_step_size() by setting it to
-                  get_step_size()*get_step_size_shrink().  However, it will not reduce it
-                  below get_min_step_size().  Once this minimum step size is crossed the
-                  training will terminate.
+                - During training via this->train(), this object will test if progress is
+                  still being made and if it isn't then it will reduce get_learning_rate()
+                  by setting it to get_learning_rate()*get_learning_rate_shrink().
+                  However, it will not reduce it below get_min_learning_rate().  Once this
+                  minimum learning rate is crossed the training will terminate.
+                - get_min_learning_rate() doesn't apply if you are using train_one_step().
+                  You can keep calling train_one_step() as many times as you want and the
+                  learning rate will drop infinitely close to 0 if you run long enough.
        !*/

        void set_iterations_without_progress_threshold (

@@ -209,33 +208,33 @@ namespace dlib
                  get_iterations_without_progress_threshold() mini-batch results and
                  applying the statistical test defined by the running_gradient object to
                  see if the training error is getting smaller.  If it isn't being reduced
-                 then get_step_size() is made smaller by a factor of get_step_size_shrink().
+                 then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink().
                  Therefore, get_iterations_without_progress_threshold() should always be
                  set to something sensibly large so that this test can be done with
                  reasonably high confidence.  Think of this test as saying "if the loss
                  hasn't decreased for the previous get_iterations_without_progress_threshold()
-                 then shrink the step size".
+                 then shrink the learning rate".
        !*/

-        void set_step_size_shrink_amount (
+        void set_learning_rate_shrink_amount (
            double shrink
        );
        /*!
            requires
                - 0 < shrink && shrink <= 1
            ensures
-                - #get_step_size_shrink() == shrink
+                - #get_learning_rate_shrink() == shrink
        !*/

-        double get_step_size_shrink (
+        double get_learning_rate_shrink (
        ) const;
        /*!
            ensures
                - Whenever the training routine thinks it isn't making progress anymore it
-                  will reduce get_step_size() by multiplying it by get_step_size_shrink().
-                - You can disable the automatic step size reduction by setting
-                  get_step_size_shrink() to 1.
+                  will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink().
+                - You can disable the automatic learning rate reduction by setting
+                  get_learning_rate_shrink() to 1.
        !*/

        void be_verbose (

@@ -283,8 +282,8 @@ namespace dlib
                - Trains a supervised neural network based on the given training data.
                  The goal of training is to find the network parameters that minimize
                  get_net().compute_loss(data.begin(), data.end(), labels.begin()).
-               - The optimizer will run until get_step_size() < get_min_step_size() or
-                 get_max_num_epochs() training epochs have been executed.
+               - The optimizer will run until get_learning_rate() < get_min_learning_rate()
+                 or get_max_num_epochs() training epochs have been executed.
                - Each layer in the network will be optimized by its corresponding solver
                  in get_solvers().
                - Each call to train DOES NOT reinitialize the state of get_net() or

@@ -309,8 +308,8 @@ namespace dlib
                - Trains an unsupervised neural network based on the given training data.
                  The goal of training is to find the network parameters that minimize
                  get_net().compute_loss(data.begin(), data.end()).
-               - The optimizer will run until get_step_size() < get_min_step_size() or
-                 get_max_num_epochs() training epochs have been executed.
+               - The optimizer will run until get_learning_rate() < get_min_learning_rate()
+                 or get_max_num_epochs() training epochs have been executed.
                - Each layer in the network will be optimized by its corresponding solver
                  in get_solvers().
                - Each call to train DOES NOT reinitialize the state of get_net() or

@@ -381,6 +380,8 @@ namespace dlib
                - Note that, if be_verbose() has been called, then this object will
                  automatically call clear_average_loss() periodically when it logs the
                  loss to the console.
+               - This function blocks until all threads inside the dnn_trainer have
+                 stopped touching the net.
        !*/

        void clear_average_loss (

@@ -393,6 +394,8 @@ namespace dlib
                  applied during training.  Calling clear_average_loss() resets the
                  running_stats object so it forgets about all previous loss values
                  observed.
+               - This function blocks until all threads inside the dnn_trainer have
+                 stopped touching the net.
        !*/
    };
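Taken together, the renamed dnn_trainer knobs are used roughly like this. The sketch assumes some net_type supplied by the caller; the concrete values are simply the defaults and settings mentioned in this diff and in the examples below.

#include <dlib/dnn.h>
using namespace dlib;

// "net_type" is a placeholder for whatever network is being trained.
template <typename net_type>
void sketch_trainer_setup(net_type& net)
{
    dnn_trainer<net_type> trainer(net);

    trainer.set_learning_rate(1e-2);                      // was set_step_size()
    trainer.set_min_learning_rate(1e-5);                  // was set_min_step_size()
    trainer.set_learning_rate_shrink_amount(0.1);         // was set_step_size_shrink_amount()
    trainer.set_iterations_without_progress_threshold(2000);

    // trainer.train(...) now stops once get_learning_rate() < get_min_learning_rate()
    // (or the epoch limit is hit), while train_one_step() ignores the minimum and keeps
    // shrinking the learning rate for as long as you call it.
}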
examples/dnn_mnist_advanced_ex.cpp

@@ -33,23 +33,27 @@ using namespace dlib;
// It exists solely so other layers can refer to it.  In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer.  This combination allows us to
-// implement skip and residual style networks.
-template <int stride, typename SUBNET>
-using base_res  = relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
-
-// Let's also define the same block but with all the batch normalization layers
-// replaced with affine transform layers.  We will use this type of construction
-// when testing our networks.
-template <int stride, typename SUBNET>
-using base_ares = relu<add_prev1<affine<con<8,3,3,1,1,relu<affine<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
-
-// And of course we can define more alias templates based on previously defined
-// alias templates.  The _down versions downsample the inputs by a factor of 2
-// while the res and ares layer types don't.
-template <typename SUBNET> using res       = base_res<1,SUBNET>;
-template <typename SUBNET> using res_down  = base_res<2,SUBNET>;
-template <typename SUBNET> using ares      = base_ares<1,SUBNET>;
-template <typename SUBNET> using ares_down = base_ares<2,SUBNET>;
+// implement skip and residual style networks.  We have also made base_res
+// parameterized by BN, which will let us insert different batch normalization
+// layers.
+template <template <typename> class BN, typename SUBNET>
+using base_res  = relu<add_prev1<BN<con<8,3,3,1,1,relu<BN<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
+
+// We also want a residual block that begins by doing downsampling.  We can
+// reuse base_res to define it like this:
+template <template <typename> class BN, typename SUBNET>
+using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
+
+// Now we can define 4 different residual blocks we will use in this example.
+// The first two are non-downsampling residual blocks while the last two
+// downsample.  Also, res and res_down use batch normalization while ares and
+// ares_down have had the batch normalization replaced with simple affine
+// layers.  We will use the affine version of the layers when testing our
+// networks.
+template <typename SUBNET> using res       = base_res<bn_con,SUBNET>;
+template <typename SUBNET> using ares      = base_res<affine,SUBNET>;
+template <typename SUBNET> using res_down  = base_res_down<bn_con,SUBNET>;
+template <typename SUBNET> using ares_down = base_res_down<affine,SUBNET>;
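The point of the new BN template-template parameter is that one residual-block definition can be instantiated with bn_con for training and affine for testing, instead of maintaining two copies of the block. The small sketch below shows the idea with a deliberately tiny pair of network types; mini_net and mini_anet are illustrative names and not part of the example program, and the res/ares aliases above (plus <dlib/dnn.h>) are assumed to be in scope.

// Training-time type: residual blocks built with bn_con.
using mini_net  = loss_multiclass_log<fc<10, res<res_down<res<input<matrix<unsigned char>>>>>>>;

// Testing-time type: the same topology, but base_res/base_res_down instantiated
// with affine (the example's comment above says the affine versions are used
// when testing the trained networks).
using mini_anet = loss_multiclass_log<fc<10, ares<ares_down<ares<input<matrix<unsigned char>>>>>>>;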
@@ -141,37 +145,39 @@ int main(int argc, char** argv) try
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
-        The pnet has 125 layers in it.
-        layer<0>    loss_multiclass_log
-        layer<1>    fc       (num_outputs=10)
-        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
-        layer<3>    prelu    (initial_param_value=0.2)
-        layer<4>    add_prev
-        layer<5>    bn_con
-        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
-        layer<7>    prelu    (initial_param_value=0.25)
-        layer<8>    bn_con
-        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
-        layer<10>   tag1
+        The pnet has 127 layers in it.
+        layer<0>    loss_multiclass_log
+        layer<1>    fc       (num_outputs=10)
+        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
+        layer<3>    prelu    (initial_param_value=0.2)
+        layer<4>    add_prev
+        layer<5>    bn_con
+        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<7>    prelu    (initial_param_value=0.25)
+        layer<8>    bn_con
+        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<10>   tag1
        ...
-        layer<33>   con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
-        layer<34>   tag1
-        layer<35>   tag4
-        layer<36>   prelu    (initial_param_value=0.3)
-        layer<37>   add_prev
-        layer<38>   bn_con
+        layer<33>   con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<34>   tag1
+        layer<35>   avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
+        layer<36>   tag4
+        layer<37>   prelu    (initial_param_value=0.3)
+        layer<38>   add_prev
+        layer<39>   bn_con
        ...
-        layer<114>  con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
-        layer<115>  tag1
-        layer<116>  relu
-        layer<117>  add_prev
-        layer<118>  bn_con
-        layer<119>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
-        layer<120>  relu
-        layer<121>  bn_con
-        layer<122>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
-        layer<123>  tag1
-        layer<124>  input<matrix>
+        layer<115>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<116>  tag1
+        layer<117>  avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
+        layer<118>  relu
+        layer<119>  add_prev
+        layer<120>  bn_con
+        layer<121>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<122>  relu
+        layer<123>  bn_con
+        layer<124>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<125>  tag1
+        layer<126>  input<matrix>
    */

    // Now that we know the index numbers for each layer, we can access them

@@ -189,7 +195,7 @@ int main(int argc, char** argv) try
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
-    layer<tag4,1>(pnet);  // Equivalent to layer<35+1>(pnet).
+    layer<tag4,1>(pnet);  // Equivalent to layer<36+1>(pnet).

    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);

@@ -203,23 +209,26 @@ int main(int argc, char** argv) try
    // talk about training networks!

    // The dnn_trainer will use SGD by default, but you can tell it to use
-    // different solvers like adam.
-    dnn_trainer<net_type,adam> trainer(net,adam(0.001));
+    // different solvers like adam with a weight decay of 0.0005 and the given
+    // momentum parameters.
+    dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
    // Also, if you have multiple graphics cards you can tell the trainer to use
    // them together to make the training faster.  For example, replacing the
    // above constructor call with this one would cause it to use GPU cards 0
    // and 1.
-    //dnn_trainer<net_type,adam> trainer(net,adam(0.001), {0,1});
+    //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});

    trainer.be_verbose();
    trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
    // While the trainer is running it keeps an eye on the training error.  If
    // it looks like the error hasn't decreased for the last 2000 iterations it
-    // will automatically reduce the step size by 0.1.  You can change these
+    // will automatically reduce the learning rate by 0.1.  You can change these
    // default parameters to some other values by calling these functions.  Or
-    // disable them entirely by setting the shrink amount to 1.
+    // disable the automatic shrinking entirely by setting the shrink amount to 1.
    trainer.set_iterations_without_progress_threshold(2000);
-    trainer.set_step_size_shrink_amount(0.1);
+    trainer.set_learning_rate_shrink_amount(0.1);
+    // The learning rate will start at 1e-3.
+    trainer.set_learning_rate(1e-3);

    // Now, what if your training dataset is so big it doesn't fit in RAM?  You

@@ -230,10 +239,10 @@ int main(int argc, char** argv) try
    std::vector<matrix<unsigned char>> mini_batch_samples;
    std::vector<unsigned long> mini_batch_labels;
    dlib::rand rnd(time(0));
-    // Loop until the trainer's automatic shrinking has shrunk the step size by
-    // 1e-3.  For the default shrinks amount of 0.1 this means stop after it
-    // shrinks it 3 times.
-    while(trainer.get_step_size() >= 1e-3)
+    // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
+    // Given our settings, this means it will stop training after it has shrunk the
+    // learning rate 3 times.
+    while(trainer.get_learning_rate() >= 1e-6)
    {
        mini_batch_samples.clear();
        mini_batch_labels.clear();
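The last hunk shows the pattern this example relies on: drive train_one_step() yourself and use get_learning_rate() as the stopping condition, since train_one_step() never checks get_min_learning_rate(). Below is a small self-contained sketch of that pattern with a trivial network and synthetic data; it is illustrative only and is not the truncated remainder of dnn_mnist_advanced_ex.cpp.

#include <dlib/dnn.h>
#include <vector>
using namespace dlib;

int sketch_train_one_step_loop()
{
    using net_type = loss_multiclass_log<fc<2, input<matrix<float>>>>;
    net_type net;
    dnn_trainer<net_type> trainer(net, sgd(0.0005, 0.9));
    trainer.set_learning_rate(1e-3);
    trainer.set_learning_rate_shrink_amount(0.1);
    trainer.set_iterations_without_progress_threshold(200);

    // Synthetic, unlearnable data: identical samples with alternating labels, so
    // the loss plateaus quickly and the automatic shrinking kicks in.
    std::vector<matrix<float>> samples;
    std::vector<unsigned long> labels;
    matrix<float> img(8,8);
    img = 1;
    for (int i = 0; i < 64; ++i)
    {
        samples.push_back(img);
        labels.push_back(i % 2);
    }

    // The loop condition, not get_min_learning_rate(), is what ends training here:
    // it stops once the learning rate has been shrunk three times (1e-3 -> 1e-6).
    while (trainer.get_learning_rate() >= 1e-6)
        trainer.train_one_step(samples, labels);

    return 0;
}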
examples/dnn_mnist_ex.cpp

@@ -89,7 +89,9 @@ int main(int argc, char** argv) try
    net_type net;
    // And then train it using the MNIST data.  The code below uses mini-batch stochastic
    // gradient descent with an initial learning rate of 0.01 to accomplish this.
-    dnn_trainer<net_type> trainer(net,sgd(0.01));
+    dnn_trainer<net_type> trainer(net);
+    trainer.set_learning_rate(0.01);
+    trainer.set_min_learning_rate(0.00001);
    trainer.set_mini_batch_size(128);
    trainer.be_verbose();
    // Since DNN training can take a long time, we can ask the trainer to save its state to

@@ -97,11 +99,11 @@ int main(int argc, char** argv) try
    // start it again it will begin where it left off rather than restarting the training
    // from scratch.
    trainer.set_synchronization_file("mnist_sync", std::chrono::seconds(20));
-    // Finally, this line begins training.  By default, it runs SGD with our specified step
-    // size until the loss stops decreasing.  Then it reduces the step size by a factor of
-    // 10 and continues running until the loss stops decreasing again.  It will reduce the
-    // step size 3 times and then terminate.  For a longer discussion, see the documentation
-    // of the dnn_trainer object.
+    // Finally, this line begins training.  By default, it runs SGD with our specified
+    // learning rate until the loss stops decreasing.  Then it reduces the learning rate by
+    // a factor of 10 and continues running until the loss stops decreasing again.  It will
+    // keep doing this until the learning rate has dropped below the min learning rate
+    // defined above or the maximum number of epochs as been executed (defaulted to 10000).
    trainer.train(training_images, training_labels);

    // At this point our net object should have learned how to classify MNIST images.  But
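Once train() returns, the net object holds the learned parameters. The short sketch below shows what typically follows in this kind of example (evaluation on held-out data and saving the network); it is a hedged sketch assuming the example's context, where testing_images and testing_labels would have been loaded alongside the training data, and <iostream> is already included.

// Evaluate the trained classifier on held-out images and save it to disk.
const std::vector<unsigned long> predicted = net(testing_images);
size_t num_right = 0;
for (size_t i = 0; i < testing_images.size(); ++i)
    if (predicted[i] == testing_labels[i])
        ++num_right;
std::cout << "testing accuracy: " << num_right/(double)testing_images.size() << std::endl;

// clean() discards transient training state so the serialized file stays small.
net.clean();
serialize("mnist_network.dat") << net;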