Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
D
dlib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
钟尚武
dlib
Commits
93bbe5ff
Commit
93bbe5ff
authored
May 15, 2016
by
Davis King
Browse files
Options
Browse Files
Download
Plain Diff
merged
parents
9763c471
66166c67
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
214 additions
and
184 deletions
+214
-184
core.h
dlib/dnn/core.h
+19
-18
core_abstract.h
dlib/dnn/core_abstract.h
+10
-8
solvers.h
dlib/dnn/solvers.h
+27
-27
solvers_abstract.h
dlib/dnn/solvers_abstract.h
+39
-26
trainer.h
dlib/dnn/trainer.h
+0
-0
trainer_abstract.h
dlib/dnn/trainer_abstract.h
+45
-42
dnn_mnist_advanced_ex.cpp
examples/dnn_mnist_advanced_ex.cpp
+66
-57
dnn_mnist_ex.cpp
examples/dnn_mnist_ex.cpp
+8
-6
No files found.
dlib/dnn/core.h
View file @
93bbe5ff
...
...
@@ -825,16 +825,16 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rat
e
)
{
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
// Don't try to adjust the parameters if this layer doesn't have any.
if
(
params_grad
.
size
()
!=
0
)
{
const
tensor
&
step
=
solvers
.
top
()(
details
.
get_layer_params
()
,
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
1
,
details
.
get_layer_params
(),
step_size
,
step
);
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
()
,
step
);
}
subnetwork
->
update_parameters
(
solvers
.
pop
(),
step_siz
e
);
subnetwork
->
update_parameters
(
solvers
.
pop
(),
learning_rat
e
);
}
const
tensor
&
get_parameter_gradient
(
...
...
@@ -1175,13 +1175,14 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rat
e
)
{
DLIB_CASSERT
(
solvers
.
size
()
>=
num_computational_layers
,
""
);
// Don't try to adjust the parameters if this layer doesn't have any.
if
(
params_grad
.
size
()
!=
0
)
{
const
tensor
&
step
=
solvers
.
top
()(
details
.
get_layer_params
(),
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
1
,
details
.
get_layer_params
(),
step_size
,
step
);
if
(
params_grad
.
size
()
!=
0
)
{
const
tensor
&
step
=
solvers
.
top
()(
learning_rate
,
details
,
static_cast
<
const
tensor
&>
(
params_grad
));
tt
::
add
(
details
.
get_layer_params
(),
details
.
get_layer_params
(),
step
);
}
}
...
...
@@ -1401,9 +1402,9 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rat
e
)
{
subnetwork
.
update_parameters
(
solvers
,
step_siz
e
);
subnetwork
.
update_parameters
(
solvers
,
learning_rat
e
);
}
const
tensor
&
get_parameter_gradient
(
...
...
@@ -1687,11 +1688,11 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rat
e
)
{
for
(
size_t
i
=
0
;
i
<
details
.
size
();
++
i
)
details
[
i
].
update_parameters
(
solvers
.
pop
(
comp_layers_in_each_group
*
i
),
step_siz
e
);
subnetwork
.
update_parameters
(
solvers
.
pop
(
comp_layers_in_each_group
*
details
.
size
()),
step_siz
e
);
details
[
i
].
update_parameters
(
solvers
.
pop
(
comp_layers_in_each_group
*
i
),
learning_rat
e
);
subnetwork
.
update_parameters
(
solvers
.
pop
(
comp_layers_in_each_group
*
details
.
size
()),
learning_rat
e
);
}
const
subnet_type
&
subnet
()
const
{
return
subnetwork
;
}
...
...
@@ -1905,7 +1906,7 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
/*solvers*/
,
double
/*
step_siz
e*/
)
void
update_parameters
(
sstack
<
solver_type
>
/*solvers*/
,
double
/*
learning_rat
e*/
)
{
// nothing to do
}
...
...
@@ -2248,10 +2249,10 @@ namespace dlib
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
double
learning_rat
e
)
{
subnetwork
.
update_parameters
(
solvers
,
step_siz
e
);
subnetwork
.
update_parameters
(
solvers
,
learning_rat
e
);
}
const
subnet_type
&
subnet
()
const
{
return
subnetwork
;
}
...
...
@@ -2542,9 +2543,9 @@ namespace dlib
}
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
)
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
learning_rat
e
)
{
subnetwork
.
update_parameters
(
solvers
,
step_siz
e
);
subnetwork
.
update_parameters
(
solvers
,
learning_rat
e
);
}
const
tensor
&
get_parameter_gradient
(
...
...
dlib/dnn/core_abstract.h
View file @
93bbe5ff
...
...
@@ -506,7 +506,7 @@ namespace dlib
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
double
learning_rat
e
);
/*!
requires
...
...
@@ -517,13 +517,14 @@ namespace dlib
if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 <
step_siz
e <= 1
- 0 <
learning_rat
e <= 1
ensures
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
solver object. This produces a parameter delta vector which we add to
the layer's parameters.
- The solvers use the given learning rate.
!*/
void
clean
(
...
...
@@ -944,7 +945,7 @@ namespace dlib
template
<
typename
solver_type
>
void
update_parameters
(
sstack
<
solver_type
>
solvers
,
double
step_siz
e
double
learning_rat
e
);
/*!
requires
...
...
@@ -955,13 +956,14 @@ namespace dlib
is, if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 <
step_siz
e <= 1
- 0 <
learning_rat
e <= 1
ensures
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
solver object. This produces a parameter delta vector which we add to
the layer's parameters.
- The solvers use the given learning rate.
!*/
// -------------
...
...
dlib/dnn/solvers.h
View file @
93bbe5ff
...
...
@@ -14,30 +14,34 @@ namespace dlib
public
:
sgd
(
float
learning_rate_
=
0
.
01
,
float
weight_decay_
=
0
.
0005
,
float
momentum_
=
0
.
9
float
weight_decay_
,
float
momentum_
)
{
weight_decay
=
weight_decay_
;
learning_rate
=
learning_rate_
;
momentum
=
momentum_
;
}
sgd
(
)
:
sgd
(
0
.
0005
,
0
.
9
)
{
}
float
get_momentum
(
)
const
{
return
momentum
;
}
float
get_weight_decay
(
)
const
{
return
weight_decay
;
}
float
get_learning_rate
(
)
const
{
return
learning_rate
;
}
template
<
typename
layer_type
>
const
tensor
&
operator
()
(
const
tensor
&
params
,
const
float
learning_rate
,
const
layer_type
&
l
,
const
tensor
&
params_grad
)
{
const
tensor
&
params
=
l
.
get_layer_params
();
DLIB_CASSERT
(
params
.
size
()
!=
0
,
""
);
if
(
v
.
size
()
==
0
)
{
...
...
@@ -54,10 +58,9 @@ namespace dlib
friend
void
serialize
(
const
sgd
&
item
,
std
::
ostream
&
out
)
{
serialize
(
"sgd"
,
out
);
serialize
(
"sgd
2
"
,
out
);
serialize
(
item
.
v
,
out
);
serialize
(
item
.
weight_decay
,
out
);
serialize
(
item
.
learning_rate
,
out
);
serialize
(
item
.
momentum
,
out
);
}
...
...
@@ -65,18 +68,16 @@ namespace dlib
{
std
::
string
version
;
deserialize
(
version
,
in
);
if
(
version
!=
"sgd"
)
if
(
version
!=
"sgd
2
"
)
throw
serialization_error
(
"Unexpected version found while deserializing dlib::sgd."
);
deserialize
(
item
.
v
,
in
);
deserialize
(
item
.
weight_decay
,
in
);
deserialize
(
item
.
learning_rate
,
in
);
deserialize
(
item
.
momentum
,
in
);
}
private
:
resizable_tensor
v
;
float
weight_decay
;
float
learning_rate
;
float
momentum
;
};
...
...
@@ -87,19 +88,21 @@ namespace dlib
public
:
adam
(
float
learning_rate_
=
0
.
001
,
float
weight_decay_
=
0
.
0005
,
float
momentum1_
=
0
.
9
,
float
momentum2_
=
0
.
999
float
weight_decay_
,
float
momentum1_
,
float
momentum2_
)
{
weight_decay
=
weight_decay_
;
learning_rate
=
learning_rate_
;
momentum1
=
momentum1_
;
momentum2
=
momentum2_
;
t
=
0
;
}
adam
(
)
:
adam
(
0
.
0005
,
0
.
9
,
0
.
999
)
{}
float
get_momentum1
(
)
const
{
return
momentum1
;
}
...
...
@@ -109,14 +112,14 @@ namespace dlib
float
get_weight_decay
(
)
const
{
return
weight_decay
;
}
float
get_learning_rate
(
)
const
{
return
learning_rate
;
}
template
<
typename
layer_type
>
const
tensor
&
operator
()
(
const
tensor
&
params
,
const
float
learning_rate
,
const
layer_type
&
l
,
const
tensor
&
params_grad
)
{
const
tensor
&
params
=
l
.
get_layer_params
();
DLIB_CASSERT
(
params
.
size
()
!=
0
,
""
);
if
(
v
.
size
()
==
0
)
{
...
...
@@ -136,12 +139,11 @@ namespace dlib
friend
void
serialize
(
const
adam
&
item
,
std
::
ostream
&
out
)
{
serialize
(
"adam"
,
out
);
serialize
(
"adam
2
"
,
out
);
serialize
(
item
.
m
,
out
);
serialize
(
item
.
v
,
out
);
serialize
(
item
.
s
,
out
);
serialize
(
item
.
weight_decay
,
out
);
serialize
(
item
.
learning_rate
,
out
);
serialize
(
item
.
momentum1
,
out
);
serialize
(
item
.
momentum2
,
out
);
serialize
(
item
.
t
,
out
);
...
...
@@ -151,13 +153,12 @@ namespace dlib
{
std
::
string
version
;
deserialize
(
version
,
in
);
if
(
version
!=
"adam"
)
if
(
version
!=
"adam
2
"
)
throw
serialization_error
(
"Unexpected version found while deserializing dlib::adam."
);
deserialize
(
item
.
m
,
in
);
deserialize
(
item
.
v
,
in
);
deserialize
(
item
.
s
,
in
);
deserialize
(
item
.
weight_decay
,
in
);
deserialize
(
item
.
learning_rate
,
in
);
deserialize
(
item
.
momentum1
,
in
);
deserialize
(
item
.
momentum2
,
in
);
deserialize
(
item
.
t
,
in
);
...
...
@@ -168,7 +169,6 @@ namespace dlib
resizable_tensor
v
;
resizable_tensor
s
;
float
weight_decay
;
float
learning_rate
;
float
momentum1
;
float
momentum2
;
float
t
;
...
...
dlib/dnn/solvers_abstract.h
View file @
93bbe5ff
...
...
@@ -33,22 +33,28 @@ namespace dlib
EXAMPLE_SOLVER
(
);
template
<
typename
layer_type
>
const
tensor
&
operator
()
(
const
tensor
&
params
,
const
float
learning_rate
,
const
layer_type
&
l
,
const
tensor
&
params_grad
)
/*!
requires
-
params
.size() != 0
- have_same_dimensions(
params
, params_grad) == true.
-
l.get_layer_params()
.size() != 0
- have_same_dimensions(
l.get_layer_params()
, params_grad) == true.
- When this function is invoked on a particular solver instance, it is
always supplied with
parameters from the same layer instance. That is,
the solver is allowed to remember things from one invocation to another
and to assume that it is being serially applied to optimize the same
always supplied with
the same layer instance, l. That is, the solver is
allowed to remember things from one invocation to another and to assume
that it is being serially applied to optimize the same layer's
parameters.
ensures
- Returns a step vector V that is intended to be used to update the
parameters by adding V to params.
parameters by adding V to l.get_layer_params().
- This function will use the given "learning rate" to compute V. How the
learning rate is used is solver dependent. But in general the learning
rate should be used to select the step size, i.e. to somehow determine
the magnitude of V.
!*/
};
...
...
@@ -68,32 +74,34 @@ namespace dlib
WHAT THIS OBJECT REPRESENTS
This object implements the EXAMPLE_SOLVER interface defined above. It is a
basic stochastic gradient descent solver which uses momentum and weight
decay. In particular, it performs the following update each time the
solver is invoked:
v = momentum*v - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
l.get_layer_params() += v;
Here v is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
decay. In particular, it computes the update vector V according to:
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
!*/
public
:
sgd
(
float
learning_rate
=
0
.
01
,
float
weight_decay
=
0
.
0005
,
float
momentum
=
0
.
9
);
/*!
ensures
- #get_weight_decay() == 0.0005
- #get_momentum() == 0.9
!*/
sgd
(
float
weight_decay
,
float
momentum
);
/*!
requires
- learning_rate > 0
- weight_decay >= 0
- momentum >= 0
ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay
- #get_momentum() == momentum
!*/
float
get_learning_rate
()
const
;
float
get_weight_decay
()
const
;
float
get_momentum
()
const
;
};
...
...
@@ -120,25 +128,30 @@ namespace dlib
public
:
adam
(
float
learning_rate
=
0
.
001
,
float
weight_decay
=
0
.
0005
,
float
momentum1
=
0
.
9
,
float
momentum2
=
0
.
999
);
/*!
ensures
- #get_weight_decay() == 0.0005
- #get_momentum1() == 0.9
- #get_momentum2() == 0.999
!*/
adam
(
float
weight_decay
,
float
momentum1
,
float
momentum2
);
/*!
requires
- learning_rate > 0
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay
- #get_momentum1() == momentum1
- #get_momentum2() == momentum2
!*/
float
get_learning_rate
()
const
;
float
get_weight_decay
()
const
;
float
get_momentum1
()
const
;
float
get_momentum2
()
const
;
...
...
dlib/dnn/trainer.h
View file @
93bbe5ff
This diff is collapsed.
Click to expand it.
dlib/dnn/trainer_abstract.h
View file @
93bbe5ff
...
...
@@ -68,10 +68,10 @@ namespace dlib
provided solver instance.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_
step_size() == 1
- #get_min_
step_size() == 1e-3
- #get_
learning_rate() == 1e-2
- #get_min_
learning_rate() == 1e-5
- #get_iterations_without_progress_threshold() == 2000
- #get_
step_siz
e_shrink() == 0.1
- #get_
learning_rat
e_shrink() == 0.1
- if (cuda_extra_devices.size() > 0) then
- This object will use multiple graphics cards to run the learning
algorithms. In particular, it will always use whatever device is
...
...
@@ -102,6 +102,8 @@ namespace dlib
get_net(). In particular, the first layer's solver is
get_solvers()[0], the second layer's solver is
get_solvers()[1], and so on.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
unsigned
long
get_mini_batch_size
(
...
...
@@ -142,54 +144,51 @@ namespace dlib
- #get_max_num_epochs() == num
!*/
void
set_
step_siz
e
(
double
ss
void
set_
learning_rat
e
(
double
lr
);
/*!
requires
-
ss
> 0
-
lr
> 0
ensures
- #get_step_size() == ss
- #get_learning_rate() == lr
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
double
get_
step_siz
e
(
double
get_
learning_rat
e
(
)
const
;
/*!
ensures
- During each training step, a solver tells us how to modify the parameters
of each layer in the network. It does this by outputting a step vector,
that when added to the parameters, will hopefully result in improved
network performance. In our case, at each step, we multiply the step
vector from the solver by get_step_size() before adding it to the
parameters. Therefore, get_step_size() controls the "learning rate" used
during training.
It should be emphasized that this learning rate applied by dnn_trainer is
independent from any learning rate scheduling a solver might itself apply
to the step vector it outputs. That is, the dnn_trainer doesn't know
what the solver is doing. It just takes the output from a solver and
multiplies it by get_step_size() before applying the step vector.
of each layer in the network. It does this by outputting a step vector
that, when added to the parameters, will hopefully result in improved
network performance. The learning rate is one of the inputs to the
solver and influences the size of this step vector.
!*/
void
set_min_
step_siz
e
(
double
ss
void
set_min_
learning_rat
e
(
double
lr
);
/*!
requires
-
ss
> 0
-
lr
> 0
ensures
- #get_min_
step_size() == ss
- #get_min_
learning_rate() == lr
!*/
double
get_min_
step_siz
e
(
double
get_min_
learning_rat
e
(
)
const
;
/*!
ensures
- During training, this object will test if progress is still being made
and if it isn't then it will reduce get_step_size() by setting it to
get_step_size()*get_step_size_shrink(). However, it will not reduce it
below get_min_step_size(). Once this minimum step size is crossed the
training will terminate.
- During training via this->train(), this object will test if progress is
still being made and if it isn't then it will reduce get_learning_rate()
by setting it to get_learning_rate()*get_learning_rate_shrink().
However, it will not reduce it below get_min_learning_rate(). Once this
minimum learning rate is crossed the training will terminate.
- get_min_learning_rate() doesn't apply if you are using train_one_step().
You can keep calling train_one_step() as many times as you want and the
learning rate will drop arbitrarily close to 0 if you run long enough.
!*/
void
set_iterations_without_progress_threshold
(
...
...
@@ -209,33 +208,33 @@ namespace dlib
get_iterations_without_progress_threshold() mini-batch results and
applying the statistical test defined by the running_gradient object to
see if the training error is getting smaller. If it isn't being reduced
then get_
step_size() is made smaller by a factor of get_step_siz
e_shrink().
then get_
learning_rate() is made smaller by a factor of get_learning_rat
e_shrink().
Therefore, get_iterations_without_progress_threshold() should always be
set to something sensibly large so that this test can be done with
reasonably high confidence. Think of this test as saying "if the loss
hasn't decreased for the previous get_iterations_without_progress_threshold() iterations
then shrink the
step siz
e".
then shrink the
learning rat
e".
!*/
void
set_
step_siz
e_shrink_amount
(
void
set_
learning_rat
e_shrink_amount
(
double
shrink
);
/*!
requires
- 0 < shrink && shrink <= 1
ensures
- #get_
step_siz
e_shrink() == shrink
- #get_
learning_rat
e_shrink() == shrink
!*/
double
get_
step_siz
e_shrink
(
double
get_
learning_rat
e_shrink
(
)
const
;
/*!
ensures
- Whenever the training routine thinks it isn't making progress anymore it
will reduce get_
step_size() by multiplying it by get_step_siz
e_shrink().
- You can disable the automatic
step siz
e reduction by setting
get_
step_siz
e_shrink() to 1.
will reduce get_
learning_rate() by multiplying it by get_learning_rat
e_shrink().
- You can disable the automatic
learning rat
e reduction by setting
get_
learning_rat
e_shrink() to 1.
!*/
void
be_verbose
(
...
...
@@ -283,8 +282,8 @@ namespace dlib
- Trains a supervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end(), labels.begin()).
- The optimizer will run until get_
step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- The optimizer will run until get_
learning_rate() < get_min_learning_rate()
or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or
...
...
@@ -309,8 +308,8 @@ namespace dlib
- Trains an unsupervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end()).
- The optimizer will run until get_
step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- The optimizer will run until get_
learning_rate() < get_min_learning_rate()
or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or
...
...
@@ -381,6 +380,8 @@ namespace dlib
- Note that, if be_verbose() has been called, then this object will
automatically call clear_average_loss() periodically when it logs the
loss to the console.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
void
clear_average_loss
(
...
...
@@ -393,6 +394,8 @@ namespace dlib
applied during training. Calling clear_average_loss() resets the
running_stats object so it forgets about all previous loss values
observed.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
};
...
...
examples/dnn_mnist_advanced_ex.cpp
View file @
93bbe5ff
...
...
@@ -33,23 +33,27 @@ using namespace dlib;
// It exists solely so other layers can refer to it. In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer. This combination allows us to
// implement skip and residual style networks.
template
<
int
stride
,
typename
SUBNET
>
using
base_res
=
relu
<
add_prev1
<
bn_con
<
con
<
8
,
3
,
3
,
1
,
1
,
relu
<
bn_con
<
con
<
8
,
3
,
3
,
stride
,
stride
,
tag1
<
SUBNET
>>>>>>>>
;
// Let's also define the same block but with all the batch normalization layers
// replaced with affine transform layers. We will use this type of construction
// when testing our networks.
template
<
int
stride
,
typename
SUBNET
>
using
base_ares
=
relu
<
add_prev1
<
affine
<
con
<
8
,
3
,
3
,
1
,
1
,
relu
<
affine
<
con
<
8
,
3
,
3
,
stride
,
stride
,
tag1
<
SUBNET
>>>>>>>>
;
// And of course we can define more alias templates based on previously defined
// alias templates. The _down versions downsample the inputs by a factor of 2
// while the res and ares layer types don't.
template
<
typename
SUBNET
>
using
res
=
base_res
<
1
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
res_down
=
base_res
<
2
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares
=
base_ares
<
1
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares_down
=
base_ares
<
2
,
SUBNET
>
;
// implement skip and residual style networks. We have also made base_res
// parameterized by BN, which will let us insert different batch normalization
// layers.
template
<
template
<
typename
>
class
BN
,
typename
SUBNET
>
using
base_res
=
relu
<
add_prev1
<
BN
<
con
<
8
,
3
,
3
,
1
,
1
,
relu
<
BN
<
con
<
8
,
3
,
3
,
1
,
1
,
tag1
<
SUBNET
>>>>>>>>
;
// We also want a residual block that begins by doing downsampling. We can
// reuse base_res to define it like this:
template
<
template
<
typename
>
class
BN
,
typename
SUBNET
>
using
base_res_down
=
base_res
<
BN
,
avg_pool
<
1
,
1
,
2
,
2
,
SUBNET
>>
;
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// downsample. Also, res and res_down use batch normalization while ares and
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// networks.
template
<
typename
SUBNET
>
using
res
=
base_res
<
bn_con
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares
=
base_res
<
affine
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
res_down
=
base_res_down
<
bn_con
,
SUBNET
>
;
template
<
typename
SUBNET
>
using
ares_down
=
base_res_down
<
affine
,
SUBNET
>
;
...
...
@@ -141,37 +145,39 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's
// long, but you get the idea):
/*
The pnet has 12
5
layers in it.
layer<0>
loss_multiclass_log
layer<1>
fc (num_outputs=10)
layer<2>
avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3>
prelu (initial_param_value=0.2)
layer<4>
add_prev
layer<5>
bn_con
layer<6>
con (num_filters=8, nr=3, nc=3, stride_y=1, stride
_x=1)
layer<7>
prelu (initial_param_value=0.25)
layer<8>
bn_con
layer<9>
con (num_filters=8, nr=3, nc=3, stride_y=1, stride
_x=1)
layer<10>
tag1
The pnet has 12
7
layers in it.
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10)
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<5> bn_con
layer<6>
con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding
_x=1)
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con
layer<9>
con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding
_x=1)
layer<10> tag1
...
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
layer<34> tag1
layer<35> tag4
layer<36> prelu (initial_param_value=0.3)
layer<37> add_prev
layer<38> bn_con
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<34> tag1
layer<35> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<36> tag4
layer<37> prelu (initial_param_value=0.3)
layer<38> add_prev
layer<39> bn_con
...
layer<114> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
layer<115> tag1
layer<116> relu
layer<117> add_prev
layer<118> bn_con
layer<119> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<120> relu
layer<121> bn_con
layer<122> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<123> tag1
layer<124> input<matrix>
layer<115> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<116> tag1
layer<117> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<118> relu
layer<119> add_prev
layer<120> bn_con
layer<121> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<122> relu
layer<123> bn_con
layer<124> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<125> tag1
layer<126> input<matrix>
*/
// Now that we know the index numbers for each layer, we can access them
...
...
@@ -189,7 +195,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say:
layer
<
tag4
,
1
>
(
pnet
);
// Equivalent to layer<3
5
+1>(pnet).
layer
<
tag4
,
1
>
(
pnet
);
// Equivalent to layer<3
6
+1>(pnet).
// Or to access the layer 2 layers after tag4:
layer
<
tag4
,
2
>
(
pnet
);
...
...
@@ -203,23 +209,26 @@ int main(int argc, char** argv) try
// talk about training networks!
// The dnn_trainer will use SGD by default, but you can tell it to use
// different solvers like adam.
dnn_trainer
<
net_type
,
adam
>
trainer
(
net
,
adam
(
0.001
));
// different solvers like adam with a weight decay of 0.0005 and the given
// momentum parameters.
dnn_trainer
<
net_type
,
adam
>
trainer
(
net
,
adam
(
0.0005
,
0.9
,
0.999
));
// Also, if you have multiple graphics cards you can tell the trainer to use
// them together to make the training faster. For example, replacing the
// above constructor call with this one would cause it to use GPU cards 0
// and 1.
//dnn_trainer<net_type,adam> trainer(net,adam(0.00
1
), {0,1});
//dnn_trainer<net_type,adam> trainer(net,adam(0.00
05, 0.9, 0.999
), {0,1});
trainer
.
be_verbose
();
trainer
.
set_synchronization_file
(
"mnist_resnet_sync"
,
std
::
chrono
::
seconds
(
100
));
// While the trainer is running it keeps an eye on the training error. If
// it looks like the error hasn't decreased for the last 2000 iterations it
// will automatically reduce the
step siz
e by 0.1. You can change these
// will automatically reduce the
learning rat
e by 0.1. You can change these
// default parameters to some other values by calling these functions. Or
// disable the
m
entirely by setting the shrink amount to 1.
// disable the
automatic shrinking
entirely by setting the shrink amount to 1.
trainer
.
set_iterations_without_progress_threshold
(
2000
);
trainer
.
set_step_size_shrink_amount
(
0.1
);
trainer
.
set_learning_rate_shrink_amount
(
0.1
);
// The learning rate will start at 1e-3.
trainer
.
set_learning_rate
(
1e-3
);
// Now, what if your training dataset is so big it doesn't fit in RAM? You
...
...
@@ -230,10 +239,10 @@ int main(int argc, char** argv) try
std
::
vector
<
matrix
<
unsigned
char
>>
mini_batch_samples
;
std
::
vector
<
unsigned
long
>
mini_batch_labels
;
dlib
::
rand
rnd
(
time
(
0
));
// Loop until the trainer's automatic shrinking has shrunk the
step size by
//
1e-3. For the default shrink amount of 0.1 this means stop after it
//
shrinks it
3 times.
while
(
trainer
.
get_
step_size
()
>=
1e-3
)
// Loop until the trainer's automatic shrinking has shrunk the
learning rate to 1e-6.
//
Given our settings, this means it will stop training after it has shrunk the
//
learning rate
3 times.
while
(
trainer
.
get_
learning_rate
()
>=
1e-6
)
{
mini_batch_samples
.
clear
();
mini_batch_labels
.
clear
();
...
...
examples/dnn_mnist_ex.cpp
View file @
93bbe5ff
...
...
@@ -89,7 +89,9 @@ int main(int argc, char** argv) try
net_type
net
;
// And then train it using the MNIST data. The code below uses mini-batch stochastic
// gradient descent with an initial learning rate of 0.01 to accomplish this.
dnn_trainer
<
net_type
>
trainer
(
net
,
sgd
(
0.01
));
dnn_trainer
<
net_type
>
trainer
(
net
);
trainer
.
set_learning_rate
(
0.01
);
trainer
.
set_min_learning_rate
(
0.00001
);
trainer
.
set_mini_batch_size
(
128
);
trainer
.
be_verbose
();
// Since DNN training can take a long time, we can ask the trainer to save its state to
...
...
@@ -97,11 +99,11 @@ int main(int argc, char** argv) try
// start it again it will begin where it left off rather than restarting the training
// from scratch.
trainer
.
set_synchronization_file
(
"mnist_sync"
,
std
::
chrono
::
seconds
(
20
));
// Finally, this line begins training. By default, it runs SGD with our specified
step
//
size until the loss stops decreasing. Then it reduces the step size by a factor of
//
10 and continues running until the loss stops decreasing again. It will reduce the
//
step size 3 times and then terminate. For a longer discussion, see the documentation
//
of the dnn_trainer object.
// Finally, this line begins training. By default, it runs SGD with our specified
//
learning rate until the loss stops decreasing. Then it reduces the learning rate by
//
a factor of 10 and continues running until the loss stops decreasing again. It will
//
keep doing this until the learning rate has dropped below the min learning rate
//
defined above or the maximum number of epochs has been executed (defaulted to 10000).
trainer
.
train
(
training_images
,
training_labels
);
// At this point our net object should have learned how to classify MNIST images. But
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment