Commit 95cb5697 authored by Davis King

Cleaned up trainer API and made the verbose output include information about
how much the current step size has converged.
parent c5f83cbe
@@ -142,7 +142,8 @@ namespace dlib
     last_time = now_time;
     std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     clear_average_loss();
 }
@@ -167,7 +168,8 @@ namespace dlib
     last_time = now_time;
     std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     clear_average_loss();
 }
@@ -207,7 +209,8 @@ namespace dlib
     auto iter = epoch_iteration + epoch_pos/(double)data.size();
     std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     }
 }
@@ -229,7 +232,8 @@ namespace dlib
     // are for full epoch status statements.
     std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     }
 }
@@ -270,7 +274,8 @@ namespace dlib
     auto iter = epoch_iteration + epoch_pos/(double)data.size();
     std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     }
 }
@@ -290,7 +295,8 @@ namespace dlib
     // are for full epoch status statements.
     std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
               << "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
-              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
+              << "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
+              << "steps without apparent progress: " << steps_without_progress
               << std::endl;
     }
 }
@@ -359,17 +365,17 @@ namespace dlib
     return min_step_size;
 }

-void set_iterations_between_step_size_adjust (
-    unsigned long min_iter
+void set_iterations_without_progress_threshold (
+    unsigned long thresh
 )
 {
-    iter_between_step_size_adjust = min_iter;
+    iter_without_progress_thresh = thresh;
 }

-unsigned long get_iterations_between_step_size_adjust (
+unsigned long get_iterations_without_progress_threshold (
 ) const
 {
-    return iter_between_step_size_adjust;
+    return iter_without_progress_thresh;
 }

 void set_step_size_shrink_amount (
@@ -396,15 +402,16 @@ namespace dlib
 void record_loss(double loss)
 {
     // Say that we will check if the gradient is bad 200 times during each
-    // iter_between_step_size_adjust interval of network updates.  This kind of
+    // iter_without_progress_thresh interval of network updates.  This kind of
     // budgeting causes our gradient checking to use a fixed amount of
     // computational resources, regardless of the size of
-    // iter_between_step_size_adjust.
+    // iter_without_progress_thresh.
     gradient_check_budget += 200;

     rs.add(loss);
     previous_loss_values.push_back(loss);
-    if (previous_loss_values.size() > iter_between_step_size_adjust)
+    // discard really old loss values.
+    while (previous_loss_values.size() > iter_without_progress_thresh)
         previous_loss_values.pop_front();
 }
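
The budgeting comment above has a simple consequence worth spelling out: crediting 200 units per mini-batch and resetting once the budget exceeds the threshold means the expensive statistical check fires roughly 200 times per threshold interval, whatever the threshold's size. A minimal standalone sketch of that arithmetic (not dlib code; the names only mirror the members above):

    #include <iostream>

    int main()
    {
        const unsigned long thresh = 2000;  // stands in for iter_without_progress_thresh
        unsigned long budget = 0;
        unsigned long checks = 0;

        for (unsigned long step = 0; step < thresh; ++step)
        {
            budget += 200;          // same crediting as record_loss()
            if (budget > thresh)    // same trigger as the training thread uses
            {
                budget = 0;
                ++checks;           // stands in for the expensive statistical test
            }
        }
        // Prints a value near 200 (181 for thresh == 2000), and it stays near
        // 200 no matter how large thresh is made.
        std::cout << "checks per interval: " << checks << "\n";
    }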
@@ -417,7 +424,7 @@ namespace dlib
 void run_update(job_t& next_job, const no_label_type&)
 {
-    no_label_type pick_wich_run_update;
+    no_label_type pick_which_run_update;
     double loss = net.update(next_job.t, make_sstack(solvers), step_size);
     record_loss(loss);
 }
@@ -427,26 +434,28 @@ namespace dlib
     // Make sure this thread uses the same cuda device as the thread that created
     // the dnn_trainer object.
     dlib::cuda::set_device(cuda_device_id);
-    label_type pick_wich_run_update;
+    label_type pick_which_run_update;
     job_t next_job;
     while(job_pipe.dequeue(next_job))
     {
         // call net.update() but pick the right version for unsupervised or
         // supervised training based on the type of label_type.
-        run_update(next_job, pick_wich_run_update);
+        run_update(next_job, pick_which_run_update);

         // If we have been running for a while then check if the loss is still
         // dropping.  If it isn't then we will reduce the step size.  Note that we
         // have a "budget" that prevents us from calling
-        // probability_gradient_greater_than() every iteration.  We do this because
+        // count_steps_without_decrease() every iteration.  We do this because
         // it can be expensive to compute when previous_loss_values is large.
-        if (previous_loss_values.size() >= iter_between_step_size_adjust &&
-            gradient_check_budget > previous_loss_values.size())
+        if (gradient_check_budget > iter_without_progress_thresh)
         {
             gradient_check_budget = 0;
-            if (probability_gradient_greater_than(previous_loss_values, 0) > 0.49)
+            steps_without_progress = count_steps_without_decrease(previous_loss_values);
+            if (steps_without_progress >= iter_without_progress_thresh)
             {
+                // optimization has flattened out, so drop the learning rate.
                 step_size = step_size_shrink*step_size;
+                steps_without_progress = 0;
                 previous_loss_values.clear();
             }
         }
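
To make the new shrink logic concrete: count_steps_without_decrease() asks how many of the most recent mini-batch losses have gone by without evidence of improvement. Below is a deliberately naive stand-in for that idea, not dlib's implementation (which, per the documentation hunk further down, applies the statistical test defined by the running_gradient object so it can cope with noisy losses):

    #include <deque>
    #include <limits>

    // Count the steps since the loss history last reached a new minimum.
    unsigned long naive_steps_without_decrease(const std::deque<double>& losses)
    {
        double best = std::numeric_limits<double>::infinity();
        unsigned long steps_since_improvement = 0;
        for (const double loss : losses)
        {
            if (loss < best)
            {
                best = loss;
                steps_since_improvement = 0;
            }
            else
            {
                ++steps_since_improvement;
            }
        }
        return steps_since_improvement;
    }

When the count reaches iter_without_progress_thresh, the trainer multiplies step_size by step_size_shrink and clears previous_loss_values, so the test restarts from scratch at the new step size. Note that a raw running minimum like the sketch above resets whenever noise happens to produce a new low, which is exactly why the real test is statistical rather than a simple comparison.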
@@ -475,7 +484,8 @@ namespace dlib
     cuda_device_id = dlib::cuda::get_device();
     step_size = 1;
     min_step_size = 1e-3;
-    iter_between_step_size_adjust = 2000;
+    iter_without_progress_thresh = 2000;
+    steps_without_progress = 0;
     step_size_shrink = 0.1;
     epoch_iteration = 0;
     epoch_pos = 0;
@@ -491,7 +501,7 @@ namespace dlib
 friend void serialize(const dnn_trainer& item, std::ostream& out)
 {
     item.wait_for_thread_to_pause();
-    int version = 4;
+    int version = 5;
     serialize(version, out);
     size_t nl = dnn_trainer::num_layers;
@@ -505,7 +515,8 @@ namespace dlib
     serialize(item.solvers, out);
     serialize(item.step_size.load(), out);
     serialize(item.min_step_size, out);
-    serialize(item.iter_between_step_size_adjust.load(), out);
+    serialize(item.iter_without_progress_thresh.load(), out);
+    serialize(item.steps_without_progress.load(), out);
     serialize(item.step_size_shrink.load(), out);
     serialize(item.epoch_iteration, out);
     serialize(item.epoch_pos, out);
@@ -516,7 +527,7 @@ namespace dlib
     item.wait_for_thread_to_pause();
     int version = 0;
     deserialize(version, in);
-    if (version != 4)
+    if (version != 5)
         throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
     size_t num_layers = 0;
@@ -540,7 +551,8 @@ namespace dlib
     deserialize(item.solvers, in);
     deserialize(dtemp, in); item.step_size = dtemp;
     deserialize(item.min_step_size, in);
-    deserialize(ltemp, in); item.iter_between_step_size_adjust = ltemp;
+    deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp;
+    deserialize(ltemp, in); item.steps_without_progress = ltemp;
     deserialize(dtemp, in); item.step_size_shrink = dtemp;
     deserialize(item.epoch_iteration, in);
     deserialize(item.epoch_pos, in);
@@ -592,7 +604,8 @@ namespace dlib
     std::vector<solver_type> solvers;
     std::atomic<double> step_size;
     double min_step_size;
-    std::atomic<unsigned long> iter_between_step_size_adjust;
+    std::atomic<unsigned long> iter_without_progress_thresh;
+    std::atomic<unsigned long> steps_without_progress;
     std::atomic<double> step_size_shrink;
     std::chrono::time_point<std::chrono::system_clock> last_sync_time;
     std::string sync_filename;
...
@@ -61,7 +61,7 @@ namespace dlib
     - #get_mini_batch_size() == 128
     - #get_step_size() == 1
     - #get_min_step_size() == 1e-3
-    - #get_iterations_between_step_size_adjust() == 2000
+    - #get_iterations_without_progress_threshold() == 2000
     - #get_step_size_shrink() == 0.1
 !*/
@@ -193,27 +193,30 @@ namespace dlib
         training will terminate.
 !*/

-void set_iterations_between_step_size_adjust (
-    unsigned long min_iter
+void set_iterations_without_progress_threshold (
+    unsigned long thresh
 );
 /*!
     ensures
-        - #get_iterations_between_step_size_adjust() == min_iter
+        - #get_iterations_without_progress_threshold() == thresh
 !*/

-unsigned long get_iterations_between_step_size_adjust (
+unsigned long get_iterations_without_progress_threshold (
 ) const;
 /*!
     ensures
         - This object monitors the progress of training and estimates if the
-          training error is being reduced.  It does this by looking at
-          get_iterations_between_step_size_adjust() mini-batch results and applying
-          the statistical test defined by the running_gradient object to see if the
-          training error is getting smaller.
+          training error is being reduced.  It does this by looking at the previous
+          get_iterations_without_progress_threshold() mini-batch results and
+          applying the statistical test defined by the running_gradient object to
+          see if the training error is getting smaller.  If it isn't being reduced
+          then get_step_size() is made smaller by a factor of get_step_size_shrink().

-          Therefore, get_iterations_between_step_size_adjust() should always be set
-          to something sensibly large so that this test can be done with reasonably
-          high confidence.
+          Therefore, get_iterations_without_progress_threshold() should always be
+          set to something sensibly large so that this test can be done with
+          reasonably high confidence.  Think of this test as saying "if the loss
+          hasn't been reduced for the previous get_iterations_without_progress_threshold()
+          then shrink the step size".
 !*/

 void set_step_size_shrink_amount (
...
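
Putting the renamed API together, a minimal usage sketch. This is not part of the commit: the network definition and training data are placeholders, and the layer syntax follows dlib's released DNN examples, which may differ slightly from the API at this exact revision.

    #include <dlib/dnn.h>
    #include <vector>

    using namespace dlib;

    // Placeholder toy network; any dlib network type is configured the same way.
    using net_type = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

    int main()
    {
        net_type net;
        dnn_trainer<net_type> trainer(net);

        trainer.set_step_size(0.1);
        trainer.set_min_step_size(1e-4);
        // Renamed in this commit from set_iterations_between_step_size_adjust().
        // If count_steps_without_decrease() reports this many mini-batches with
        // no apparent progress, the step size shrinks by get_step_size_shrink().
        trainer.set_iterations_without_progress_threshold(2000);
        trainer.be_verbose();  // now also prints "steps without apparent progress"

        std::vector<matrix<float>> images;   // placeholder training data
        std::vector<unsigned long> labels;   // placeholder labels
        // trainer.train(images, labels);    // runs until step size < get_min_step_size()
    }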