Commit 8eb9e295 authored by Davis King

Made dnn_trainer sync its state to two separate sync files that it alternates
between.  This should make syncing more robust to sudden hardware failure that
happens right when saving to disk.
parent a06b5292
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <future> #include <future>
#include <exception> #include <exception>
#include <mutex> #include <mutex>
#include "../dir_nav.h"
namespace dlib namespace dlib
{ {
...@@ -409,7 +410,7 @@ namespace dlib ...@@ -409,7 +410,7 @@ namespace dlib
time_between_syncs = time_between_syncs_; time_between_syncs = time_between_syncs_;
// check if the sync file already exists, if it does we should load it. // check if the sync file already exists, if it does we should load it.
std::ifstream fin(sync_filename, std::ios::binary); std::ifstream fin(newest_syncfile(), std::ios::binary);
if (fin) if (fin)
deserialize(*this, fin); deserialize(*this, fin);
} }
...@@ -980,26 +981,20 @@ namespace dlib ...@@ -980,26 +981,20 @@ namespace dlib
// previously saved state in the hopes that the problem won't reoccur. // previously saved state in the hopes that the problem won't reoccur.
if (loss_increased_since_last_disk_sync()) if (loss_increased_since_last_disk_sync())
{ {
std::ifstream fin(sync_filename, std::ios::binary); std::ifstream fin(newest_syncfile(), std::ios::binary);
deserialize(*this, fin); deserialize(*this, fin);
sync_file_reloaded = true; sync_file_reloaded = true;
if (verbose) if (verbose)
std::cout << "Loss has been increasing, reloading saved state from " << sync_filename << std::endl; std::cout << "Loss has been increasing, reloading saved state from " << newest_syncfile() << std::endl;
} }
else else
{ {
// save our state to a temp file const std::string filename = oldest_syncfile();
const std::string tempfile = sync_filename + ".tmp"; serialize(filename) << *this;
serialize(tempfile) << *this;
// Now that we know the state is safely saved to disk, delete the old sync
// file and move the .tmp file to it.
std::remove(sync_filename.c_str());
std::rename(tempfile.c_str(), sync_filename.c_str());
if (verbose) if (verbose)
std::cout << "Saved state to " << sync_filename << std::endl; std::cout << "Saved state to " << filename << std::endl;
} }
last_sync_time = std::chrono::system_clock::now(); last_sync_time = std::chrono::system_clock::now();
...@@ -1008,12 +1003,24 @@ namespace dlib ...@@ -1008,12 +1003,24 @@ namespace dlib
} }
} }
// Returns the name of the sync file to load from: whichever of sync_filename
// and sync_filename+"_" is picked by select_newest_file().
std::string newest_syncfile (
)
{
    const std::string alternate_filename = sync_filename + "_";
    return select_newest_file(sync_filename, alternate_filename);
}
// Returns the name of the sync file to overwrite next: whichever of
// sync_filename and sync_filename+"_" is picked by select_oldest_file().
std::string oldest_syncfile (
)
{
    const std::string alternate_filename = sync_filename + "_";
    return select_oldest_file(sync_filename, alternate_filename);
}
bool loss_increased_since_last_disk_sync() bool loss_increased_since_last_disk_sync()
{ {
size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync; size_t gradient_updates_since_last_sync = main_iteration_counter - main_iteration_counter_at_last_disk_sync;
// if we haven't synced anything to disk yet then return false. // if we haven't synced anything to disk yet then return false.
if (!std::ifstream(sync_filename, std::ios::binary)) if (!std::ifstream(newest_syncfile(), std::ios::binary))
return false; return false;
for (auto x : previous_loss_values) for (auto x : previous_loss_values)
......
...@@ -336,10 +336,17 @@ namespace dlib ...@@ -336,10 +336,17 @@ namespace dlib
train_one_step(), this object will save its entire state, including the train_one_step(), this object will save its entire state, including the
state of get_net(), to disk in the file named filename every state of get_net(), to disk in the file named filename every
time_between_syncs seconds. time_between_syncs seconds.
- if the filename file already exists then the state of this trainer will - If the filename file already exists then the state of this trainer will
be loaded from that file by this call to set_synchronization_file(). be loaded from that file by this call to set_synchronization_file().
This allows you to resume a training session which was previously This allows you to resume a training session which was previously
interrupted. interrupted.
- It should be noted that when saving, the trainer will alternate between
saving to a file called filename and another file called filename+"_".
We do this because it's possible that your computer might crash (not
because of dlib, just in general) before the data is safely saved to
disk. This way, you will always have a backup file if the write to disk
gets corrupted or is incomplete. Moreover, when loading, we will always
load from the newest of the two possible files.
!*/ !*/
void train ( void train (
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment