Potential fix for training getting stuck when the data loader fails. (#638)

Summary: Pull Request resolved: https://github.com/facebookresearch/Detectron/pull/638 — Potential fix for training getting stuck when the data loader fails. Reviewed By: rbgirshick. Differential Revision: D9513621. fbshipit-source-id: 123974eac83f40ef2f582a90fedea790fdc442d1

Potential fix for training getting stuck when the data loader fails. (#638)
Summary: Pull Request resolved: https://github.com/facebookresearch/Detectron/pull/638 — Potential fix for training getting stuck when the data loader fails. Reviewed By: rbgirshick. Differential Revision: D9513621. fbshipit-source-id: 123974eac83f40ef2f582a90fedea790fdc442d1
1ecd603b · Peizhao Zhang · Facebook Github Bot · c9ed587c · 1ecd603b · 1ecd603b
Commit 1ecd603b authored Aug 28, 2018 by Peizhao Zhang Committed by Facebook Github Bot Aug 28, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 4 deletions

loader.py detectron/roi_data/loader.py +3 -0

train.py detectron/utils/train.py +10 -4

No files found.
--- a/detectron/roi_data/loader.py
+++ b/detectron/roi_data/loader.py
@@ -241,6 +241,9 @@ class RoIDataLoader(object):
                    self.shutdown()
                    break

+    def should_stop(self):
+        return self.coordinator.should_stop()
+
    def shutdown(self):
        self.coordinator.request_stop()
        self.coordinator.wait_for_stop()

--- a/detectron/utils/train.py
+++ b/detectron/utils/train.py
@@ -50,7 +50,6 @@ import detectron.utils.net as nu

 def train_model():
    """Model training loop."""
-    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
@@ -61,6 +60,8 @@ def train_model():
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
+        if model.roi_data_loader.should_stop():
+            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
@@ -82,9 +83,7 @@ def train_model():
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
-            logger.critical('Loss is NaN, exiting...')
-            model.roi_data_loader.shutdown()
-            envu.exit_on_error()
+            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
@@ -94,6 +93,13 @@ def train_model():
    return checkpoints


+def handle_critical_error(model, msg):
+    logger = logging.getLogger(__name__)
+    logger.critical(msg)
+    model.roi_data_loader.shutdown()
+    raise Exception(msg)
+
+
 def create_model():
    """Build the model and look for saved model checkpoints in case we can
    resume from one.