use dist.barrier to synchronize (#393)

c5ca36fc · 夜阑听风 · Francisco Massa · 5f2a8263 · c5ca36fc · c5ca36fc
Commit c5ca36fc, authored Jan 30, 2019 by 夜阑听风. Committed by Francisco Massa on Jan 30, 2019.
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 3 additions and 14 deletions

comm.py maskrcnn_benchmark/utils/comm.py +1 -14

test_net.py tools/test_net.py +1 -0

train_net.py tools/train_net.py +1 -0

No files found.
--- a/maskrcnn_benchmark/utils/comm.py
+++ b/maskrcnn_benchmark/utils/comm.py
@@ -40,22 +40,9 @@ def synchronize():
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
-    rank = dist.get_rank()
    if world_size == 1:
        return
+    dist.barrier()
-    def _send_and_wait(r):
-        if rank == r:
-            tensor = torch.tensor(0, device="cuda")
-        else:
-            tensor = torch.tensor(1, device="cuda")
-        dist.broadcast(tensor, r)
-        while tensor.item() == 1:
-            time.sleep(1)
-    _send_and_wait(0)
-    # now sync on the main process
-    _send_and_wait(1)
 def all_gather(data):

--- a/tools/test_net.py
+++ b/tools/test_net.py
@@ -44,6 +44,7 @@ def main():
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
+        synchronize()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -139,6 +139,7 @@ def main():
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
+        synchronize()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)