Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kwargs to idist.barrier #2310

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ccedbb0
#2213 add kwargs to idist.barrier => needs more tests and documentati…
fco-dv Oct 31, 2021
b1cbd1c
rm Mapping import
fco-dv Oct 31, 2021
596b29e
#2213 pass kwargs in xla and hvdconftest, add tests
fco-dv Nov 2, 2021
beca8c9
#2213 fix tests
fco-dv Nov 2, 2021
9b4f70d
#2213 add tests on warning
fco-dv Nov 4, 2021
31488ce
#2213 fix test
fco-dv Nov 4, 2021
68393f1
#2213 update doc
fco-dv Nov 4, 2021
5aa6ade
#2213 fix docstring
fco-dv Nov 5, 2021
6419d75
#2213 refactor _check_barrier_fn_kwargs
fco-dv Nov 5, 2021
0fa0299
Doctests for `RMSE` and `MeanPairwiseDistance` (#2307)
DevPranjal Nov 1, 2021
e723e60
disabled apex tests (#2308)
KickItLikeShika Nov 1, 2021
d2403d4
refactor metrics - add doctest for psnr (#2311)
sdesrozis Nov 10, 2021
d0b85af
add doctest for nlp metrics (#2317)
sdesrozis Nov 12, 2021
9e66d46
[skip ci] add doctest for `CohenKappa` metric (#2321)
sdesrozis Nov 13, 2021
7f3a68a
[skipci] [Doctest] added contrib regression (``FractionalBias``, ``Fr…
Ishan-Kumar2 Nov 13, 2021
c7d2142
[skip ci] add doctest for regression metrics (#2324)
sdesrozis Nov 13, 2021
9d87c9d
#2313 fix bug in StateParamScheduler attach method (#2316)
fco-dv Nov 15, 2021
32515c3
add run_code_style for windows (#2329)
sdesrozis Nov 16, 2021
ed2f0ee
[skip ci] Add doctest for `LinearCyclicalScheduler` (#2327)
sdesrozis Nov 19, 2021
5dc455b
[skip ci] fix version extraction (#2331)
Priyansi Nov 19, 2021
c402cd7
Paramscheduler emahandler (#2326)
fco-dv Nov 21, 2021
b5a9ac6
#2213 refactor signature checker / update tests
fco-dv Nov 23, 2021
ab9a74d
Merge branch 'master' into kwargs_idist_barrier
fco-dv Nov 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions ignite/distributed/comp_models/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import warnings
from abc import ABCMeta, abstractmethod
from inspect import signature
from numbers import Number
from typing import Any, Callable, List, Optional, Union, cast
from typing import Any, Callable, Dict, List, Optional, Union, cast

import torch

Expand Down Expand Up @@ -275,8 +277,24 @@ def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
pass

def _check_barrier_fn_kwargs(self, barrier_fn: Callable, kwargs_dict: Dict[str, Any]) -> Dict[str, Any]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is more general than its name suggests. You did a signature checker. It could be called check_method_args and note that type could be also checked.

fn_params_name = set(
Copy link
Contributor

@sdesrozis sdesrozis Nov 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m wondering whether comprehension list would be more simple.

I think the method bind of the class Signature could help.

map(
lambda param: param.name,
filter(
lambda param: param.kind == param.POSITIONAL_OR_KEYWORD, signature(barrier_fn).parameters.values()
),
)
)
extra_keys = kwargs_dict.keys() - fn_params_name
if extra_keys:
warnings.warn(f"Extra keys : {extra_keys} will not be used by {self._backend}.")
for k in extra_keys:
del kwargs_dict[k]
return kwargs_dict

@abstractmethod
def barrier(self, **kwargs: Any) -> None:
    """Synchronize all processes.

    Args:
        kwargs: backend-specific keyword arguments forwarded to the
            underlying synchronization primitive.
    """
    pass


Expand Down Expand Up @@ -358,5 +376,5 @@ def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
return tensor

def barrier(self, **kwargs: Any) -> None:
    """No-op synchronization for the serial (single-process) model.

    Args:
        kwargs: accepted for interface compatibility; ignored.
    """
    pass
9 changes: 7 additions & 2 deletions ignite/distributed/comp_models/horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,12 @@ def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
return hvd.broadcast(tensor, root_rank=src)

def barrier(self, **kwargs: Any) -> None:
    """Synchronize all Horovod processes via a dummy allreduce.

    Args:
        kwargs: keyword arguments forwarded to ``hvd.allreduce`` after
            validation. The ``tensor`` and ``name`` arguments are fixed by
            this implementation, so user-supplied values are discarded.
    """
    kwargs = self._check_barrier_fn_kwargs(barrier_fn=hvd.allreduce, kwargs_dict=kwargs)
    # "tensor" and "name" are set explicitly below; drop user overrides to
    # avoid passing the same argument twice.
    kwargs.pop("tensor", None)
    kwargs.pop("name", None)
    # https://github.com/horovod/horovod/issues/159#issuecomment-424834603
    hvd.allreduce(tensor=torch.tensor(0, device="cpu"), name="barrier", **kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that a barrier function was recently introduced. We could use it in a next PR (and handle older version).

5 changes: 3 additions & 2 deletions ignite/distributed/comp_models/native.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,8 +432,9 @@ def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
dist.broadcast(tensor, src=src)
return tensor

def barrier(self, **kwargs: Any) -> None:
    """Synchronize all processes via ``torch.distributed.barrier``.

    Args:
        kwargs: keyword arguments forwarded to ``dist.barrier`` after
            validation against its signature (e.g. ``group``, ``async_op``,
            ``device_ids``).
    """
    kwargs = self._check_barrier_fn_kwargs(barrier_fn=dist.barrier, kwargs_dict=kwargs)
    dist.barrier(**kwargs)

def _expand_hostlist(nodelist: str) -> List[str]:
"""Expand a compressed hostlist string and returns all hosts listed.
Expand Down
7 changes: 5 additions & 2 deletions ignite/distributed/comp_models/xla.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,5 +160,8 @@ def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
xm.all_reduce("sum", [tensor,])
return tensor

def barrier(self, **kwargs: Any) -> None:
    """Synchronize all XLA processes via ``xm.rendezvous``.

    Args:
        kwargs: keyword arguments forwarded to ``xm.rendezvous`` after
            validation. The ``tag`` argument is fixed by this implementation,
            so a user-supplied value is discarded.
    """
    kwargs = self._check_barrier_fn_kwargs(barrier_fn=xm.rendezvous, kwargs_dict=kwargs)
    # "tag" is set explicitly below; drop a user override to avoid passing
    # the same argument twice.
    kwargs.pop("tag", None)
    xm.rendezvous(tag="barrier", **kwargs)
21 changes: 19 additions & 2 deletions ignite/distributed/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,13 +421,30 @@ def broadcast(
return _model.broadcast(tensor, src=src, safe_mode=safe_mode)


def barrier(**kwargs: Any) -> None:
    """Helper method to synchronize all processes.

    Args:
        kwargs: acceptable kwargs according to provided backend:

            - | "nccl" or "gloo" : ``group`` (default, GroupMember.WORLD), ``async_op`` (default, False),
              | ``device_ids`` (default, None).

            - | "horovod" : ``average`` (default, None), ``compression`` (default, Compression.none),
              | ``op`` (default, None), ``prescale_factor`` (default, 1.0), ``postscale_factor`` (default, 1.0),
              | ``process_set`` (default, global_process_set).
              | Arguments ``tensor=torch.tensor(0, device="cpu")`` and ``name="barrier"`` are redefined.

            - | "xla-tpu" : ``payload`` (default, b""), ``replicas`` (default, []).
              | Argument ``tag="barrier"`` is redefined.

    .. versionchanged:: 0.5.1
        Method now accepts ``kwargs`` for all supported backends.
    """
    # Lazily set up a real computation model if still running serially.
    if _need_to_sync and isinstance(_model, _SerialModel):
        sync(temporary=True)

    _model.barrier(**kwargs)


def set_local_rank(index: int) -> None:
Expand Down
5 changes: 3 additions & 2 deletions tests/ignite/distributed/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,15 @@ def _test(data_src, data_others, safe_mode):
idist.broadcast(None, src=0)


def _test_distrib_barrier(device, kwargs_dict=None):
    """Check that ``idist.barrier`` synchronizes ranks before an all-reduce.

    Rank 0 mutates its tensor before the barrier; after the barrier the
    all-reduced sum must reflect that mutation on every rank.

    Args:
        device: device to place the test tensor on.
        kwargs_dict: optional backend-specific kwargs forwarded to
            ``idist.barrier``.
    """
    t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
    true_res = sum(range(idist.get_world_size()))

    if idist.get_rank() == 0:
        t += 10.0

    idist.barrier(**(kwargs_dict or {}))

    tt = idist.all_reduce(t)
    assert tt.item() == true_res + 10.0
Expand Down
23 changes: 23 additions & 0 deletions tests/ignite/distributed/utils/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,29 @@ def test_idist_barrier_hvd(gloo_hvd_executor):
gloo_hvd_executor(_test_distrib_barrier, (device,), np=np, do_init=True)


@pytest.mark.distributed
@pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test_idist_barrier_kwargs_hvd(gloo_hvd_executor):
    """Barrier should accept and forward all Horovod allreduce kwargs."""
    from horovod.torch.compression import Compression
    from horovod.torch.mpi_ops import global_process_set

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    np = torch.cuda.device_count() if use_cuda else 4

    kwargs_dict = {
        "tensor": torch.tensor(0, device="cpu"),
        "average": None,
        "name": None,
        "compression": Compression.none,
        "op": None,
        "prescale_factor": 1.0,
        "postscale_factor": 1.0,
        "process_set": global_process_set,
    }
    gloo_hvd_executor(_test_distrib_barrier, (device, kwargs_dict), np=np, do_init=True)


def _test_idist_methods_overhead(ok_factor, sync_model):
import time

Expand Down
35 changes: 35 additions & 0 deletions tests/ignite/distributed/utils/test_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,41 @@ def test_idist_barrier_gloo(distributed_context_single_node_gloo):
_test_distrib_barrier(device)


@pytest.mark.distributed
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test_idist_barrier_kwargs_nccl(distributed_context_single_node_nccl):
    """Barrier should forward nccl kwargs and warn on unsupported keys."""
    from torch.distributed import GroupMember

    device = idist.device()

    kwargs_dict = dict(group=GroupMember.WORLD, async_op=False, device_ids=None)
    _test_distrib_barrier(device, kwargs_dict)

    # XLA-only keys must trigger a warning listing exactly the unused keys.
    kwargs_dict.update(tag="barrier", payload=b"", replicas=[])
    warn_pattern = r"Extra keys : \{((, )?('payload'|'replicas'|'tag')(, )?)+\} will not be used by nccl."
    with pytest.warns(UserWarning, match=warn_pattern):
        _test_distrib_barrier(device, kwargs_dict)


@pytest.mark.distributed
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
def test_idist_barrier_kwargs_gloo(distributed_context_single_node_gloo):
    """Barrier should forward gloo kwargs and warn on unsupported keys."""
    from torch.distributed import GroupMember

    device = idist.device()

    kwargs_dict = dict(group=GroupMember.WORLD, async_op=False, device_ids=None)
    _test_distrib_barrier(device, kwargs_dict)

    # XLA-only keys must trigger a warning listing exactly the unused keys.
    kwargs_dict.update(tag="barrier", payload=b"", replicas=[])
    warn_pattern = r"Extra keys : \{((, )?('payload'|'replicas'|'tag')(, )?)+\} will not be used by gloo."
    with pytest.warns(UserWarning, match=warn_pattern):
        _test_distrib_barrier(device, kwargs_dict)


def _test_idist_methods_overhead(ok_factor):
import time

Expand Down
32 changes: 30 additions & 2 deletions tests/ignite/distributed/utils/test_xla.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,28 @@ def test_idist_barrier_xla():
_test_distrib_barrier(device)


def _test_idist_barrier_xla_in_child_proc(index, kwargs_dict=None):
    """Run the barrier test inside an XLA child process.

    Args:
        index: child process index supplied by the XLA spawner (unused).
        kwargs_dict: optional backend kwargs forwarded to the barrier test.
    """
    device = idist.device()
    _test_distrib_barrier(device, kwargs_dict)


@pytest.mark.tpu
@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
def test_idist_barrier_kwargs_xla():
    """Barrier should forward xla-tpu kwargs and warn on unsupported keys."""
    device = idist.device()

    kwargs_dict = dict(tag="barrier", payload=b"", replicas=[])
    _test_distrib_barrier(device, kwargs_dict)

    from torch.distributed import GroupMember

    # Native-backend keys must trigger a warning listing the unused keys.
    kwargs_dict.update(group=GroupMember.WORLD, async_op=False, device_ids=None)
    warn_pattern = r"Extra keys : \{((, )?('async_op'|'group'|'device_ids')(, )?)+\} will not be used by xla-tpu."
    with pytest.warns(UserWarning, match=warn_pattern):
        _test_distrib_barrier(device, kwargs_dict)


@pytest.mark.tpu
Expand All @@ -197,6 +216,15 @@ def test_idist_barrier_xla_in_child_proc(xmp_executor):
xmp_executor(_test_idist_barrier_xla_in_child_proc, args=(), nprocs=n)


@pytest.mark.tpu
@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
def test_idist_barrier_kwargs_xla_in_child_proc(xmp_executor):
    """Spawn XLA workers and run the barrier test with xla-tpu kwargs."""
    nprocs = int(os.environ["NUM_TPU_WORKERS"])
    kwargs_dict = dict(tag="barrier", payload=b"", replicas=[])
    xmp_executor(_test_idist_barrier_xla_in_child_proc, args=(kwargs_dict,), nprocs=nprocs)


@pytest.mark.tpu
@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
Expand Down