From 052d66a2d23c06820307cfa5a7ca0122084e2ebf Mon Sep 17 00:00:00 2001
From: vfdev
Date: Thu, 3 Feb 2022 18:16:11 +0100
Subject: [PATCH] [skip ci] dist.torch.launch -> torchrun (#2450)

* [skip ci] dist.torch.launch -> torchrun

* Update config.yml
---
 .circleci/config.yml               |  4 ++--
 examples/contrib/cifar10/README.md | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c361b34b789..8a7ed6188cf 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -294,12 +294,12 @@ jobs:
             docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
 
       - run:
-          name: "Run with NCCL backend using torch dist launch"
+          name: "Run with NCCL backend using torchrun"
           command: |
             export example_path="examples/contrib/cifar10"
             # initial run
             export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
+            export test_cmd="CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
             docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
             # resume
             export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
diff --git a/examples/contrib/cifar10/README.md b/examples/contrib/cifar10/README.md
index 8f4747b7621..dcc8a0af233 100644
--- a/examples/contrib/cifar10/README.md
+++ b/examples/contrib/cifar10/README.md
@@ -52,8 +52,8 @@ If user would like to provide already downloaded dataset, the path can be setup
 Let's start training on a single node with 2 gpus:
 
 ```bash
-# using torch.distributed.launch
-python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl"
+# using torchrun
+torchrun --nproc_per_node=2 main.py run --backend="nccl"
 ```
 
 or
@@ -92,18 +92,18 @@ Let's start training on two nodes with 2 gpus each. We assuming that master node
 1. Execute on master node
 
 ```bash
-python -u -m torch.distributed.launch \
+torchrun \
     --nnodes=2 \
     --nproc_per_node=2 \
     --node_rank=0 \
-    --master_addr=master --master_port=2222 --use_env \
+    --master_addr=master --master_port=2222 \
     main.py run --backend="nccl"
 ```
 
 2. Execute on worker node
 
 ```bash
-python -u -m torch.distributed.launch \
+torchrun \
     --nnodes=2 \
     --nproc_per_node=2 \
     --node_rank=1 \
@@ -134,14 +134,14 @@ python main.py run --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_sto
 Initial training on a single node with 2 gpus with a stop on 1000 iteration (~11 epochs):
 
 ```bash
-# using torch.distributed.launch
-python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl" --stop_iteration=1000
+# using torchrun
+torchrun --nproc_per_node=2 main.py run --backend="nccl" --stop_iteration=1000
 ```
 
 Resume from the latest checkpoint
 
 ```bash
-python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl" \
+torchrun --nproc_per_node=2 main.py run --backend="nccl" \
     --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt
 ```
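
Reviewer note (not part of the patch): the mechanical rule behind every hunk above is that `torchrun` subsumes `python -m torch.distributed.launch --use_env` — it always exports `LOCAL_RANK`/`RANK`/`WORLD_SIZE` to the spawned processes, so the `--use_env` flag is dropped and the rest of the command line carries over unchanged. The sketch below restates that mapping and also shows an optional c10d-rendezvous variant of the two-node launch; the hostname `master`, port `2222`, and `main.py` invocation are taken from the README above, while the `--rdzv_id` value `cifar10` is an arbitrary illustrative job id.

```bash
# Legacy launcher: --use_env exports LOCAL_RANK instead of passing a
# --local_rank argument to the script. torchrun always uses env vars,
# so the flag is simply dropped:
python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl"
torchrun --nproc_per_node=2 main.py run --backend="nccl"

# Two-node launch: torchrun still accepts the --node_rank/--master_addr/
# --master_port flags kept in the README, or the nodes can instead
# rendezvous through the c10d backend (same command on every node):
torchrun \
    --nnodes=2 \
    --nproc_per_node=2 \
    --rdzv_backend=c10d \
    --rdzv_endpoint=master:2222 \
    --rdzv_id=cifar10 \
    main.py run --backend="nccl"
```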