NCCL error
ZiqianXie opened this issue
I pulled the docker image and downloaded the checkpoint. When running generate_interactive.sh, I encountered the following error:
Each worker process printed the same traceback (their output was interleaved); a single copy is shown below:

Traceback (most recent call last):
  File "megatron_lm/tools/generate_samples_gpt2.py", line 104, in <module>
    main()
  File "megatron_lm/tools/generate_samples_gpt2.py", line 89, in main
    _ = load_checkpoint(model, None, None)
  File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 183, in load_checkpoint
    load_checkpoint_new(model, optimizer, lr_scheduler)
  File "/workspace/YaLM-100B/megatron_lm/megatron/checkpointing.py", line 373, in load_checkpoint_new
    torch.distributed.barrier()
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 2709, in barrier
    work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 554) of binary: /opt/conda/bin/python3
Traceback (most recent call last):
  File "/opt/conda/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==1.8.0a0+17f8c32', 'console_scripts', 'torchrun')())
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/run.py", line 719, in main
    run(args)
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
    )(*cmd_args)
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
megatron_lm/tools/generate_samples_gpt2.py FAILED
I forgot to change MP_SIZE in the script; closing this issue.
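For anyone else who hits the same ncclInvalidUsage at torch.distributed.barrier(): a common cause is launching more worker processes on a node than there are visible GPUs, so two ranks end up on the same device. Below is a rough pre-flight check, not part of the repo, assuming MP_SIZE is the value the launch script hands to torchrun as --nproc_per_node; adjust the names to your setup.

# Rough sanity check before running generate_interactive.sh (a sketch, not from the repo;
# assumes MP_SIZE in the script maps one worker process to one GPU on the node).
import torch

MP_SIZE = 8  # hypothetical: the model-parallel size the launch script is configured with

visible = torch.cuda.device_count()
if visible < MP_SIZE:
    raise SystemExit(
        f"MP_SIZE={MP_SIZE} but only {visible} GPU(s) are visible. With fewer GPUs than "
        "ranks, several ranks share one device and torch.distributed.barrier() can fail "
        "with ncclInvalidUsage. Either run on a node with enough GPUs or change MP_SIZE "
        "(the published checkpoint is sharded, so a different MP_SIZE may also require "
        "re-partitioning the checkpoint)."
    )
print(f"{visible} GPUs visible; MP_SIZE={MP_SIZE} looks consistent.")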