[DeepSpeedExamples/training/HelloDeepSpeed] Fail to run if onnxruntime-training is installed #1386

Open
wschin opened this issue Apr 15, 2024 · 0 comments
Labels
bug Something isn't working

Comments


wschin commented Apr 15, 2024

Repro: run `bash run.sh` from `DeepSpeedExamples/training/HelloDeepSpeed`.

Error:

```
root@9824d79a444b:/home/DeepSpeedExamples/training/HelloDeepSpeed# sh run_ds.sh
[2024-04-15 21:59:04,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Traceback (most recent call last):
  File "/usr/local/bin/deepspeed", line 3, in <module>
    from deepspeed.launcher.runner import main
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 25, in <module>
    from . import ops
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/__init__.py", line 6, in <module>
    from . import adam
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/__init__.py", line 6, in <module>
    from .cpu_adam import DeepSpeedCPUAdam
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/cpu_adam.py", line 8, in <module>
    from deepspeed.utils import logger
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/__init__.py", line 10, in <module>
    from .groups import *
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/groups.py", line 28, in <module>
    from deepspeed import comm as dist
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/__init__.py", line 7, in <module>
    from .comm import *
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 31, in <module>
    from deepspeed.comm.ccl import CCLBackend
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/ccl.py", line 12, in <module>
    from .torch import TorchBackend
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 80, in <module>
    class TorchBackend(Backend):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 105, in TorchBackend
    def get_all_gather_function(self):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/compiler.py", line 21, in disable
    return torch.compiler.disable(func)
  File "/home/pytorch/torch/compiler/__init__.py", line 96, in disable
    import torch._dynamo
  File "/home/pytorch/torch/_dynamo/__init__.py", line 2, in <module>
    from . import convert_frame, eval_frame, resume_execution
  File "/home/pytorch/torch/_dynamo/convert_frame.py", line 41, in <module>
    from . import config, exc, trace_rules
  File "/home/pytorch/torch/_dynamo/trace_rules.py", line 51, in <module>
    from .variables import (
  File "/home/pytorch/torch/_dynamo/variables/__init__.py", line 38, in <module>
    from .higher_order_ops import (
  File "/home/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 13, in <module>
    import torch.onnx.operators
  File "/home/pytorch/torch/onnx/__init__.py", line 61, in <module>
    from ._internal.onnxruntime import (
  File "/home/pytorch/torch/onnx/_internal/onnxruntime.py", line 37, in <module>
    import onnxruntime # type: ignore[import]
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/__init__.py", line 54, in <module>
    from onnxruntime.capi import onnxruntime_validation
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 145, in <module>
    has_ortmodule, package_name, version, cuda_version = validate_build_package_info()
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 140, in validate_build_package_info
    raise import_ortmodule_exception
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/capi/onnxruntime_validation.py", line 70, in validate_build_package_info
    from onnxruntime.training.ortmodule import ORTModule # noqa: F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/__init__.py", line 26, in <module>
    from .ortmodule import ORTModule # noqa: F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/__init__.py", line 132, in <module>
    from .ortmodule import ORTModule # noqa: E402, F401
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/ortmodule.py", line 8, in <module>
    from ._torch_module_factory import TorchModuleFactory
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_factory.py", line 8, in <module>
    from ._torch_module_ort import TorchModuleORT
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_torch_module_ort.py", line 13, in <module>
    from ._graph_execution_manager_factory import GraphExecutionManagerFactory
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager_factory.py", line 10, in <module>
    from ._inference_manager import InferenceManager
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_inference_manager.py", line 17, in <module>
    from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/ortmodule/_graph_execution_manager.py", line 23, in <module>
    from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/__init__.py", line 19, in <module>
    from ._zero_offload_subscriber import ZeROOffloadSubscriber, configure_ort_compatible_zero_stage3
  File "/usr/local/lib/python3.10/dist-packages/onnxruntime/training/utils/hooks/_zero_offload_subscriber.py", line 141, in <module>
    from deepspeed.runtime.zero.parameter_offload import * # noqa: F403
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/__init__.py", line 6, in <module>
    from .partition_parameters import ZeroParamType
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py", line 22, in <module>
    from .linear import zero3_linear_wrap
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/linear.py", line 25, in <module>
    from deepspeed.runtime.utils import noop_decorator
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/utils.py", line 12, in <module>
    from deepspeed.moe.utils import is_moe_param
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/utils.py", line 12, in <module>
    from .layer import MoE
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/layer.py", line 14, in <module>
    from .sharded_moe import MOELayer, TopKGate
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 96, in <module>
    class _AllToAll(torch.autograd.Function):
  File "/usr/local/lib/python3.10/dist-packages/deepspeed/moe/sharded_moe.py", line 99, in _AllToAll
    def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore
AttributeError: partially initialized module 'deepspeed.comm' has no attribute 'ProcessGroup' (most likely due to a circular import)
```
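For context, the AttributeError above is the standard symptom of a circular import: `deepspeed.comm` is still executing its own `__init__` when the import chain (torch.compiler.disable → torch._dynamo → torch.onnx → onnxruntime → onnxruntime.training.ortmodule → deepspeed.runtime.zero → deepspeed.moe.sharded_moe) re-enters DeepSpeed and evaluates the `dist.ProcessGroup` annotation, so the attribute does not exist yet on the half-initialized module. A minimal two-module sketch of the same failure mode (hypothetical package names `pkg_a`/`pkg_b`, not the actual DeepSpeed code):

```python
# pkg_a/__init__.py
from pkg_b import helper   # starts importing pkg_b before pkg_a finishes initializing
ProcessGroup = object()    # never reached while the circular import is in flight

# pkg_b/__init__.py
import pkg_a                  # pkg_a is already in sys.modules, but only partially initialized
helper = pkg_a.ProcessGroup
# AttributeError: partially initialized module 'pkg_a' has no attribute 'ProcessGroup'
# (most likely due to a circular import)
```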

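The workaround applied next is simply removing the ONNX Runtime training package; a minimal sketch using the standard pip command (adjust if onnxruntime-training was installed some other way):

```bash
# Remove the training build of ONNX Runtime; the circular import above
# only triggers when onnxruntime-training is installed.
pip uninstall -y onnxruntime-training
```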
After uninstalling onnxruntime-training, the example can run, but it then hits another error:

```
[rank7]: Traceback (most recent call last):
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 159, in <module>
[rank7]:     train_pipe(args)
[rank7]:   File "/home/DeepSpeedExamples/training/pipeline_parallelism/train.py", line 139, in train_pipe
[rank7]:     engine, _, _, _ = deepspeed.initialize(
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 196, in initialize
[rank7]:     engine = PipelineEngine(args=args,
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/pipe/engine.py", line 69, in __init__
[rank7]:     super().__init__(*super_args, **super_kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 262, in __init__
[rank7]:     self._configure_distributed_model(model)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1157, in _configure_distributed_model
[rank7]:     self._broadcast_model()
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1077, in _broadcast_model
[rank7]:     dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 224, in broadcast
[rank7]:     return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/_dynamo/eval_frame.py", line 410, in _fn
[rank7]:     return fn(*args, **kwargs)
[rank7]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 185, in broadcast
[rank7]:     return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank7]:   File "/home/pytorch/torch/distributed/c10d_logger.py", line 78, in wrapper
[rank7]:     return func(*args, **kwargs)
[rank7]:   File "/home/pytorch/torch/distributed/distributed_c10d.py", line 2144, in broadcast
[rank7]:     work = group.broadcast([tensor], opts)
[rank7]: torch.distributed.DistBackendError: NCCL error in: /home/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2028, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.20.5
[rank7]: ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
[rank7]: Last error:
[rank7]: Error while creating shared memory segment /dev/shm/nccl-JFI57y (size 5767520)
```

Ranks 0 through 6 fail with the identical traceback and the same NCCL shared-memory error; only the random /dev/shm/nccl-* segment name differs in each rank's message.
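This second failure is independent of onnxruntime: NCCL cannot create its shared-memory segments under /dev/shm, which inside a container often means the shared-memory allocation (Docker's default is 64 MB) is too small or already exhausted once all eight ranks start. A hedged sketch of the usual diagnosis and workarounds, assuming the run happens inside Docker:

```bash
# Re-run with NCCL debug output, as the error message itself suggests
NCCL_DEBUG=INFO bash run.sh

# Common fixes when /dev/shm is the problem:
docker run --shm-size=8g ...        # start the container with a larger /dev/shm
NCCL_SHM_DISABLE=1 bash run.sh      # or disable NCCL's shared-memory transport as a fallback
```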

wschin added the bug (Something isn't working) label on Apr 15, 2024