[BUG]: ValueError: mutable default <class 'colossalai.legacy.tensor.distspec._DistSpec'> for field dist_attr is not allowed: use default_factory #5564

Open
fangbrodie opened this issue Apr 7, 2024 · 1 comment
Labels: bug (Something isn't working)

@fangbrodie

🐛 Describe the bug


ValueError Traceback (most recent call last)
Cell In[2], line 1
----> 1 from colossalai.booster import Booster

File ~/.local/lib/python3.11/site-packages/colossalai/booster/__init__.py:2
1 from .accelerator import Accelerator
----> 2 from .booster import Booster
3 from .plugin import Plugin

File ~/.local/lib/python3.11/site-packages/colossalai/booster/booster.py:17
15 from .accelerator import Accelerator
16 from .mixed_precision import MixedPrecision, mixed_precision_factory
---> 17 from .plugin import Plugin
18 from .plugin.pp_plugin_base import PipelinePluginBase
20 __all__ = ["Booster"]

File ~/.local/lib/python3.11/site-packages/colossalai/booster/plugin/__init__.py:1
----> 1 from .gemini_plugin import GeminiPlugin
2 from .hybrid_parallel_plugin import HybridParallelPlugin
3 from .low_level_zero_plugin import LowLevelZeroPlugin

File ~/.local/lib/python3.11/site-packages/colossalai/booster/plugin/gemini_plugin.py:31
29 from colossalai.interface import ModelWrapper, OptimizerWrapper
30 from colossalai.shardformer import ShardConfig, ShardFormer
---> 31 from colossalai.zero import GeminiDDP, GeminiOptimizer
32 from colossalai.zero.gemini.memory_tracer import MemStats
34 from .dp_plugin_base import DPPluginBase

File ~/.local/lib/python3.11/site-packages/colossalai/zero/__init__.py:1
----> 1 from .gemini import GeminiAdamOptimizer, GeminiDDP, GeminiOptimizer, get_static_torch_model
2 from .low_level import LowLevelZeroOptimizer
3 from .wrapper import zero_model_wrapper, zero_optim_wrapper

File ~/.local/lib/python3.11/site-packages/colossalai/zero/gemini/__init__.py:2
1 from .chunk import ChunkManager, TensorInfo, TensorState, search_chunk_configuration
----> 2 from .gemini_ddp import GeminiDDP
3 from .gemini_mgr import GeminiManager
4 from .gemini_optimizer import GeminiAdamOptimizer, GeminiOptimizer

File ~/.local/lib/python3.11/site-packages/colossalai/zero/gemini/gemini_ddp.py:34
31 from colossalai.utils import _cast_float, free_storage, is_ddp_ignored
33 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
---> 34 from .gemini_hook import GeminiZeROHook
35 from .gemini_mgr import GeminiManager
36 from .memory_tracer import MemStats, OrderedParamGenerator

File ~/.local/lib/python3.11/site-packages/colossalai/zero/gemini/gemini_hook.py:11
9 from colossalai.utils import is_ddp_ignored
10 from colossalai.zero.gemini import TensorState
---> 11 from colossalai.zero.gemini.gemini_mgr import GeminiManager
14 class TrainingPhase(Enum):
15 FORWARD = 0

File ~/.local/lib/python3.11/site-packages/colossalai/zero/gemini/gemini_mgr.py:9
7 from .chunk import Chunk, ChunkManager
8 from .memory_tracer import ChunkMemStatsCollector, MemStats
----> 9 from .placement_policy import PlacementPolicyFactory
12 class GeminiManager:
13 """
14 Stateful Tensor Manager, inspired from PatrickStar
15
(...)
24 memstats (MemStats, optional): a mem stats collected by a runtime mem tracer. if None then GeminiManager will collect it during a warmup iteration.
25 """

File ~/.local/lib/python3.11/site-packages/colossalai/zero/gemini/placement_policy.py:10
7 import torch
9 from colossalai.accelerator import get_accelerator
---> 10 from colossalai.legacy.utils.memory import colo_device_memory_capacity
11 from colossalai.zero.gemini.chunk import Chunk
13 from .chunk import Chunk, ChunkManager

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/__init__.py:1
----> 1 from .initialize import (
2 get_default_parser,
3 initialize,
4 launch,
5 launch_from_openmpi,
6 launch_from_slurm,
7 launch_from_torch,
8 )
10 __all__ = [
11 "launch",
12 "launch_from_openmpi",
(...)
16 "get_default_parser",
17 ]

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/initialize.py:21
19 from colossalai.context import Config, ConfigException
20 from colossalai.interface import OptimizerWrapper
---> 21 from colossalai.legacy.amp import AMP_TYPE, convert_to_amp
22 from colossalai.legacy.amp.naive_amp import NaiveAMPModel
23 from colossalai.legacy.builder.builder import build_gradient_handler

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/amp/__init__.py:11
8 from colossalai.context import Config
10 from .amp_type import AMP_TYPE
---> 11 from .apex_amp import convert_to_apex_amp
12 from .naive_amp import convert_to_naive_amp
13 from .torch_amp import convert_to_torch_amp

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/amp/apex_amp/__init__.py:4
1 import torch.nn as nn
2 from torch.optim import Optimizer
----> 4 from .apex_amp import ApexAMPOptimizer
7 def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
8 r"""A helper function to wrap training components with Apex AMP modules
9
10 Args:
(...)
34 More details about ``amp_config`` refer to `amp_config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
35 """

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/amp/apex_amp/apex_amp.py:14
11 from torch import Tensor
13 from colossalai.interface import OptimizerWrapper
---> 14 from colossalai.legacy.utils import clip_grad_norm_fp32
17 class ApexAMPOptimizer(OptimizerWrapper):
18 """A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm
19 methods
20 """

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/utils/init.py:1
----> 1 from .checkpointing import load_checkpoint, save_checkpoint
2 from .common import (
3 clip_grad_norm_fp32,
4 copy_tensor_parallel_attributes,
(...)
16 sync_model_param,
17 )
18 from .data_sampler import DataParallelSampler, get_dataloader

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/utils/checkpointing.py:16
13 except ImportError:
14 _EXTRA_STATE_KEY_SUFFIX = "_extra_state"
---> 16 from .common import is_using_pp
18 __all__ = ["save_checkpoint", "load_checkpoint"]
21 def broadcast_state_dict(state_dict, parallel_mode):

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/utils/common.py:16
14 from colossalai.legacy.core import global_context as gpc
15 from colossalai.legacy.global_variables import tensor_parallel_env as env
---> 16 from colossalai.legacy.tensor import ProcessGroup
17 from colossalai.tensor import ColoParameter
18 from colossalai.utils.multi_tensor_apply import multi_tensor_applier

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/tensor/__init__.py:6
4 from .distspec import ReplicaSpec, ShardSpec
5 from .process_group import ProcessGroup
----> 6 from .tensor_spec import ColoTensorSpec
8 __all__ = [
9 "ComputePattern",
10 "ComputeSpec",
(...)
16 "ReplicaSpec",
17 ]

File ~/.local/lib/python3.11/site-packages/colossalai/legacy/tensor/tensor_spec.py:10
5 from colossalai.legacy.tensor.process_group import ProcessGroup
7 from .compute_spec import ComputeSpec
---> 10 @dataclass
11 class ColoTensorSpec:
12 """ColoTensorSpec
13
14 A data class for specifications of the ColoTensor.
15 It contains attributes of ProcessGroup, _DistSpec, ComputeSpec.
16 The latter two attributes are optional. If not set, they are default value is Replicate() and None.
17 """
19 pg: ProcessGroup

File /usr/local/lib/python3.11/dataclasses.py:1230, in dataclass(cls, init, repr, eq, order, unsafe_hash, frozen, match_args, kw_only, slots, weakref_slot)
1227 return wrap
1229 # We're called as @dataclass without parens.
-> 1230 return wrap(cls)

File /usr/local/lib/python3.11/dataclasses.py:1220, in dataclass.<locals>.wrap(cls)
1219 def wrap(cls):
-> 1220 return _process_class(cls, init, repr, eq, order, unsafe_hash,
1221 frozen, match_args, kw_only, slots,
1222 weakref_slot)

File /usr/local/lib/python3.11/dataclasses.py:958, in _process_class(cls, init, repr, eq, order, unsafe_hash, frozen, match_args, kw_only, slots, weakref_slot)
955 kw_only = True
956 else:
957 # Otherwise it's a field of some type.
--> 958 cls_fields.append(_get_field(cls, name, type, kw_only))
960 for f in cls_fields:
961 fields[f.name] = f

File /usr/local/lib/python3.11/dataclasses.py:815, in _get_field(cls, a_name, a_type, default_kw_only)
811 # For real fields, disallow mutable defaults. Use unhashable as a proxy
812 # indicator for mutability. Read the hash attribute from the class,
813 # not the instance.
814 if f._field_type is _FIELD and f.default.__class__.__hash__ is None:
--> 815 raise ValueError(f'mutable default {type(f.default)} for field '
816 f'{f.name} is not allowed: use default_factory')
818 return f

ValueError: mutable default <class 'colossalai.legacy.tensor.distspec._DistSpec'> for field dist_attr is not allowed: use default_factory

Environment

Python 3.11, CUDA 12.2, nvidia-cudnn-cu12==8.9.2.2, nvidia-nccl-cu12==2.18.1; colossalai 0.3.6, PyTorch 2.1.1
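
Note on the root cause: Python 3.11 tightened the dataclass default check from rejecting only list/dict/set defaults to rejecting any unhashable default, i.e. anything whose class sets __hash__ to None (which happens automatically when a class defines __eq__ without __hash__, as _DistSpec appears to). That would explain why the import only fails under Python 3.11. A minimal sketch reproducing the same error, with a hypothetical Spec class standing in for _DistSpec:

```python
from dataclasses import dataclass


class Spec:
    """Hypothetical stand-in for _DistSpec: defining __eq__ without __hash__
    sets __hash__ to None, so the dataclass machinery treats it as mutable."""

    def __eq__(self, other):
        return isinstance(other, Spec)


try:
    @dataclass
    class TensorSpecLike:  # hypothetical stand-in for ColoTensorSpec
        dist_attr: Spec = Spec()  # shared class-level instance default
except ValueError as err:
    # On Python 3.11+ this prints:
    # mutable default <class '__main__.Spec'> for field dist_attr is not allowed: use default_factory
    print(err)
```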

fangbrodie added the bug label on Apr 7, 2024
@flybird11111
Contributor

Hi, this issue has been fixed: https://github.com/hpcaitech/ColossalAI/pull/5440/files
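
For anyone who cannot upgrade right away, the usual fix for this class of error is to give the field a default_factory so each dataclass instance builds its own object instead of sharing one class-level instance. A hedged sketch of the pattern, continuing the hypothetical Spec stand-in above (it mirrors the dist_attr field named in the traceback, not the exact diff in the linked PR):

```python
from dataclasses import dataclass, field
from typing import Optional


class Spec:
    """Hypothetical stand-in for _DistSpec (unhashable because __eq__ is defined)."""

    def __eq__(self, other):
        return isinstance(other, Spec)


@dataclass
class TensorSpecLike:
    # field(default_factory=Spec) builds a fresh Spec per instance, which
    # satisfies Python 3.11's stricter dataclass default check.
    dist_attr: Optional[Spec] = field(default_factory=Spec)
    compute_attr: Optional[object] = None


print(TensorSpecLike().dist_attr)  # each instance gets its own Spec
```

Upgrading to a ColossalAI release that includes the linked fix should make any local workaround unnecessary.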
