LoRA multi-GPU training error, please help take a look #24

Open
fjchung opened this issue Aug 3, 2023 · 3 comments
fjchung commented Aug 3, 2023

Training command:
CUDA_VISIBLE_DEVICES=0,1 python train.py

Error message:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /chatglm2-dev/train.py:122 in │
│ │
│ 119 │ ) │
│ 120 │ │
│ 121 │ if train_datasets is not None: │
│ ❱ 122 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 123 │
│ 124 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:42 │
│ in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/launcher │
│ s/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 935 in _run │
│ │
│ 932 │ │ # ---------------------------- │
│ 933 │ │ # RUN THE TRAINER │
│ 934 │ │ # ---------------------------- │
│ ❱ 935 │ │ results = self._run_stage() │
│ 936 │ │ │
│ 937 │ │ # ---------------------------- │
│ 938 │ │ # POST-Training CLEAN UP │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 978 in _run_stage │
│ │
│ 975 │ │ │ with isolate_rng(): │
│ 976 │ │ │ │ self._run_sanity_check() │
│ 977 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
│ ❱ 978 │ │ │ │ self.fit_loop.run() │
│ 979 │ │ │ return None │
│ 980 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
│ 981 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:2 │
│ 01 in run │
│ │
│ 198 │ │ while not self.done: │
│ 199 │ │ │ try: │
│ 200 │ │ │ │ self.on_advance_start() │
│ ❱ 201 │ │ │ │ self.advance() │
│ 202 │ │ │ │ self.on_advance_end() │
│ 203 │ │ │ │ self._restarting = False │
│ 204 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:3 │
│ 54 in advance │
│ │
│ 351 │ │ assert self._data_fetcher is not None │
│ 352 │ │ self._data_fetcher.setup(combined_loader) │
│ 353 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
│ ❱ 354 │ │ │ self.epoch_loop.run(self._data_fetcher) │
│ 355 │ │
│ 356 │ def on_advance_end(self) -> None: │
│ 357 │ │ trainer = self.trainer │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:133 in run │
│ │
│ 130 │ │ self.on_run_start(data_fetcher) │
│ 131 │ │ while not self.done: │
│ 132 │ │ │ try: │
│ ❱ 133 │ │ │ │ self.advance(data_fetcher) │
│ 134 │ │ │ │ self.on_advance_end() │
│ 135 │ │ │ │ self._restarting = False │
│ 136 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:218 in advance │
│ │
│ 215 │ │ │ with trainer.profiler.profile("run_training_batch"): │
│ 216 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
│ 217 │ │ │ │ │ # in automatic optimization, there can only be one │
│ ❱ 218 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
│ 219 │ │ │ │ else: │
│ 220 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
│ 221 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:185 in run │
│ │
│ 182 │ │ # ------------------------------ │
│ 183 │ │ # gradient update with accumulated gradients │
│ 184 │ │ else: │
│ ❱ 185 │ │ │ self._optimizer_step(kwargs.get("batch_idx", 0), closure) │
│ 186 │ │ │
│ 187 │ │ result = closure.consume_result() │
│ 188 │ │ if result.loss is None: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:261 in _optimizer_step │
│ │
│ 258 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
│ 259 │ │ │
│ 260 │ │ # model hook │
│ ❱ 261 │ │ call._call_lightning_module_hook( │
│ 262 │ │ │ trainer, │
│ 263 │ │ │ "optimizer_step", │
│ 264 │ │ │ trainer.current_epoch, │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:142 │
│ in _call_lightning_module_hook │
│ │
│ 139 │ pl_module._current_fx_name = hook_name │
│ 140 │ │
│ 141 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
│ ❱ 142 │ │ output = fn(*args, **kwargs) │
│ 143 │ │
│ 144 │ # restore current_fx when nested context │
│ 145 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/module.py:1265 │
│ in optimizer_step │
│ │
│ 1262 │ │ │ │ │ for pg in optimizer.param_groups: │
│ 1263 │ │ │ │ │ │ pg["lr"] = lr_scale * self.learning_rate │
│ 1264 │ │ """ │
│ ❱ 1265 │ │ optimizer.step(closure=optimizer_closure) │
│ 1266 │ │
│ 1267 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
│ 1268 │ │ """Override this method to change the default behaviour of │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/optimizer.py:1 │ │ 58 in step │ │ │ │ 155 │ │ │ raise MisconfigurationException("When `optimizer.step(clos │ │ 156 │ │ │ │ 157 │ │ assert self._strategy is not None │ │ ❱ 158 │ │ step_output = self._strategy.optimizer_step(self._optimizer, c │ │ 159 │ │ │ │ 160 │ │ self._on_after_step() │ │ 161 │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:2 │ │ 59 in optimizer_step │ │ │ │ 256 │ │ │ model: reference to the model, optionally defining optimiz │ │ 257 │ │ │ **kwargs: Any extra arguments tooptimizer.step`` │
│ 258 │ │ """ │
│ ❱ 259 │ │ optimizer_output = super().optimizer_step(optimizer, closure, │
│ 260 │ │ │
│ 261 │ │ if self._model_averager is None: │
│ 262 │ │ │ return optimizer_output │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/strategy │
│ .py:224 in optimizer_step │
│ │
│ 221 │ │ model = model or self.lightning_module │
│ 222 │ │ # TODO(fabric): remove assertion once strategy's optimizer_ste │
│ 223 │ │ assert isinstance(model, pl.LightningModule) │
│ ❱ 224 │ │ return self.precision_plugin.optimizer_step(optimizer, model=m │
│ 225 │ │
│ 226 │ def _setup_model_and_optimizers(self, model: Module, optimizers: L │
│ 227 │ │ """Setup a model and multiple optimizers together. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/plugins/precision/d │
│ eepspeed.py:92 in optimizer_step │
│ │
│ 89 │ ) -> Any: │
│ 90 │ │ if isinstance(optimizer, LBFGS): │
│ 91 │ │ │ raise MisconfigurationException("DeepSpeed and the LBFGS o │
│ ❱ 92 │ │ closure_result = closure() │
│ 93 │ │ self._after_closure(model, optimizer) │
│ 94 │ │ skipped_backward = closure_result is None │
│ 95 │ │ # in manual optimization, the closure does not return a value │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:140 in __call__ │
│ │
│ 137 │ │ return step_output │
│ 138 │ │
│ 139 │ def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: │
│ ❱ 140 │ │ self._result = self.closure(*args, **kwargs) │
│ 141 │ │ return self._result.loss │
│ 142 │
│ 143 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:126 in closure │
│ │
│ 123 │ │ self._zero_grad_fn = zero_grad_fn │
│ 124 │ │
│ 125 │ def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: │
│ ❱ 126 │ │ step_output = self._step_fn() │
│ 127 │ │ │
│ 128 │ │ if step_output.closure_loss is None: │
│ 129 │ │ │ self.warning_cache.warn("training_step returned `None`. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:308 in _training_step │
│ │
│ 305 │ │ trainer = self.trainer │
│ 306 │ │ │
│ 307 │ │ # manually capture logged metrics │
│ ❱ 308 │ │ training_step_output = call._call_strategy_hook(trainer, "trai │
│ 309 │ │ self.trainer.strategy.post_training_step() │
│ 310 │ │ │
│ 311 │ │ result = self.output_result_cls.from_training_step_output(trai │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:288 │
│ in _call_strategy_hook │
│ │
│ 285 │ │ return │
│ 286 │ │
│ 287 │ with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__clas │
│ ❱ 288 │ │ output = fn(*args, **kwargs) │
│ 289 │ │
│ 290 │ # restore current_fx when nested context │
│ 291 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:3 │
│ 31 in training_step │
│ │
│ 328 │ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: │
│ 329 │ │ assert self.model is not None │
│ 330 │ │ with self.precision_plugin.train_step_context(): │
│ ❱ 331 │ │ │ return self.model(*args, **kwargs) │
│ 332 │ │
│ 333 │ def validation_step(self, *args: Any, **kwargs: Any) -> Optional[S │
│ 334 │ │ with self.precision_plugin.val_step_context(): │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/utils/nvtx.py:15 in │
│ wrapped_fn │
│ │
│ 12 │ │
│ 13 │ def wrapped_fn(*args, **kwargs): │
│ 14 │ │ get_accelerator().range_push(func.__qualname__) │
│ ❱ 15 │ │ ret_val = func(*args, **kwargs) │
│ 16 │ │ get_accelerator().range_pop() │
│ 17 │ │ return ret_val │
│ 18 │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py:1769 in │
│ forward │
│ │
│ 1766 │ │ if self.fp16_auto_cast(): │
│ 1767 │ │ │ inputs = self._cast_inputs_half(inputs) │
│ 1768 │ │ │
│ ❱ 1769 │ │ loss = self.module(*inputs, **kwargs) │
│ 1770 │ │ │
│ 1771 │ │ if self.zero_optimization_partition_weights(): │
│ 1772 │ │ │ # Disable automated discovery of external parameters │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/overrides/base.py:9 │
│ 0 in forward │
│ │
│ 87 │ │ │
│ 88 │ │ if trainer is not None: │
│ 89 │ │ │ if trainer.training: │
│ ❱ 90 │ │ │ │ output = self.forward_module.training_step(*inputs, * │
│ 91 │ │ │ │ # In manual_optimization, we need to prevent DDP reduc │
│ 92 │ │ │ │ # it is done manually in `LightningModule.manual_backw │
│ 93 │ │ │ │ # `require_backward_grad_sync` will be reset in the │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:552 in training_step │
│ │
│ 549 │ │
│ 550 │ def training_step(self, batch): │
│ 551 │ │ if isinstance(batch, dict): │
│ ❱ 552 │ │ │ outputs = self.compute_loss(**batch) │
│ 553 │ │ else: │
│ 554 │ │ │ outputs = self.compute_loss(**dict(batch)) │
│ 555 │ │ loss = outputs[0] │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:371 in compute_loss │
│ │
│ 368 │ def compute_loss(self, *args, **kwargs): │
│ 369 │ │ if len(args): │
│ 370 │ │ │ kwargs.update(dict(args)) │
│ ❱ 371 │ │ return self.model.compute_loss(**kwargs) │
│ 372 │ │
│ 373 │ def forward(self, *args, **kwargs): │
│ 374 │ │ if len(args): │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:117 in compute_loss │
│ │
│ 114 │ │ return self.model(*args, **batch) │
│ 115 │ │
│ 116 │ def compute_loss(self, *args, **batch) -> tuple: │
│ ❱ 117 │ │ return self.model(*args, **batch) │
│ 118 │ │
│ 119 │ def post_init(self): │
│ 120 │ │ return self.model.post_init() │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:953 in forward │
│ │
│ 950 │ │ use_cache = use_cache if use_cache is not None else self.conf │
│ 951 │ │ return_dict = return_dict if return_dict is not None else sel │
│ 952 │ │ │
│ ❱ 953 │ │ transformer_outputs = self.transformer( │
│ 954 │ │ │ input_ids=input_ids, │
│ 955 │ │ │ position_ids=position_ids, │
│ 956 │ │ │ attention_mask=attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:849 in forward │
│ │
│ 846 │ │ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() │
│ 847 │ │ │
│ 848 │ │ # Run encoder. │
│ ❱ 849 │ │ hidden_states, presents, all_hidden_states, all_self_attentio │
│ 850 │ │ │ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary │
│ 851 │ │ │ kv_caches=past_key_values, use_cache=use_cache, output_hi │
│ 852 │ │ ) │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:639 in forward │
│ │
│ 636 │ │ │ │
│ 637 │ │ │ layer = self._get_layer(index) │
│ 638 │ │ │ if self.gradient_checkpointing and self.training: │
│ ❱ 639 │ │ │ │ layer_ret = torch.utils.checkpoint.checkpoint( │
│ 640 │ │ │ │ │ layer, │
│ 641 │ │ │ │ │ hidden_states, │
│ 642 │ │ │ │ │ attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:235 in │
│ checkpoint │
│ │
│ 232 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(a │
│ 233 │ │
│ 234 │ if use_reentrant: │
│ ❱ 235 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 236 │ else: │
│ 237 │ │ return _checkpoint_without_reentrant( │
│ 238 │ │ │ function, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:96 in │
│ forward │
│ │
│ 93 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 94 │ │ │
│ 95 │ │ with torch.no_grad(): │
│ ❱ 96 │ │ │ outputs = run_function(*args) │
│ 97 │ │ return outputs │
│ 98 │ │
│ 99 │ @staticmethod
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:551 in forward │
│ │
│ 548 │ │ # hidden_states: [s, b, h] │
│ 549 │ │ │
│ 550 │ │ # Layer norm at the beginning of the transformer layer. │
│ ❱ 551 │ │ layernorm_output = self.input_layernorm(hidden_states) │
│ 552 │ │ # Self attention. │
│ 553 │ │ attention_output, kv_cache = self.self_attention( │
│ 554 │ │ │ layernorm_output, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:201 in forward │
│ │
│ 198 │ │ variance = hidden_states.to(torch.float32).pow(2).mean(-1, ke │
│ 199 │ │ hidden_states = hidden_states * torch.rsqrt(variance + self.e │
│ 200 │ │ │
│ ❱ 201 │ │ return (self.weight * hidden_states).to(input_dtype) │
│ 202 │
│ 203 │
│ 204 class CoreAttention(torch.nn.Module): │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least
two devices, cuda:0 and cuda:1!
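A quick illustrative check of the device placement (pl_model here is the module passed to trainer.fit in train.py; this snippet is not part of the repo):

import torch

# illustrative only: show every device that model parameters currently live on
devices = {p.device for p in pl_model.parameters()}
print(devices)   # more than one entry here matches the "two devices" RuntimeError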

The train_info_args in sft.config.py:
train_info_args = {
'devices': 2,
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb needs more storage than record)
# pretrained model path
**train_model_config,
'convert_onnx': False, # whether to export an ONNX model
'do_train': True,
'train_file': [ '/chatglm2-dev/data/finetune_train_examples.json'],
'max_epochs': 20,
'max_steps': -1,
'optimizer': 'lion', # one of [lamb,adma,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit]

'scheduler_type': 'CAWR', #one of [linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau, cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau]
'scheduler':{'T_mult': 1,
             'rewarm_epoch_num': 0.5,  # if max_epochs is not None
             # 'T_0': 50000,    # set the number of steps if max_epochs is None
             'verbose': False},

# 'scheduler_type': 'linear',# one of [linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau
# 'scheduler': None,

# switch scheduler type
# 'scheduler_type': 'WarmupCosine',
# 'scheduler': None,

# 'scheduler_type': 'ReduceLROnPlateau',
# 'scheduler': None,

# 'scheduler_type': 'Step',
# 'scheduler':{ 'decay_rate': 0.999,'decay_steps': 100,'verbose': True},

# 'scheduler_type': 'CAWR',
# 'scheduler':{'T_mult': 1, 'rewarm_epoch_num': 2, 'verbose': True},

# 'scheduler_type': 'CAL',
# 'scheduler': {'rewarm_epoch_num': 2,'verbose': True},


'optimizer_betas': (0.9, 0.999),
'train_batch_size': 1,
'eval_batch_size': 2,
'test_batch_size': 2,
'learning_rate': 2e-5,  #
'adam_epsilon': 1e-8,
'gradient_accumulation_steps': 1,
'max_grad_norm': 1.0,
'weight_decay': 0,
'warmup_steps': 0,
'output_dir': './output',
'max_seq_length': 16, # if resources allow, 2048 is recommended to match the official setting
'max_target_length': 16,  # maximum prediction length, reserved field
'use_fast_tokenizer': False,
'do_lower_case': False,

}

The relevant part of main.py:

Module configuration; LoRA is enabled by default

enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0  # 4: load_in_4bit, 8: load_in_8bit, otherwise 0

if enable_lora:
    from config.sft_config_lora import *
elif enable_ptv2:
    from config.sft_config_ptv2 import *
else:
    from config.sft_config import *

if enable_lora:
    enable_ptv2 = False
    global_args['load_in_4bit'] = load_in_bit == 4
    global_args['load_in_8bit'] = load_in_bit == 8

    if global_args['load_in_4bit']:
        global_args['quantization_config'] = None

    # check whether lora or adalora is enabled
    if 'lora' not in train_info_args and 'adalora' not in train_info_args:
        raise ValueError('please config lora or adalora')
    if train_info_args.get('lora', {}).get('with_lora', False) and train_info_args.get('adalora', {}).get('with_lora', False):
        raise Exception('lora and adalora can set one at same time !')

    train_info_args.pop('prompt', None)

elif enable_ptv2:
    enable_lora = False
    global_args['load_in_4bit'] = False
    global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
else:
    enable_ptv2 = False
    enable_lora = False
    # global_args['load_in_4bit'] = False
    # global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
    train_info_args.pop('prompt', None)

# preprocessing
if 'rwkv' in train_info_args['tokenizer_name'].lower():
    train_info_args['use_fast_tokenizer'] = True

def get_deepspeed_config():
    '''
    lora / prompt finetuning uses deepspeed_offload.json
    regular finetuning uses deepspeed.json
    '''
    # whether deepspeed is enabled
    if not enable_deepspeed:
        return None

    # choose the deepspeed config file
    is_need_update_config = False
    if enable_lora or enable_ptv2:
        is_need_update_config = True
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed_offload.json')
    else:
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed.json')

    with open(filename, mode='r', encoding='utf-8') as f:
        deepspeed_config = json.loads(f.read())

    # for lora offload, keep the optimizer settings in sync with train_info_args
    if is_need_update_config:
        optimizer = deepspeed_config.get('optimizer', None)
        if optimizer:
            optimizer['params']['betas'] = train_info_args.get('optimizer_betas', (0.9, 0.999))
            optimizer['params']['lr'] = train_info_args.get('learning_rate', 2e-5)
            optimizer['params']['eps'] = train_info_args.get('adam_epsilon', 1e-8)
    return deepspeed_config
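For context, the sync step above assumes deepspeed_offload.json defines an optimizer block with params.lr, params.betas and params.eps. A minimal sketch of the structure this code reads and overwrites (the optimizer type here is only an assumption; the actual file shipped with the repo may differ):

# illustrative only: the shape get_deepspeed_config() expects to find in deepspeed_offload.json
example_deepspeed_offload = {
    'optimizer': {
        'type': 'AdamW',            # assumption; taken from the real config file in practice
        'params': {
            'lr': 2e-5,             # overwritten from train_info_args['learning_rate']
            'betas': [0.9, 0.999],  # overwritten from train_info_args['optimizer_betas']
            'eps': 1e-8,            # overwritten from train_info_args['adam_epsilon']
        },
    },
    # ZeRO / offload settings omitted here
}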
ssbuild self-assigned this Aug 3, 2023

ssbuild (Owner) commented Aug 3, 2023

pip list | grep -E "ing|torch|deep"

ssbuild (Owner) commented Aug 4, 2023

export CUDA_DEVICE_ORDER="PCI_BUS_ID"
Then use one of the following options (a minimal sketch follows below):

  1. Set train_info_args['devices'] = [0,1,2,3]
     and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7

  2. Set train_info_args['devices'] = 4
     and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7
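A sketch of option 2 adapted to the two-GPU setup from this issue (GPU indices are only an example; the variables must be set before CUDA is initialized, ideally exported in the shell before running python train.py):

import os

# set before any CUDA initialization
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'   # the two physical GPUs to use

# and in the config: train_info_args['devices'] = 2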

fjchung (Author) commented Aug 4, 2023

export CUDA_DEVICE_ORDER="PCI_BUS_ID" 然后用一下其中之一方案

  1. 设置 train_info_args['devices'] = [0,1,2,3].
    设置环境变量 CUDA_VISIBLE_DEVICES=4,5,6,7
  2. 设置 train_info_args['devices'] = 4.
    设置环境变量 CUDA_VISIBLE_DEVICES=4,5,6,7

It still doesn't work; I get the same error as before. Full-parameter finetuning is fine, but LoRA multi-GPU training throws this error.
