LoRA multi-GPU training error, please help take a look #24

Open
fjchung opened this issue Aug 3, 2023 · 3 comments
fjchung commented Aug 3, 2023

Training command:
CUDA_VISIBLE_DEVICES=0,1 python train.py

Error message:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /chatglm2-dev/train.py:122 in │
│ │
│ 119 │ ) │
│ 120 │ │
│ 121 │ if train_datasets is not None: │
│ ❱ 122 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 123 │
│ 124 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:42 │
│ in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/launcher │
│ s/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 935 in _run │
│ │
│ 932 │ │ # ---------------------------- │
│ 933 │ │ # RUN THE TRAINER │
│ 934 │ │ # ---------------------------- │
│ ❱ 935 │ │ results = self._run_stage() │
│ 936 │ │ │
│ 937 │ │ # ---------------------------- │
│ 938 │ │ # POST-Training CLEAN UP │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 978 in _run_stage │
│ │
│ 975 │ │ │ with isolate_rng(): │
│ 976 │ │ │ │ self._run_sanity_check() │
│ 977 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
│ ❱ 978 │ │ │ │ self.fit_loop.run() │
│ 979 │ │ │ return None │
│ 980 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
│ 981 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:2 │
│ 01 in run │
│ │
│ 198 │ │ while not self.done: │
│ 199 │ │ │ try: │
│ 200 │ │ │ │ self.on_advance_start() │
│ ❱ 201 │ │ │ │ self.advance() │
│ 202 │ │ │ │ self.on_advance_end() │
│ 203 │ │ │ │ self._restarting = False │
│ 204 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:3 │
│ 54 in advance │
│ │
│ 351 │ │ assert self._data_fetcher is not None │
│ 352 │ │ self._data_fetcher.setup(combined_loader) │
│ 353 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
│ ❱ 354 │ │ │ self.epoch_loop.run(self._data_fetcher) │
│ 355 │ │
│ 356 │ def on_advance_end(self) -> None: │
│ 357 │ │ trainer = self.trainer │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:133 in run │
│ │
│ 130 │ │ self.on_run_start(data_fetcher) │
│ 131 │ │ while not self.done: │
│ 132 │ │ │ try: │
│ ❱ 133 │ │ │ │ self.advance(data_fetcher) │
│ 134 │ │ │ │ self.on_advance_end() │
│ 135 │ │ │ │ self._restarting = False │
│ 136 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:218 in advance │
│ │
│ 215 │ │ │ with trainer.profiler.profile("run_training_batch"): │
│ 216 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
│ 217 │ │ │ │ │ # in automatic optimization, there can only be one │
│ ❱ 218 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
│ 219 │ │ │ │ else: │
│ 220 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
│ 221 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:185 in run │
│ │
│ 182 │ │ # ------------------------------ │
│ 183 │ │ # gradient update with accumulated gradients │
│ 184 │ │ else: │
│ ❱ 185 │ │ │ self._optimizer_step(kwargs.get("batch_idx", 0), closure) │
│ 186 │ │ │
│ 187 │ │ result = closure.consume_result() │
│ 188 │ │ if result.loss is None: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:261 in _optimizer_step │
│ │
│ 258 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
│ 259 │ │ │
│ 260 │ │ # model hook │
│ ❱ 261 │ │ call._call_lightning_module_hook( │
│ 262 │ │ │ trainer, │
│ 263 │ │ │ "optimizer_step", │
│ 264 │ │ │ trainer.current_epoch, │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:142 │
│ in _call_lightning_module_hook │
│ │
│ 139 │ pl_module._current_fx_name = hook_name │
│ 140 │ │
│ 141 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
│ ❱ 142 │ │ output = fn(*args, **kwargs) │
│ 143 │ │
│ 144 │ # restore current_fx when nested context │
│ 145 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/module.py:1265 │
│ in optimizer_step │
│ │
│ 1262 │ │ │ │ │ for pg in optimizer.param_groups: │
│ 1263 │ │ │ │ │ │ pg["lr"] = lr_scale * self.learning_rate │
│ 1264 │ │ """ │
│ ❱ 1265 │ │ optimizer.step(closure=optimizer_closure) │
│ 1266 │ │
│ 1267 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
│ 1268 │ │ """Override this method to change the default behaviour of │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/optimizer.py:1 │ │ 58 in step │ │ │ │ 155 │ │ │ raise MisconfigurationException("When `optimizer.step(clos │ │ 156 │ │ │ │ 157 │ │ assert self._strategy is not None │ │ ❱ 158 │ │ step_output = self._strategy.optimizer_step(self._optimizer, c │ │ 159 │ │ │ │ 160 │ │ self._on_after_step() │ │ 161 │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:2 │ │ 59 in optimizer_step │ │ │ │ 256 │ │ │ model: reference to the model, optionally defining optimiz │ │ 257 │ │ │ **kwargs: Any extra arguments tooptimizer.step`` │
│ 258 │ │ """ │
│ ❱ 259 │ │ optimizer_output = super().optimizer_step(optimizer, closure, │
│ 260 │ │ │
│ 261 │ │ if self._model_averager is None: │
│ 262 │ │ │ return optimizer_output │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/strategy │
│ .py:224 in optimizer_step │
│ │
│ 221 │ │ model = model or self.lightning_module │
│ 222 │ │ # TODO(fabric): remove assertion once strategy's optimizer_ste │
│ 223 │ │ assert isinstance(model, pl.LightningModule) │
│ ❱ 224 │ │ return self.precision_plugin.optimizer_step(optimizer, model=m │
│ 225 │ │
│ 226 │ def _setup_model_and_optimizers(self, model: Module, optimizers: L │
│ 227 │ │ """Setup a model and multiple optimizers together. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/plugins/precision/d │
│ eepspeed.py:92 in optimizer_step │
│ │
│ 89 │ ) -> Any: │
│ 90 │ │ if isinstance(optimizer, LBFGS): │
│ 91 │ │ │ raise MisconfigurationException("DeepSpeed and the LBFGS o │
│ ❱ 92 │ │ closure_result = closure() │
│ 93 │ │ self._after_closure(model, optimizer) │
│ 94 │ │ skipped_backward = closure_result is None │
│ 95 │ │ # in manual optimization, the closure does not return a value │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:140 in __call__ │
│ │
│ 137 │ │ return step_output │
│ 138 │ │
│ 139 │ def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: │
│ ❱ 140 │ │ self._result = self.closure(*args, **kwargs) │
│ 141 │ │ return self._result.loss │
│ 142 │
│ 143 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:126 in closure │
│ │
│ 123 │ │ self._zero_grad_fn = zero_grad_fn │
│ 124 │ │
│ 125 │ def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: │
│ ❱ 126 │ │ step_output = self._step_fn() │
│ 127 │ │ │
│ 128 │ │ if step_output.closure_loss is None: │
│ 129 │ │ │ self.warning_cache.warn("training_step returned `None`. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:308 in _training_step │
│ │
│ 305 │ │ trainer = self.trainer │
│ 306 │ │ │
│ 307 │ │ # manually capture logged metrics │
│ ❱ 308 │ │ training_step_output = call._call_strategy_hook(trainer, "trai │
│ 309 │ │ self.trainer.strategy.post_training_step() │
│ 310 │ │ │
│ 311 │ │ result = self.output_result_cls.from_training_step_output(trai │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:288 │
│ in _call_strategy_hook │
│ │
│ 285 │ │ return │
│ 286 │ │
│ 287 │ with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__clas │
│ ❱ 288 │ │ output = fn(*args, **kwargs) │
│ 289 │ │
│ 290 │ # restore current_fx when nested context │
│ 291 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:3 │
│ 31 in training_step │
│ │
│ 328 │ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: │
│ 329 │ │ assert self.model is not None │
│ 330 │ │ with self.precision_plugin.train_step_context(): │
│ ❱ 331 │ │ │ return self.model(*args, **kwargs) │
│ 332 │ │
│ 333 │ def validation_step(self, *args: Any, **kwargs: Any) -> Optional[S │
│ 334 │ │ with self.precision_plugin.val_step_context(): │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/utils/nvtx.py:15 in │
│ wrapped_fn │
│ │
│ 12 │ │
│ 13 │ def wrapped_fn(*args, **kwargs): │
│ 14 │ │ get_accelerator().range_push(func.__qualname__) │
│ ❱ 15 │ │ ret_val = func(*args, **kwargs) │
│ 16 │ │ get_accelerator().range_pop() │
│ 17 │ │ return ret_val │
│ 18 │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py:1769 in │
│ forward │
│ │
│ 1766 │ │ if self.fp16_auto_cast(): │
│ 1767 │ │ │ inputs = self._cast_inputs_half(inputs) │
│ 1768 │ │ │
│ ❱ 1769 │ │ loss = self.module(*inputs, **kwargs) │
│ 1770 │ │ │
│ 1771 │ │ if self.zero_optimization_partition_weights(): │
│ 1772 │ │ │ # Disable automated discovery of external parameters │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/overrides/base.py:9 │
│ 0 in forward │
│ │
│ 87 │ │ │
│ 88 │ │ if trainer is not None: │
│ 89 │ │ │ if trainer.training: │
│ ❱ 90 │ │ │ │ output = self.forward_module.training_step(*inputs, * │
│ 91 │ │ │ │ # In manual_optimization, we need to prevent DDP reduc │
│ 92 │ │ │ │ # it is done manually in `LightningModule.manual_backw │
│ 93 │ │ │ │ # `require_backward_grad_sync` will be reset in the │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:552 in training_step │
│ │
│ 549 │ │
│ 550 │ def training_step(self, batch): │
│ 551 │ │ if isinstance(batch, dict): │
│ ❱ 552 │ │ │ outputs = self.compute_loss(**batch) │
│ 553 │ │ else: │
│ 554 │ │ │ outputs = self.compute_loss(**dict(batch)) │
│ 555 │ │ loss = outputs[0] │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:371 in compute_loss │
│ │
│ 368 │ def compute_loss(self, *args, **kwargs): │
│ 369 │ │ if len(args): │
│ 370 │ │ │ kwargs.update(dict(args)) │
│ ❱ 371 │ │ return self.model.compute_loss(**kwargs) │
│ 372 │ │
│ 373 │ def forward(self, *args, **kwargs): │
│ 374 │ │ if len(args): │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:117 in compute_loss │
│ │
│ 114 │ │ return self.model(*args, **batch) │
│ 115 │ │
│ 116 │ def compute_loss(self, *args, **batch) -> tuple: │
│ ❱ 117 │ │ return self.model(*args, **batch) │
│ 118 │ │
│ 119 │ def post_init(self): │
│ 120 │ │ return self.model.post_init() │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:953 in forward │
│ │
│ 950 │ │ use_cache = use_cache if use_cache is not None else self.conf │
│ 951 │ │ return_dict = return_dict if return_dict is not None else sel │
│ 952 │ │ │
│ ❱ 953 │ │ transformer_outputs = self.transformer( │
│ 954 │ │ │ input_ids=input_ids, │
│ 955 │ │ │ position_ids=position_ids, │
│ 956 │ │ │ attention_mask=attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:849 in forward │
│ │
│ 846 │ │ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() │
│ 847 │ │ │
│ 848 │ │ # Run encoder. │
│ ❱ 849 │ │ hidden_states, presents, all_hidden_states, all_self_attentio │
│ 850 │ │ │ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary │
│ 851 │ │ │ kv_caches=past_key_values, use_cache=use_cache, output_hi │
│ 852 │ │ ) │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:639 in forward │
│ │
│ 636 │ │ │ │
│ 637 │ │ │ layer = self._get_layer(index) │
│ 638 │ │ │ if self.gradient_checkpointing and self.training: │
│ ❱ 639 │ │ │ │ layer_ret = torch.utils.checkpoint.checkpoint( │
│ 640 │ │ │ │ │ layer, │
│ 641 │ │ │ │ │ hidden_states, │
│ 642 │ │ │ │ │ attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:235 in │
│ checkpoint │
│ │
│ 232 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(a │
│ 233 │ │
│ 234 │ if use_reentrant: │
│ ❱ 235 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 236 │ else: │
│ 237 │ │ return _checkpoint_without_reentrant( │
│ 238 │ │ │ function, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:96 in │
│ forward │
│ │
│ 93 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 94 │ │ │
│ 95 │ │ with torch.no_grad(): │
│ ❱ 96 │ │ │ outputs = run_function(*args) │
│ 97 │ │ return outputs │
│ 98 │ │
│ 99 │ @staticmethod
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:551 in forward │
│ │
│ 548 │ │ # hidden_states: [s, b, h] │
│ 549 │ │ │
│ 550 │ │ # Layer norm at the beginning of the transformer layer. │
│ ❱ 551 │ │ layernorm_output = self.input_layernorm(hidden_states) │
│ 552 │ │ # Self attention. │
│ 553 │ │ attention_output, kv_cache = self.self_attention( │
│ 554 │ │ │ layernorm_output, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:201 in forward │
│ │
│ 198 │ │ variance = hidden_states.to(torch.float32).pow(2).mean(-1, ke │
│ 199 │ │ hidden_states = hidden_states * torch.rsqrt(variance + self.e │
│ 200 │ │ │
│ ❱ 201 │ │ return (self.weight * hidden_states).to(input_dtype) │
│ 202 │
│ 203 │
│ 204 class CoreAttention(torch.nn.Module): │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least
two devices, cuda:0 and cuda:1!
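A quick illustrative check of the device placement (pl_model here is the module passed to trainer.fit in train.py; this snippet is not part of the repo):

import torch

# illustrative only: show every device that model parameters currently live on
devices = {p.device for p in pl_model.parameters()}
print(devices)   # more than one entry here matches the "two devices" RuntimeError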

The train_info_args in sft.config.py:
train_info_args = {
'devices': 2,
'data_backend': 'parquet', # one of record, lmdb, arrow_stream, arrow_file, parquet; very large datasets can use lmdb (note: lmdb needs more storage than record)
# pretrained model path
**train_model_config,
'convert_onnx': False, # whether to export an ONNX model
'do_train': True,
'train_file': [ '/chatglm2-dev/data/finetune_train_examples.json'],
'max_epochs': 20,
'max_steps': -1,
'optimizer': 'lion', # one of [lamb,adma,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit]

'scheduler_type': 'CAWR', #one of [linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau, cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau]
'scheduler':{'T_mult': 1,
             'rewarm_epoch_num': 0.5,  # if max_epochs is not None
             # 'T_0': 50000,    # set the number of steps if max_epochs is None
             'verbose': False},

# 'scheduler_type': 'linear',# one of [linear,WarmupCosine,CAWR,CAL,Step,ReduceLROnPlateau
# 'scheduler': None,

# switch scheduler type
# 'scheduler_type': 'WarmupCosine',
# 'scheduler': None,

# 'scheduler_type': 'ReduceLROnPlateau',
# 'scheduler': None,

# 'scheduler_type': 'Step',
# 'scheduler':{ 'decay_rate': 0.999,'decay_steps': 100,'verbose': True},

# 'scheduler_type': 'CAWR',
# 'scheduler':{'T_mult': 1, 'rewarm_epoch_num': 2, 'verbose': True},

# 'scheduler_type': 'CAL',
# 'scheduler': {'rewarm_epoch_num': 2,'verbose': True},


'optimizer_betas': (0.9, 0.999),
'train_batch_size': 1,
'eval_batch_size': 2,
'test_batch_size': 2,
'learning_rate': 2e-5,  #
'adam_epsilon': 1e-8,
'gradient_accumulation_steps': 1,
'max_grad_norm': 1.0,
'weight_decay': 0,
'warmup_steps': 0,
'output_dir': './output',
'max_seq_length': 16, # if resources allow, 2048 is recommended to match the official setting
'max_target_length': 16,  # maximum prediction length, reserved field
'use_fast_tokenizer': False,
'do_lower_case': False,

}

The relevant part of main.py:

Module configuration; LoRA is enabled by default

enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0  # 4: load_in_4bit, 8: load_in_8bit, otherwise 0

if enable_lora:
    from config.sft_config_lora import *
elif enable_ptv2:
    from config.sft_config_ptv2 import *
else:
    from config.sft_config import *

if enable_lora:
    enable_ptv2 = False
    global_args['load_in_4bit'] = load_in_bit == 4
    global_args['load_in_8bit'] = load_in_bit == 8

    if global_args['load_in_4bit']:
        global_args['quantization_config'] = None

    # check whether lora or adalora is enabled
    if 'lora' not in train_info_args and 'adalora' not in train_info_args:
        raise ValueError('please config lora or adalora')
    if train_info_args.get('lora', {}).get('with_lora', False) and train_info_args.get('adalora', {}).get('with_lora', False):
        raise Exception('lora and adalora can set one at same time !')

    train_info_args.pop('prompt', None)

elif enable_ptv2:
    enable_lora = False
    global_args['load_in_4bit'] = False
    global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
else:
    enable_ptv2 = False
    enable_lora = False
    # global_args['load_in_4bit'] = False
    # global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
    train_info_args.pop('prompt', None)

# preprocessing
if 'rwkv' in train_info_args['tokenizer_name'].lower():
    train_info_args['use_fast_tokenizer'] = True

def get_deepspeed_config():
    '''
    lora / prompt finetuning uses deepspeed_offload.json
    regular finetuning uses deepspeed.json
    '''
    # whether deepspeed is enabled
    if not enable_deepspeed:
        return None

    # choose the deepspeed config file
    is_need_update_config = False
    if enable_lora or enable_ptv2:
        is_need_update_config = True
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed_offload.json')
    else:
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed.json')

    with open(filename, mode='r', encoding='utf-8') as f:
        deepspeed_config = json.loads(f.read())

    # for lora offload, keep the optimizer settings in sync with train_info_args
    if is_need_update_config:
        optimizer = deepspeed_config.get('optimizer', None)
        if optimizer:
            optimizer['params']['betas'] = train_info_args.get('optimizer_betas', (0.9, 0.999))
            optimizer['params']['lr'] = train_info_args.get('learning_rate', 2e-5)
            optimizer['params']['eps'] = train_info_args.get('adam_epsilon', 1e-8)
    return deepspeed_config
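For context, the sync step above assumes deepspeed_offload.json defines an optimizer block with params.lr, params.betas and params.eps. A minimal sketch of the structure this code reads and overwrites (the optimizer type here is only an assumption; the actual file shipped with the repo may differ):

# illustrative only: the shape get_deepspeed_config() expects to find in deepspeed_offload.json
example_deepspeed_offload = {
    'optimizer': {
        'type': 'AdamW',            # assumption; taken from the real config file in practice
        'params': {
            'lr': 2e-5,             # overwritten from train_info_args['learning_rate']
            'betas': [0.9, 0.999],  # overwritten from train_info_args['optimizer_betas']
            'eps': 1e-8,            # overwritten from train_info_args['adam_epsilon']
        },
    },
    # ZeRO / offload settings omitted here
}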
ssbuild self-assigned this Aug 3, 2023

ssbuild (Owner) commented Aug 3, 2023

pip list | grep -E "ing|torch|deep"

ssbuild (Owner) commented Aug 4, 2023

export CUDA_DEVICE_ORDER="PCI_BUS_ID"
Then use one of the following options (a minimal sketch follows below):

  1. Set train_info_args['devices'] = [0,1,2,3]
     and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7

  2. Set train_info_args['devices'] = 4
     and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7
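A sketch of option 2 adapted to the two-GPU setup from this issue (GPU indices are only an example; the variables must be set before CUDA is initialized, ideally exported in the shell before running python train.py):

import os

# set before any CUDA initialization
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'   # the two physical GPUs to use

# and in the config: train_info_args['devices'] = 2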

fjchung (Author) commented Aug 4, 2023

export CUDA_DEVICE_ORDER="PCI_BUS_ID" 然后用一下其中之一方案

  1. 设置 train_info_args['devices'] = [0,1,2,3].
    设置环境变量 CUDA_VISIBLE_DEVICES=4,5,6,7
  2. 设置 train_info_args['devices'] = 4.
    设置环境变量 CUDA_VISIBLE_DEVICES=4,5,6,7

It still doesn't work; I get the same error as before. Full-parameter finetuning is fine, but LoRA multi-GPU training throws this error.
