RuntimeError: torch.cat(): expected a non-empty list of Tensors #18

Open
evanweiguohua opened this issue Jul 13, 2023 · 1 comment

@evanweiguohua

╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/dev/project/chatglm2_finetuning/train.py:118 in &lt;module&gt;                 │
│ │
│ 115 │ ) │
│ 116 │ │
│ 117 │ if train_datasets is not None: │
│ ❱ 118 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 119 │
│ 120 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/call.py:42 in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/launchers/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:911 in _run │
│ │
│ 908 │ │ self._logger_connector.reset_metrics() │
│ 909 │ │ │
│ 910 │ │ # strategy will configure model and move it to the device │
│ ❱ 911 │ │ self.strategy.setup(self) │
│ 912 │ │ │
│ 913 │ │ # hook │
│ 914 │ │ if self.state.fn == TrainerFn.FITTING: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:344 in setup │
│ │
│ 341 │ │ self.setup_optimizers(trainer) │
│ 342 │ │ self.setup_precision_plugin() │
│ 343 │ │ _optimizers_to_device(self.optimizers, self.root_device) │
│ ❱ 344 │ │ self.init_deepspeed() │
│ 345 │ │ self.barrier() │
│ 346 │ │
│ 347 │ def _init_deepspeed_distributed(self) -> None: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:448 in init_deepspeed │
│ │
│ 445 │ │ model = _LightningModuleWrapperBase(forward_module=self.model) │
│ 446 │ │ │
│ 447 │ │ if self.lightning_module.trainer and self.lightning_module.tra │
│ ❱ 448 │ │ │ self._initialize_deepspeed_train(model) │
│ 449 │ │ else: │
│ 450 │ │ │ self._initialize_deepspeed_inference(model) │
│ 451 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:484 in _initialize_deepspeed_train │
│ │
│ 481 │ │ │ if lr_scheduler is not None: │
│ 482 │ │ │ │ scheduler = lr_scheduler.scheduler │
│ 483 │ │ │
│ ❱ 484 │ │ model, deepspeed_optimizer = self._setup_model_and_optimizer(m │
│ 485 │ │ self._set_deepspeed_activation_checkpointing() │
│ 486 │ │ │
│ 487 │ │ # although we set these here, deepspeed manages the specific o │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:413 in _setup_model_and_optimizer │
│ │
│ 410 │ │ import deepspeed │
│ 411 │ │ │
│ 412 │ │ model_parameters = filter(lambda p: p.requires_grad, model.par │
│ ❱ 413 │ │ deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initia │
│ 414 │ │ │ args=argparse.Namespace(device_rank=self.root_device.index │
│ 415 │ │ │ config=self.config, │
│ 416 │ │ │ model=model, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/__init │
│ __.py:165 in initialize                                                      │
│ │
│ 162 │ │ │ │ │ │ │ │ │ │ config=config, │
│ 163 │ │ │ │ │ │ │ │ │ │ config_class=config_class) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ engine = DeepSpeedEngine(args=args, │
│ 166 │ │ │ │ │ │ │ │ │ model=model, │
│ 167 │ │ │ │ │ │ │ │ │ optimizer=optimizer, │
│ 168 │ │ │ │ │ │ │ │ │ model_parameters=model_parameters │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:309 in __init__                                                   │
│ │
│ 306 │ │ │ model_parameters = list(model_parameters) │
│ 307 │ │ │
│ 308 │ │ if has_optimizer: │
│ ❱ 309 │ │ │ self._configure_optimizer(optimizer, model_parameters) │
│ 310 │ │ │ self._configure_lr_scheduler(lr_scheduler) │
│ 311 │ │ │ self._report_progress(0) │
│ 312 │ │ elif self.zero_optimization(): │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:1184 in _configure_optimizer │
│ │
│ 1181 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │
│ 1182 │ │ │
│ 1183 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │
│ ❱ 1184 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │
│ 1185 │ │ elif optimizer_wrapper == AMP: │
│ 1186 │ │ │ amp_params = self.amp_params() │
│ 1187 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:1419 in _configure_zero_optimizer │
│ │
│ 1416 │ │ │ │ if overlap_comm: │
│ 1417 │ │ │ │ │ logger.warning("Pipeline parallelism does not sup │
│ 1418 │ │ │ │ │ overlap_comm = False │
│ ❱ 1419 │ │ │ optimizer = DeepSpeedZeroOptimizer( │
│ 1420 │ │ │ │ optimizer, │
│ 1421 │ │ │ │ self.param_names, │
│ 1422 │ │ │ │ timers=timers, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /zero/stage_1_and_2.py:312 in __init__                                       │
│ │
│ 309 │ │ │ │
│ 310 │ │ │ # create flat buffer in CPU and move to GPU │
│ 311 │ │ │ self.bit16_groups_flat.append( │
│ ❱ 312 │ │ │ │ self.flatten_dense_tensors_aligned( │
│ 313 │ │ │ │ │ self.round_robin_bit16_groups[i], │
│ 314 │ │ │ │ │ self.nccl_start_alignment_factor * dist.get_world │
│ 315 │ │ │ │ │ │ get_accelerator().current_device_name())) │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /zero/stage_1_and_2.py:829 in flatten_dense_tensors_aligned │
│ │
│ 826 │ │
│ 827 │ # create a flat tensor aligned at the alignment boundary │
│ 828 │ def flatten_dense_tensors_aligned(self, tensor_list, alignment): │
│ ❱ 829 │ │ return self.flatten(align_dense_tensors(tensor_list, alignmen │
│ 830 │ │
│ 831 │ ############### Independent Partition Gradient ################## │
│ 832 │ def reduce_independent_p_g_buckets_and_remove_grads(self, param, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/torch/_utils.py:4 │
│ 59 in _flatten_dense_tensors │
│ │
│ 456 │ Returns: │
│ 457 │ │ A contiguous 1D buffer containing input tensors. │
│ 458 │ """ │
│ ❱ 459 │ return torch._C._nn.flatten_dense_tensors(tensors) │
│ 460 │
│ 461 │
│ 462 def _flatten_sparse_tensors(tensors): │
╰──────────────────────────────────────────────────────────────────────────────╯
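
For context, the last frame fails when DeepSpeed's ZeRO optimizer tries to flatten an empty parameter group. A minimal sketch of the same failure mode, assuming every base-model weight has been frozen (as LoRA / p-tuning v2 fine-tuning does), so the `requires_grad` filter in `_setup_model_and_optimizer` yields no tensors:

```python
import torch

# Simulate a model whose parameters are all frozen (requires_grad=False),
# which is what LoRA / p-tuning v2 setups do to the base model.
model = torch.nn.Linear(4, 4)
for p in model.parameters():
    p.requires_grad = False

# DeepSpeed filters parameters by requires_grad before building the ZeRO
# optimizer; with everything frozen, the resulting list is empty.
trainable = [p for p in model.parameters() if p.requires_grad]
assert len(trainable) == 0

# Flattening an empty tensor list raises the same error as in the traceback:
# RuntimeError: torch.cat(): expected a non-empty list of Tensors
torch._C._nn.flatten_dense_tensors(trainable)
```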

@ssbuild (Owner) commented Jul 13, 2023

if enable_lora or enable_ptv2:

ptv2 has been changed to use deepspeed_offload.json
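
A minimal sketch of the config selection that reply describes (function and file names are illustrative and may not match the repository's actual code):

```python
# Hypothetical sketch of the fix described above; names are assumptions,
# only the branch condition and deepspeed_offload.json come from the reply.
def pick_deepspeed_config(enable_lora: bool, enable_ptv2: bool) -> str:
    if enable_lora or enable_ptv2:
        # LoRA / p-tuning v2 runs now use the offload config
        return "deepspeed_offload.json"
    return "deepspeed.json"
```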
