Describe the bug
Training a fastai model fails in a distributed training setup. The failure occurs both when launching from the CLI (`accelerate launch train.py`) and when using `notebook_launcher`.
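For the CLI path, the launch was along these lines (a sketch; the exact config answers are from my setup):

```bash
accelerate config            # multi-GPU, single node, 2 processes
accelerate launch train.py   # train.py is a script version of the docs example
```

The notebook run produces the trace below: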
---------------------------------------------------------------------------
ProcessRaisedException Traceback (most recent call last)
File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/launchers.py:200, in notebook_launcher(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)
199 try:
--> 200 start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
201 except ProcessRaisedException as e:
File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:202, in start_processes(fn, args, nprocs, join, daemon, start_method)
201 # Loop on join until it returns True or raises an exception.
--> 202 while not context.join():
203 pass
File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:163, in ProcessContext.join(self, timeout)
162 msg += original_trace
--> 163 raise ProcessRaisedException(msg, error_index, failed_process.pid)
ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 74, in _wrap
fn(i, *args)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/utils/launch.py", line 570, in __call__
self.launcher(*args)
File "/tmp/ipykernel_94/4042077722.py", line 13, in train
learn.fine_tune(1)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 165, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 264, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 253, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 247, in _do_epoch
self._do_epoch_train()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 239, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 205, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/load.py", line 131, in __iter__
yield self.after_batch(b)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/distributed.py", line 120, in after_batch
return self.dl.after_batch(b)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 208, in __call__
def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 158, in compose_tfms
x = f(x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 81, in __call__
def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 91, in _call
return self._do_call(getattr(self, fn), x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in _do_call
res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in <genexpr>
res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 97, in _do_call
return retain_type(f(x, **kwargs), x, ret)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/dispatch.py", line 120, in __call__
return f(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/transforms.py", line 377, in encodes
def encodes(self, x:TensorImage): return (x-self.mean) / self.std
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/torch_core.py", line 382, in __torch_function__
res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_tensor.py", line 1386, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
Cell In[1], line 16
13 learn.fine_tune(1)
15 from accelerate import notebook_launcher
---> 16 notebook_launcher(train, num_processes=2)
File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/launchers.py:210, in notebook_launcher(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)
203 raise RuntimeError(
204 "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
205 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
206 "Please review your imports and test them when running the `notebook_launcher()` to identify "
207 "which one is problematic and causing CUDA to be initialized."
208 ) from e
209 else:
--> 210 raise RuntimeError(f"An issue was found when launching the training: {e}") from e
212 else:
213 # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
214 if is_mps_available():
RuntimeError: An issue was found when launching the training:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 74, in _wrap
fn(i, *args)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/utils/launch.py", line 570, in __call__
self.launcher(*args)
File "/tmp/ipykernel_94/4042077722.py", line 13, in train
learn.fine_tune(1)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 165, in fine_tune
self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 264, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 253, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 247, in _do_epoch
self._do_epoch_train()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 239, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
try: self(f'before_{event_type}'); f()
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 205, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/load.py", line 131, in __iter__
yield self.after_batch(b)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/distributed.py", line 120, in after_batch
return self.dl.after_batch(b)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 208, in __call__
def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 158, in compose_tfms
x = f(x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 81, in __call__
def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 91, in _call
return self._do_call(getattr(self, fn), x, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in _do_call
res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in <genexpr>
res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 97, in _do_call
return retain_type(f(x, **kwargs), x, ret)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/dispatch.py", line 120, in __call__
return f(*args, **kwargs)
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/transforms.py", line 377, in encodes
def encodes(self, x:TensorImage): return (x-self.mean) / self.std
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/torch_core.py", line 382, in __torch_function__
res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_tensor.py", line 1386, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
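From the trace, the failure happens inside fastai's Normalize batch transform (`(x-self.mean) / self.std`): the mean/std stats appear to live on cuda:0 while process 1's batch is on cuda:1. A possible workaround, untested and purely an inference from the error, is to pin each worker to its local GPU before any CUDA tensors are created:

```python
import os
import torch

def train():
    # LOCAL_RANK is set per worker by `accelerate launch` (and, as far as I
    # can tell, by notebook_launcher). Selecting the device first should make
    # the Normalize stats created alongside the DataLoaders land on the same
    # GPU as the batches.
    torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))
    # ... build the DataLoaders / Learner and train as in the repro below ...
```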
To Reproduce
Example from https://docs.fast.ai/distributed.html; a sketch of the notebook version I ran is below.
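This is the notebook version, reconstructed from the docs page (so line numbers may differ slightly from the trace above):

```python
from fastai.vision.all import *
from fastai.distributed import *
from accelerate import notebook_launcher

def train():
    set_seed(99, True)
    path = untar_data(URLs.PETS)/'images'
    dls = ImageDataLoaders.from_name_func(
        path, get_image_files(path), valid_pct=0.2,
        label_func=lambda x: x[0].isupper(), item_tfms=Resize(224))
    learn = vision_learner(dls, resnet34, metrics=error_rate).to_fp16()
    # distrib_ctx wraps the learner in DistributedDataParallel
    # for the duration of the block
    with learn.distrib_ctx(in_notebook=True, sync_bn=False):
        learn.fine_tune(1)

notebook_launcher(train, num_processes=2)
```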
Environment: Python 3.10, 2 x RTX6000
Requirements:
accelerate==0.27.2
fastai==2.7.14
fastbook==0.0.29
fastcore==1.5.29
fastdownload==0.0.7
fastjsonschema==2.19.1
fastprogress==1.0.3
torch==2.1.2
torchaudio==2.1.2
torchvision==0.16.2
transformers==4.37.2
triton==2.1.0
Expected behavior
Training completes on both GPUs without errors; instead it fails with the full stack trace shown above.
@muellerzr