I've got error below when runnning trainer.fit(model) with DDP
How can i fix this ?
VISIBLE GPUS: '0,1'
gpu available: True, used: True
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/models/trainer.py:545: UserWarning:
You requested 2s GPUs but launched 0s slurm tasks.
We will launch 2s processes for you.
We recommend you let slurm manage the processes by setting:
--ntasks-per-node=2s
If you're not using SLURM, ignore this message!
warnings.warn(msg)
VISIBLE GPUS: '0,1'
gpu available: True, used: True
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/models/trainer.py:545: UserWarning:
You requested 2s GPUs but launched 0s slurm tasks.
We will launch 2s processes for you.
We recommend you let slurm manage the processes by setting:
--ntasks-per-node=2s
If you're not using SLURM, ignore this message!
warnings.warn(msg)
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
exitcode = _main(fd)
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 114, in _main
prepare(preparation_data)
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 277, in _fixup_main_from_path
run_name="__mp_main__")
File "/opt/conda/lib/python3.7/runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "/opt/conda/lib/python3.7/runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/jovyan/workspace/pl-practice.py", line 63, in <module>
trainer.fit(model)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/models/trainer.py", line 546, in fit
mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in spawn
process.start()
File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 112, in start
self._popen = self._Popen(self)
File "/opt/conda/lib/python3.7/multiprocessing/context.py", line 284, in _Popen
return Popen(process_obj)
File "/opt/conda/lib/python3.7/multiprocessing/popen_spawn_posix.py", line 32, in __init__
super().__init__(process_obj)
File "/opt/conda/lib/python3.7/multiprocessing/popen_fork.py", line 20, in __init__
self._launch(process_obj)
File "/opt/conda/lib/python3.7/multiprocessing/popen_spawn_posix.py", line 42, in _launch
prep_data = spawn.get_preparation_data(process_obj._name)
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "/opt/conda/lib/python3.7/multiprocessing/spawn.py", line 136, in _check_not_importing_main
is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
Traceback (most recent call last):
File "pl-practice.py", line 63, in <module>
trainer.fit(model)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/models/trainer.py", line 546, in fit
mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 167, in spawn
while not spawn_context.join():
File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 108, in join
(error_index, exitcode)
Exception: process 0 terminated with exit code 1
Please paste a code snippet if your question requires it!
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from pytorch_lightning import Trainer
import pytorch_lightning as pl
class CoolSystem(pl.LightningModule):
def __init__(self):
super(CoolSystem, self).__init__()
# not the best model...
self.l1 = torch.nn.Linear(28 * 28, 10)
self.batchsize = 256
def forward(self, x):
return torch.relu(self.l1(x.view(x.size(0), -1)))
def training_step(self, batch, batch_nb):
# REQUIRED
x, y = batch
y_hat = self.forward(x)
return {'loss': F.cross_entropy(y_hat, y)}
def validation_step(self, batch, batch_nb):
# OPTIONAL
x, y = batch
y_hat = self.forward(x)
return {'val_loss': F.cross_entropy(y_hat, y)}
def validation_end(self, outputs):
# OPTIONAL
avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
return {'avg_val_loss': avg_loss}
def configure_optimizers(self):
# REQUIRED
# can return multiple optimizers and learning_rate schedulers
return torch.optim.Adam(self.parameters(), lr=0.02)
@pl.data_loader
def tng_dataloader(self):
# REQUIRED
return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=self.batchsize)
@pl.data_loader
def val_dataloader(self):
# OPTIONAL
return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=self.batchsize)
@pl.data_loader
def test_dataloader(self):
# OPTIONAL
return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=self.batchsize)
model = CoolSystem()
trainer = Trainer(max_nb_epochs=1,
gpus=[0,1], distributed_backend='ddp')
trainer.fit(model)
i've tried running script above on terminal of jupyterlab on Docker.
When distributed_backend='dp', it works well.
@wdy06 distributed data parallel inits a process for each GPU. but there's no guarantee what happens with docker. you probably have to configure docker so it allows nb_gpus processes
@williamFalcon Thank you for your response! I will try various things with that.
@wdy06 I too am facing the same issue with DDP in a JupyterHub+Docker environment; But without using the PyTorch Lighning package. My DDP script seems to run smoothly in the docker container and only in the Notebook environment, it throws the error. Let me know if you have found any solution to this problem. Thanks!
Try this:
if __name__ == '__main__':
trainer.fit(model)
Most helpful comment
Try this: