When using NeptuneLogger with distributed_backend='ddp' on a single node with two GPUs, I get the following error:
Traceback (most recent call last):
  File "pl.py", line 146, in <module>
    main()
  File "pl.py", line 103, in main
    trainer.fit(model)
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 751, in fit
    mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
    while not context.join():
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 119, in join
    raise Exception(msg)
Exception:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
    fn(i, *args)
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_data_parallel.py", line 370, in ddp_train
    self.run_pretrain_routine(model)
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_pretrain_routine
    self.configure_checkpoint_callback()
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/trainer/callback_config.py", line 54, in configure_checkpoint_callback
    self.logger.name,
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/loggers/neptune.py", line 267, in name
    return self.experiment.name
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/pytorch_lightning/loggers/neptune.py", line 230, in experiment
    **self._kwargs)
  File "/home/hirune/anaconda3/envs/PANDA/lib/python3.7/site-packages/neptune/__init__.py", line 222, in create_experiment
    raise Uninitialized()
neptune.exceptions.Uninitialized: You must initialize neptune-client first. For more information, please visit: https://github.com/neptune-ai/neptune-client#initialize-neptune
And I found a similar error with CometLogger.
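If I read the traceback right, the problem is that the logger creates the Neptune experiment lazily: configure_checkpoint_callback() reads logger.name, which touches the experiment property, which calls neptune.create_experiment() inside the spawned worker, where neptune.init() was never run. Roughly paraphrased (this is only my reading of the traceback, not the actual library source):

# Paraphrase of the relevant part of pytorch_lightning/loggers/neptune.py,
# based on the traceback above (hypothetical names, not the real implementation).
class NeptuneLoggerSketch:
    def __init__(self, **kwargs):
        self._experiment = None
        self._kwargs = kwargs

    @property
    def experiment(self):
        if self._experiment is None:
            import neptune
            # In a ddp-spawned worker neptune.init() has not been called,
            # so this raises neptune.exceptions.Uninitialized.
            self._experiment = neptune.create_experiment(**self._kwargs)
        return self._experiment

    @property
    def name(self):
        # configure_checkpoint_callback() reads logger.name, which is what
        # triggers the lazy experiment creation in each worker.
        return self.experiment.name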
Steps to reproduce the behavior:
Run the following code on a machine with two GPUs. It is a slightly modified version of the example on this page: https://docs.neptune.ai/integrations/pytorch_lightning.html
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl
MAX_EPOCHS=20
LR=0.01
BATCHSIZE=32
CHECKPOINTS_DIR = 'my_models/checkpoints/7'
class CoolSystem(pl.LightningModule):

    def __init__(self):
        super(CoolSystem, self).__init__()
        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_idx):
        # REQUIRED
        x, y = batch
        y_hat = self.forward(x)
        loss = F.cross_entropy(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': F.cross_entropy(y_hat, y)}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'test_loss': F.cross_entropy(y_hat, y)}

    def test_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        tensorboard_logs = {'test_loss': avg_loss}
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        # REQUIRED
        # can return multiple optimizers and learning_rate schedulers
        # (LBFGS is automatically supported, no need for closure function)
        return torch.optim.Adam(self.parameters(), lr=LR)

    @pl.data_loader
    def train_dataloader(self):
        # REQUIRED
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=BATCHSIZE)

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=BATCHSIZE)

    @pl.data_loader
    def test_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()), batch_size=BATCHSIZE)
from pytorch_lightning.loggers.neptune import NeptuneLogger
def main():
    neptune_logger = NeptuneLogger(
        api_key="ANONYMOUS",
        project_name="shared/pytorch-lightning-integration",
        close_after_fit=False,
        experiment_name="default",  # Optional
        params={"max_epochs": MAX_EPOCHS,
                "batch_size": BATCHSIZE,
                "lr": LR},  # Optional
        tags=["pytorch-lightning", "mlp"]  # Optional
    )
    model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=CHECKPOINTS_DIR)

    from pytorch_lightning import Trainer
    model = CoolSystem()
    trainer = Trainer(max_epochs=MAX_EPOCHS,
                      logger=neptune_logger,
                      checkpoint_callback=model_checkpoint,
                      gpus=-1,
                      distributed_backend='ddp',
                      )
    trainer.fit(model)
    trainer.test(model)

    # Get predictions on external test
    import numpy as np
    model.freeze()
    test_loader = DataLoader(MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()), batch_size=256)
    y_true, y_pred = [], []
    for i, (x, y) in enumerate(test_loader):
        y_hat = model.forward(x).argmax(axis=1).cpu().detach().numpy()
        y = y.cpu().detach().numpy()
        y_true.append(y)
        y_pred.append(y_hat)
        if i == len(test_loader):
            break
    y_true = np.hstack(y_true)
    y_pred = np.hstack(y_pred)

    # Log additional metrics
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_true, y_pred)
    neptune_logger.experiment.log_metric('test_accuracy', accuracy)

    # Log charts
    from scikitplot.metrics import plot_confusion_matrix
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(16, 12))
    plot_confusion_matrix(y_true, y_pred, ax=ax)
    neptune_logger.experiment.log_image('confusion_matrix', fig)

    # Save checkpoints folder
    neptune_logger.experiment.log_artifact(CHECKPOINTS_DIR)

    # You can stop the experiment
    neptune_logger.experiment.stop()


if __name__ == "__main__":
    main()
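For what it's worth, the only way I've found so far to avoid the crash is to make sure neptune-client gets initialized in every process, e.g. by calling neptune.init() at module level (outside the __main__ guard), since the spawned ddp workers re-import the script. I'm not sure this is intended usage, and each worker then seems to create its own experiment, so treat it as a sketch of a workaround rather than a fix:

import neptune

# Module-level call, so every ddp-spawned worker that re-imports this script
# also initializes the client before NeptuneLogger calls create_experiment().
# Token/project mirror the values passed to NeptuneLogger above; adjust to yours.
neptune.init(api_token="ANONYMOUS",
             project_qualified_name="shared/pytorch-lightning-integration")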
Hi! Thanks for your contribution, great first issue!
@jakubczakon pls ^^
Hi @hirune924 and thanks for raising it!
I've already notified the dev team and @pitercl will get back to you once we have a solution.
@jakubczakon I assume this is a Neptune-only issue, not related to PL, right?
@hirune924 mentioned:
And I found a similar error with CometLogger.
So I think it may be a more general problem, @Borda.
Hi @hirune924, @Borda, just a quick update on this: from my initial tests, our Neptune logger and multiprocessing don't mix well. Tomorrow I'll dig a bit deeper into that and see if/how it can be remedied.
@Borda, I don't know yet whether this is more on the Neptune side or the PL side.
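To illustrate what I mean, here is a minimal, Neptune-free sketch of the mechanism (assuming the same 'spawn' start method that ddp uses): global state set up in the parent process is simply not present in the spawned workers, which is why the client looks uninitialized to them.

import torch.multiprocessing as mp

_initialized = False  # stands in for the global session that neptune.init() sets up


def init_client():
    global _initialized
    _initialized = True


def worker(rank):
    # Each spawned worker is a fresh interpreter, so it sees the module-level
    # default, not the value set by the parent process.
    print(f"rank {rank}: initialized = {_initialized}")  # prints False


if __name__ == "__main__":
    init_client()  # parent-only initialization
    print(f"parent: initialized = {_initialized}")  # prints True
    mp.spawn(worker, nprocs=2)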
Another quick update: I have a solution to this and will create a PR with a fix today or tomorrow.