load_from_checkpoint fails for model with additional required parameters (besides hparams) in model constructor on TPU with more than 1 core.
Steps to reproduce the behavior:
Define a model with an additional required parameter (besides hparams) in the model constructor, e.g. dataset.

Traceback (most recent call last):
File "train.py", line 83, in <module>
trainer.fit(model)
File "/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/trainer.py", line 721, in fit
self.load_spawn_weights(model)
File "/usr/local/lib/python3.6/dist-packages/pytorch_lightning/trainer/distrib_data_parallel.py", line 372, in load_spawn_weights
loaded_model = original_model.__class__.load_from_checkpoint(path)
File "/usr/local/lib/python3.6/dist-packages/pytorch_lightning/core/lightning.py", line 1512, in load_from_checkpoint
model = cls._load_model_state(checkpoint, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/pytorch_lightning/core/lightning.py", line 1543, in _load_model_state
model = cls(*model_args)
TypeError: __init__() missing 1 required positional argument: 'dataset'
from pytorch_lightning import Trainer
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl
class CoolSystem(pl.LightningModule):
    def __init__(self, hparams, dataset):
        super(CoolSystem, self).__init__()
        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)
        self.hparams = hparams
        # `dataset` is the extra required constructor argument that triggers the failure

    def forward(self, x):
        # called with self(x)
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_idx):
        # REQUIRED
        x, y = batch
        y_hat = self.forward(x)
        loss = F.cross_entropy(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': F.cross_entropy(y_hat, y)}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0004)

    def prepare_data(self):
        self.mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
        self.mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())

    def train_dataloader(self):
        loader = DataLoader(self.mnist_train, batch_size=32, num_workers=2)
        return loader

    def val_dataloader(self):
        loader = DataLoader(self.mnist_test, batch_size=32)
        return loader

class Dataset():
    pass

model = CoolSystem({"test_param": 2}, Dataset())
trainer = Trainer(num_tpu_cores=8, train_percent_check=0.02, val_percent_check=0.1, max_epochs=1)
trainer.fit(model)
Expected behavior: model parameters are saved and loaded correctly.
How you installed PyTorch (conda, pip, source): pip

1. upgrade to master
2. .load_from_checkpoint(PATH, dataset=YourDataset)
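A minimal sketch of that suggestion, assuming a checkpoint file exists and reusing the dummy Dataset class from the script above (the path here is hypothetical). On newer versions, extra arguments passed to load_from_checkpoint are forwarded to the model constructor (see cls(*model_args, *args, **kwargs) in the second traceback further down), so the missing dataset argument can be supplied when loading manually:

# hypothetical checkpoint path; dataset= fills the extra required constructor argument
model = CoolSystem.load_from_checkpoint("example.ckpt", dataset=Dataset())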
@williamFalcon The load_from_checkpoint call comes from self.load_spawn_weights(model), which the trainer calls automatically at the end of the training process here.

oh i see. yeah, the dataset argument in your constructor is breaking the load.
For the trainer to autoload, you have to use only hparams (put the dataset in the hparams object, which can be a dict as well; see the sketch below). The second option is to submit a PR to enable loading other params as well.
this has nothing to do with TPUs btw.
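A minimal sketch of that workaround, adapting the repro script above (the "dataset" key name is my choice):

# keep everything inside hparams so the constructor takes hparams only;
# the trainer can then autoload the checkpoint without extra arguments
class CoolSystem(pl.LightningModule):
    def __init__(self, hparams):
        super(CoolSystem, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)
        self.hparams = hparams
        self.dataset = hparams["dataset"]  # read from hparams, not a separate argument

model = CoolSystem({"test_param": 2, "dataset": Dataset()})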
Yes, in my code I've moved dataset to hparams as you suggested, but I suppose there should be some check against the original problem for future users.
I mentioned TPU because when I ran the same code on GPU and CPU runtimes, the error was not raised. Probably the recently introduced if-statement regarding the proc rank handled it.
I have a similar error. I passed in a model argument besides hparams. My personal computer (Ubuntu, 2080 Super) works fine, but the computer at the lab (CentOS, V100) reports the same error. I also found it's related to distributed_backend='ddp'; everything is fine if I set distributed_backend='dp'.
Update: if I use a single argument (combining them into a dict), I get the following error:
Traceback (most recent call last):
File "main.py", line 80, in <module>
main(args)
File "main.py", line 62, in main
trainer.fit(litmodel)
File "/home/dengcy/conda/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 753, in fit
self.load_spawn_weights(model)
File "/home/dengcy/conda/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/distrib_data_parallel.py", line 398, in load_spawn_weights
loaded_model = original_model.__class__.load_from_checkpoint(path)
File "/home/dengcy/conda/myenv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1522, in load_from_checkpoint
model = cls._load_model_state(checkpoint, *args, **kwargs)
File "/home/dengcy/conda/myenv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1552, in _load_model_state
model = cls(*model_args, *args, **kwargs)
File "main.py", line 23, in __init__
self.hparams = hparams['args']
TypeError: 'Namespace' object is not subscriptable
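The likely cause, as far as I can tell: hparams was passed as a dict at training time, but on reload this version of Lightning reconstructs hparams as an argparse.Namespace, so dict-style subscripting in __init__ fails. A minimal sketch of a defensive fix (coerce_hparams is a hypothetical helper name):

from argparse import Namespace

def coerce_hparams(hparams):
    # hypothetical helper: accept a dict (as passed at training time) or a
    # Namespace (as reconstructed by load_from_checkpoint in this version)
    if isinstance(hparams, Namespace):
        return vars(hparams)  # expose the Namespace attributes as a dict
    return hparams

# inside the model's __init__:
#     hparams = coerce_hparams(hparams)
#     self.hparams = hparams['args']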
This should be fixed by #2047.
@rzepinskip Hi, I have a similar case: I use multi-GPU and my model class (pl.LightningModule) also takes multiple init parameters. When I load the checkpoint, it raises exactly the same error. Has this been fixed?