The metric val_loss is not found by ReduceLROnPlateau or by the progress bar display. However, printing val_loss inside validation_step and validation_epoch_end works fine and shows the expected Tensor value.
class MyModel(pl.LightningModule):
    def __init__(self, train_df, val_df, test_df, hparams=Namespace(lr=0.02)):
        # Initialization
        super(MyModel, self).__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.hparams = hparams

        # Model structure
        backbone = models.resnet18(pretrained=False)
        self.features_extractor = torch.nn.Sequential(*list(backbone.children())[:-1])
        self.fc = torch.nn.Sequential(*[
            torch.nn.Linear(backbone.fc.in_features, 256, bias=True),
            torch.nn.Linear(256, 32, bias=True),
            torch.nn.Linear(32, 4, bias=True)
        ])

        # Loss (`weight` holds the class weights and is defined outside this snippet)
        self._loss = torch.nn.CrossEntropyLoss(weight=weight.float())

    def forward(self, x):
        x = self.features_extractor(x)
        x = x.squeeze(-1).squeeze(-1)
        x = self.fc(x)
        return x

    def loss(self, logits, y):
        return self._loss(logits, y)

    def training_step(self, batch, batch_idx):
        # 1. Inference
        x, y = batch
        y_hat = self.forward(x)
        # 2. Loss
        loss = self.loss(y_hat, y)
        # 3. Output
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y)
        return {'val_loss': loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=0.01)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
        return [optimizer], [scheduler]

    def prepare_data(self):
        self.train_ds = ClassificationDataset(self.train_df, 'data/images')
        self.val_ds = ClassificationDataset(self.val_df, 'data/images')

    def train_dataloader(self):
        # `train_sampler` is a WeightedRandomSampler defined outside this snippet (see below)
        return torch.utils.data.DataLoader(self.train_ds, batch_size=256, num_workers=4, sampler=train_sampler)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_ds, batch_size=64, num_workers=4)


model = MyModel(train_df, val_df, test_df, hparams=Namespace(lr=0.001))
trainer = pl.Trainer(gpus=1, max_epochs=2, train_percent_check=0.01, weights_summary='top')
trainer.fit(model)
---------------------------------------------------------------------------
MisconfigurationException Traceback (most recent call last)
<ipython-input-412-55f3b29fc11e> in <module>
4 # Trainer
5 trainer = pl.Trainer(gpus=1, max_epochs=2, train_percent_check=0.01, weights_summary='top')
----> 6 trainer.fit(model)
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, test_dataloaders)
702
703 elif self.single_gpu:
--> 704 self.single_gpu_train(model)
705
706 elif self.use_tpu: # pragma: no-cover
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_parts.py in single_gpu_train(self, model)
475 self.optimizers = optimizers
476
--> 477 self.run_pretrain_routine(model)
478
479 def tpu_train(self, tpu_core_idx, model):
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py in run_pretrain_routine(self, model)
862
863 # CORE TRAINING LOOP
--> 864 self.train()
865
866 def test(self, model: Optional[LightningModule] = None):
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in train(self)
364
365 # update LR schedulers
--> 366 self.update_learning_rates(interval='epoch')
367
368 if self.max_steps and self.max_steps == self.global_step:
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py in update_learning_rates(self, interval)
779 avail_metrics = ','.join(list(self.callback_metrics.keys()))
780 raise MisconfigurationException(
--> 781 f'ReduceLROnPlateau conditioned on metric {monitor_key}'
782 f' which is not available. Available metrics are: {avail_metrics}.'
783 ' Condition can be set using `monitor` key in lr scheduler dict'
MisconfigurationException: ReduceLROnPlateau conditioned on metric val_loss which is not available. Available metrics are: . Condition can be set using `monitor` key in lr scheduler dict
Dataset
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, root_dir: pathlib.Path, test=False):
        self.df = df
        self.test = test
        self.root_dir = root_dir
        self.transforms = transforms.Compose([
            transforms.Resize(size=(224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.number_of_categories = len(self.df.time_cat.cat.categories)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        sample = datasets.folder.default_loader(pathlib.Path(self.root_dir) / pathlib.Path(self.df.iloc[index]['filename']))
        sample = self.transforms(sample)
        y = int(self.df.time_cat.cat.codes.iloc[index])
        return (sample, y)

    def __len__(self):
        return self.df.shape[0]
Hi! Thanks for your contribution, great first issue!
Can you try renaming avg_val_loss in validation_epoch_end to val_loss?
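For reference, a minimal sketch of what that suggestion amounts to, keeping the rest of validation_epoch_end from the original code:

def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    tensorboard_logs = {'val_loss': avg_loss}
    # return the epoch-level metric under the same key the scheduler monitors
    return {'val_loss': avg_loss, 'log': tensorboard_logs}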
@phihung Already tried that, and tried again, but it gives the same error. Printing the validation loss works normally and displays the torch.Tensor associated with it. The mean of val_loss also works.
From a quick read of the code, it looks like only training metrics can be used with ReduceLROnPlateau.
As a test, adding the following code to your MyModel class should make the error disappear:
def training_epoch_end(self, outputs):
    return {"val_loss": 1}
That doesn't work either. Another strange thing is that if I remove the ReduceLROnPlateau scheduler, the progress bar doesn't display any metric (train_loss, val_loss).
I could get your code to work on dummy data. If you look closely at the error message you will see the hint: "Available metrics are: .". Since not even your training loss is available as a metric, your training_step has not been evaluated yet.
Your problem seems to be a combination of a small effective dataset size(?), your choice of train_percent_check=0.01, and a large batch size, which means that your number of batches gets rounded down to 0. You can see here how the number of batches is calculated:
https://github.com/PyTorchLightning/pytorch-lightning/blob/f531ab957b05c97630d98fed18f9349b7e97046b/pytorch_lightning/trainer/data_loading.py#L165-L170
If I am correct that num_batches=0 in your case, this means that nothing is evaluated, and no metrics are available for your learning rate scheduler.
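A rough sketch of that calculation, assuming (as the linked lines suggest) that Lightning multiplies the dataloader length by train_percent_check and truncates to an int:

# hypothetical illustration of how the batch count rounds down to zero
num_training_batches = len(train_dataloader)              # batches per epoch
num_training_batches = int(num_training_batches * 0.01)   # train_percent_check=0.01
# any dataloader shorter than 100 batches ends up with 0 training batches,
# so training_step never runs and no metrics reach the scheduler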
@SkafteNicki If I initialize the dataloader and sample a batch from it, I get the following:
train_ds = ClassificationDataset(train_df, 'data/images')
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=256, num_workers=4, sampler=train_sampler)
print("train_ds length", len(train_ds))
train_sample_batch = next(iter(train_dl))
print(train_sample_batch[0].shape)
train_ds length 4270802
torch.Size([64, 3, 224, 224])
Ok, I discovered that if I remove the train_sampler, which is a torch WeightedRandomSampler, it works again.
class_sample_count = np.array([len(np.where(train_df.time_cat.cat.codes.values==t)[0]) for t in np.unique(train_df.time_cat.cat.codes.values)])
weight = 1. / class_sample_count
samples_weights = torch.tensor([weight[t] for t in train_df.time_cat.cat.codes.values], dtype=torch.float)
train_sampler = torch.utils.data.WeightedRandomSampler(samples_weights, 64)
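If that is the case, it would fit the rounding explanation above: the second positional argument of WeightedRandomSampler is num_samples, so the sampler above draws only 64 samples per epoch. At batch_size=256 that would give a train dataloader of length 1, which train_percent_check=0.01 rounds down to 0 batches. A sketch of a sampler that keeps the class weighting but covers the full dataset each epoch (variable names taken from the snippet above):

# draw one weighted sample per row of train_df instead of only 64,
# so the dataloader length is no longer truncated to a single batch
train_sampler = torch.utils.data.WeightedRandomSampler(
    samples_weights,
    num_samples=len(samples_weights),
    replacement=True,
)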
Okay, then it does not seem to be a general bug in the code, but a specific corner case. If you want me to dig more into this, a notebook where the error can be reproduced would be nice.
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.
We can reopen if we want to do this. It also seems like an enhancement.
I am facing the same issue as of Lightning 0.9.0. Are there no plans to improve upon this? As I see it, learning rate schedulers that do not work with validation losses make me look towards other libraries.
@swd543 It is not correct that learning rate schedulers in Lightning cannot be conditioned on specific metrics. Take this example from the docs:
def configure_optimizers(self):
    optimizers = [Adam(...), SGD(...)]
    schedulers = [
        {
            'scheduler': ReduceLROnPlateau(optimizers[0], ...),
            'monitor': 'val_loss',  # Default: val_loss
            'interval': 'epoch',
            'frequency': 1
        },
        LambdaLR(optimizers[1], ...)
    ]
    return optimizers, schedulers
Here the scheduler is conditioned on the monitor value, which is set to the validation loss.
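Applied to the single-optimizer setup from the original post, that pattern would look roughly like this (a sketch reusing the optimizer and scheduler from the original configure_optimizers):

def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=0.01)
    scheduler = {
        'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer),
        'monitor': 'val_loss',   # the metric returned/logged from validation
        'interval': 'epoch',
        'frequency': 1,
    }
    return [optimizer], [scheduler]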
How the heck do I make val_loss available for the LR scheduler?
This is my error:
pytorch_lightning.utilities.exceptions.MisconfigurationException: ReduceLROnPlateau conditioned on metric val_loss which is not available. Available metrics are: val_early_stop_on,val_checkpoint_on,checkpoint_on. Condition can be set using `monitor` key in lr scheduler dict
I return val_loss from the validation step like this:
def validation_step(self, batch, batch_idx):
    ...
    loss = self.loss_funciton(masks_pred, masks)
    result = pl.EvalResult(loss, checkpoint_on=loss)
    result.log("val_loss", loss, sync_dist=True, prog_bar=True)
    ...
    return result
@vegovs I am having the same problem
@jovenwayfarer I figured it out. Will post here in a jiffy. Or you can check out the code: https://github.com/gil-uav/semantic-image-segmentation
@vegovs Could you let us know how you made lr_scheduler work with val_loss? Had a look through your code but couldn't figure it out.
@jovenwayfarer & @swd543 I don't remember where I got it from but it does exist in the docs somewhere.
See the comment over the monitor key in the scheduler dict. :)
def configure_optimizers(self):
    optimizer = torch.optim.Adam(
        self.parameters(), lr=(self.lr or self.learning_rate)
    )
    lr_scheduler = ReduceLROnPlateau(optimizer, "min")
    scheduler = {
        "scheduler": lr_scheduler,
        "reduce_on_plateau": True,
        # val_checkpoint_on is val_loss passed in as checkpoint_on
        "monitor": "val_checkpoint_on",
        "patience": 5,
        "mode": "min",
        "factor": 0.1,
        "verbose": True,
        "min_lr": 1e-8,
    }
    return [optimizer], [scheduler]
However, I do not have any empirical evidence of it working.
I am using the Adam optimizer, and I think my early stopping callback (20 epochs of patience) triggers before ReduceLROnPlateau does.
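For readers hitting the same error on Lightning 1.0 or later, where EvalResult was removed in favour of self.log, a minimal sketch of the equivalent setup (assuming a module that already defines self.loss) would be:

def validation_step(self, batch, batch_idx):
    x, y = batch
    loss = self.loss(self(x), y)
    # self.log makes "val_loss" visible to callbacks and LR schedulers
    self.log("val_loss", loss, prog_bar=True)
    return loss

def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")
    # the scheduler dict's "monitor" key must match the logged metric name
    return [optimizer], [{"scheduler": scheduler, "monitor": "val_loss"}]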