I think the newly introduced log function does not log properly when used in training_step. The same code in validation_step produces the desired results.
def training_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)
##### Doesn't Work #######
def validation_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log("my_metric_val", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)
##### Works #######
https://gist.github.com/tobiascz/bb2c6de83263eb38181052840062b5ac
Logs created in training_step should show up in the prog_bar and loggers (such as the TensorBoard logger). The same code in validation_step produces the desired results.
When returning the loss from training_step, it works.
def training_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    return loss
##### Works #######
@tobiascz I believe it isn't a common issue, but from a design perspective, is this the desired behavior? I would expect the logs to work independently of whether the user returned the loss or not; however, a user warning/error should be raised if training_step returns None.
Thanks for pointing this out, @itsikad.
Based on the comment above, I am reopening the issue.
def training_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)
##### Doesn't Work #######
def training_step(self, batch, batch_idx):
    output = self.layer(batch)
    loss = self.loss(batch, output)
    self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    return loss
##### Works #######
Expected behavior
Logging in training_step should be independent of optimising the model (e.g. returning a loss). Even if training_step has some issue and does not return a loss, logging should work as expected.
@tchaton please take a look. Do we want to add a warning here?
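For illustration, here is a minimal sketch of the kind of check being suggested; this is hypothetical code, not the actual Lightning training loop, and the helper name and the automatic_optimization flag are assumptions:
import warnings

def check_training_step_output(step_output, automatic_optimization=True):
    # Hypothetical helper illustrating the suggested warning (not actual Lightning code):
    # warn when training_step returned None so the user knows the optimization step
    # is skipped, while logged values should still be recorded.
    if step_output is None and automatic_optimization:
        warnings.warn(
            "training_step returned None. The optimization step will be skipped, "
            "but values passed to self.log(...) should still be recorded."
        )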
I looked into this a bit but haven't found the source of the issue yet. Using this to test:
def training_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    loss = F.cross_entropy(y_hat, y)
    self.log('with_loss', torch.tensor(0.5, device=self.device), on_step=True, on_epoch=False, prog_bar=True, sync_dist=True, sync_dist_op=None)
    return loss

def training_step_end(self, output):
    print(self._results.items())
    return output
dict_items([('meta', {'_internal': {'_reduce_on_epoch': False, 'batch_sizes': []}, 'with_loss': {'prog_bar': True, 'logger': True, 'on_step': True, 'on_epoch': False, 'reduce_fx': <built-in method mean of type object at 0x7fdad2bd05a0>, 'value': tensor(0.5000), 'tbptt_reduce_fx': <built-in method mean of type object at 0x7fdad2bd05a0>, 'tbptt_pad_token': 0, 'forked': False, 'dataloader_idx': None}}), ('with_loss', tensor(0.5000))])
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 8.28it/s, loss=2.302, v_num=18, with_loss=0.5]
dict_items([('meta', {'_internal': {'_reduce_on_epoch': False, 'batch_sizes': []}, 'no___loss': {'prog_bar': True, 'logger': True, 'on_step': True, 'on_epoch': False, 'reduce_fx': <built-in method mean of type object at 0x7f151459e5a0>, 'value': tensor(0.5000), 'tbptt_reduce_fx': <built-in method mean of type object at 0x7f151459e5a0>, 'tbptt_pad_token': 0, 'forked': False, 'dataloader_idx': None}}), ('no___loss', tensor(0.5000))])
Epoch 0: 100%|██████████| 2/2 [00:00<00:00, 8.49it/s, loss=nan, v_num=19]
The result objects/dictionaries constructed are identical aside from the key I set. Gotta dig deeper...
Edit:
It seems like not returning a loss causes `_curr_step_result` to be `None`, which bypasses the logging call.
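To illustrate the failure mode, here is a simplified stand-in for that code path; this is hypothetical, not the real Lightning internals:
def run_training_batch(step_result):
    # Hypothetical simplification of the behaviour described above, NOT the real
    # Lightning source: _curr_step_result mirrors the attribute mentioned above
    # and is None when training_step returns nothing.
    _curr_step_result = step_result
    if _curr_step_result is None:
        # Early exit: the branch that forwards self.log(...) values to the
        # progress bar and loggers is never reached.
        return None
    return _curr_step_result  # downstream, logged metrics get flushed from here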
Dear @tobiascz,
Thanks for noticing this incorrect behaviour. I will look into it ASAP.
Best regards,
Thomas Chaton.
Dear @tobiascz,
It seems the bug is currently in the release branch but not master. Let me investigate from there.
def test_train_logging_training_step(tmpdir):
    """
    This test makes sure values logged from training_step are properly captured.
    """
    class ExtendedBoringModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            return self.layer(x)

        def loss(self, batch, prediction):
            # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls
            return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
            self.log("my_metric_val", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        def test_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log("test_loss", loss, prog_bar=True, logger=True, on_step=False, on_epoch=True)

        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer], [lr_scheduler]

        def train_dataloader(self):
            return torch.utils.data.DataLoader(RandomDataset(32, 64))

        def val_dataloader(self):
            return torch.utils.data.DataLoader(RandomDataset(32, 64))

        def test_dataloader(self):
            return torch.utils.data.DataLoader(RandomDataset(32, 64))

    model = ExtendedBoringModel()
    model.training_step_end = None
    model.training_epoch_end = None

    # Initialize a trainer
    trainer = pl.Trainer(
        max_epochs=1,
        min_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
    )

    # Train the model ⚡
    trainer.fit(model)
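As a possible follow-up, one could assert on the captured values after fit, e.g. via trainer.callback_metrics; the exact keys below are assumptions based on the names logged above:
    # Possible assertions to append to the test above; the keys are assumptions
    # based on the names passed to self.log(...) in training_step.
    assert "loss" in trainer.callback_metrics
    assert "my_metric_train" in trainer.callback_metrics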
On 1.0.6. You are entirely right.
However, it works fine if you return the loss in training_step.
Sometimes there's no unambiguous way of returning a single loss, though, e.g. in GAN training.
What I'm doing to work around the bug right now is this; hopefully it has no impact on the optimization:
(I'm using automatic_optimization=False, by the way.)
def training_step(self, batch, batch_idx, optimizer_idx):
    ...
    D_loss = ...
    G_loss = ...
    ...
    self.log(D, ...)
    self.log(G, ...)
    return torch.tensor(0)
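For reference, a fuller manual-optimization sketch of the same workaround; self.optimizers() and self.manual_backward() assume a Lightning version that provides them, and compute_d_loss/compute_g_loss are hypothetical placeholders for your own loss computations:
def training_step(self, batch, batch_idx):
    # Manual-optimization sketch of a GAN step; the loss helpers are placeholders.
    opt_d, opt_g = self.optimizers()

    d_loss = self.compute_d_loss(batch)  # hypothetical discriminator loss helper
    opt_d.zero_grad()
    self.manual_backward(d_loss)
    opt_d.step()

    g_loss = self.compute_g_loss(batch)  # hypothetical generator loss helper
    opt_g.zero_grad()
    self.manual_backward(g_loss)
    opt_g.step()

    self.log("d_loss", d_loss, prog_bar=True)
    self.log("g_loss", g_loss, prog_bar=True)

    # Dummy return so the logged values are not dropped on the affected versions,
    # as in the workaround above.
    return torch.tensor(0)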
Hey @hecoding,
You might want to have a look at this: https://github.com/PyTorchLightning/pytorch-lightning/pull/4618
Best,
T.C
Hey @hecoding,
Do you get this problem when using master?
Best regards,
Thomas Chaton.
Dear @tobiascz,
It seems the bug is currently in the release branch but not master. Let me investigate from there.
master branch
def test_train_logging_training_step(tmpdir): """ This tests makes sure training_step logged value are properly captured. """ class ExtendedBoringModel(BoringModel): def __init__(self): super().__init__() self.layer = torch.nn.Linear(32, 2) def forward(self, x): return self.layer(x) def loss(self, batch, prediction): # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) def training_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) self.log("loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True) self.log("my_metric_train", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True) def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True) self.log("my_metric_val", 1001, prog_bar=True, logger=True, on_step=True, on_epoch=True) def test_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) self.log("test_loss", loss, prog_bar=True, logger=True, on_step=False, on_epoch=True) def configure_optimizers(self): optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] def train_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64)) def val_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64)) def test_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64)) model = ExtendedBoringModel() model.training_step_end = None model.training_epoch_end = None # Initialize a trainer trainer = pl.Trainer( max_epochs=1, min_epochs=1, limit_train_batches=2, limit_val_batches=0, ) # Train the model β‘ trainer.fit(model)
On 1.0.6. You are entirely right.
However, it works fine if you return the loss in training_step.
I added this
%capture
! pip install git+https://github.com/PyTorchLightning/pytorch-lightning@master
! pip install pytorch-lightning-bolts
to the code here: https://gist.github.com/tobiascz/bb2c6de83263eb38181052840062b5ac
But the error is still present.
Hey @hecoding,
Do you get this problem when using master?
Best regards,
Thomas Chaton.
Hi @tchaton, sorry, busy week. No, I just ran the code on 1.0.4 from pip. I guess tobias answered your question already; let me know otherwise.
Good stuff on the lambda closure, by the way; I'll probably be using that :+1:
Cheers.
This should be fixed on master.