To load a pretrained model, I need to provide the path to 'meta_tags.csv'.
However, 'meta_tags.csv' contains nothing.
This is the same get_started code, with checkpoint callbacks added:
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import pytorch_lightning as pl

class CoolSystem(pl.LightningModule):

    def __init__(self):
        super(CoolSystem, self).__init__()
        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        # REQUIRED
        x, y = batch
        y_hat = self.forward(x)
        return {'loss': F.cross_entropy(y_hat, y)}

    def validation_step(self, batch, batch_nb):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': F.cross_entropy(y_hat, y)}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        # REQUIRED
        # can return multiple optimizers and learning_rate schedulers
        return torch.optim.Adam(self.parameters(), lr=0.02)

    @pl.data_loader
    def tng_dataloader(self):
        # REQUIRED
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def test_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from test_tube import Experiment

model = CoolSystem()

checkpoint_callback = ModelCheckpoint(
    filepath='./model_ckpt/weights6.ckpt',
    save_best_only=True,
    verbose=True,
    monitor='val_loss',
    mode='auto'
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=True,
    mode='auto'
)

exp = Experiment(save_dir=os.getcwd(), version=24)
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1, gpus=[1],
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
I tried this on my own model and on this base tutorial code, but both of them fail.
I just started using this API today, and I've run into a lot of difficulties.
Sometimes I need to import exit from sys.
Sometimes the loss doesn't decrease; after restarting the Jupyter notebook several times, the loss suddenly starts decreasing.
Sometimes memory can't be allocated during training (around epoch 81).
And I can't tag something like 'Losses/train_loss' or 'Losses/valid_loss' for TensorBoard using test-tube... The error says 'loss' is referenced before assignment.
I can tolerate these difficulties because I can still make progress with my training.
But I have no solution for loading a model when meta_tags.csv is empty.
conda version (no venv)
no conda
PyTorch version
torch==1.2.0+cu92
torchvision==0.4.0
Lightning version
pytorch-lightning==0.4.7
Test-tube version
test-tube==0.6.9
filepath='./model_ckpt/weights6.ckpt',
should be a directory path:
filepath='./model_ckpt/',
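e.g. the same callback as above, just pointing at a directory (quick sketch, untested):
checkpoint_callback = ModelCheckpoint(
    filepath='./model_ckpt/',   # a directory; checkpoints like _ckpt_epoch_N.ckpt get written inside it
    save_best_only=True,
    verbose=True,
    monitor='val_loss',
    mode='auto'
)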
Oh, I see
Thanks!
Hello, I tested this today following your reply!
But I got the same error when loading the pretrained model.
model = CountingSystem()

checkpoint_callback = ModelCheckpoint(
    filepath='./ckpt/',
    save_best_only=True,
    verbose=True,
    monitor='avg_val_score',
    mode='max'
)

early_stopping = EarlyStopping(
    monitor='avg_val_score',
    patience=20,
    verbose=True,
    mode='max'
)

exp = Experiment(save_dir=os.getcwd(), version=25)
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1, gpus=[0],
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
trainer.fit(model)
Then the output is:
Epoch 00089: avg_val_score did not improve
12it [00:17, 1.26it/s, avg_val_loss=11, avg_val_score=0.732, batch_nb=3, epoch=88, gpu=0, loss=2.839, v_nb=25]
Epoch 00089: early stopping
Then I loaded this pretrained model:
pretrained_model = pl.LightningModule.load_from_metrics(
    weights_path='./ckpt/',
    tags_csv='./default/version_25/meta_tags.csv',
    on_gpu=True,
    map_location=None
)
But I got this error:
IsADirectoryError: [Errno 21] Is a directory: './ckpt/'
And then I tried including the file name:
pretrained_model = pl.LightningModule.load_from_metrics(
    weights_path='./ckpt/_ckpt_epoch_69.ckpt',
    tags_csv='./default/version_25/meta_tags.csv',
    on_gpu=True,
    map_location=None
)
But I got this error.
TypeError                                 Traceback (most recent call last)
in
      3     tags_csv='./default/version_25/meta_tags.csv',
      4     on_gpu=True,
----> 5     map_location=None
      6 )

/opt/conda/lib/python3.6/site-packages/pytorch_lightning/root_module/root_module.py in load_from_metrics(cls, weights_path, tags_csv, on_gpu, map_location)
    132
    133         # load the state_dict on the model automatically
--> 134         model = cls(hparams)
    135         model.load_state_dict(checkpoint['state_dict'])
    136

/opt/conda/lib/python3.6/site-packages/pytorch_lightning/root_module/root_module.py in __init__(self, *args, **kwargs)
     11
     12     def __init__(self, *args, **kwargs):
---> 13         super(LightningModule, self).__init__(*args, **kwargs)
     14
     15         self.dtype = torch.FloatTensor

TypeError: __init__() takes 1 positional argument but 2 were given
Would you give me some advice?
You need to load your module, not the general LightningModule.
Instead of
pl.LightningModule.load_from_metrics(...
Do this:
CountingSystem.load_from_metrics(
I changed the code, but I got the same error message.
pretrained_model = CountingSystem.load_from_metrics(
    weights_path='./ckpt/_ckpt_epoch_69.ckpt',
    tags_csv='./default/version_25/meta_tags.csv',
    on_gpu=True,
    map_location={'cuda:0': 'cuda:1'}
)
TypeError                                 Traceback (most recent call last)
in
      3     tags_csv='./default/version_25/meta_tags.csv',
      4     on_gpu=True,
----> 5     map_location={'cuda:0':'cuda:1'}
      6 )

/opt/conda/lib/python3.6/site-packages/pytorch_lightning/root_module/root_module.py in load_from_metrics(cls, weights_path, tags_csv, on_gpu, map_location)
    132
    133         # load the state_dict on the model automatically
--> 134         model = cls(hparams)
    135         model.load_state_dict(checkpoint['state_dict'])
    136

TypeError: __init__() takes 1 positional argument but 2 were given
By the way, is this the correct way to map the GPU location?
I couldn't find an example that uses load_from_metrics, so I'm confused.
pretrained_model = CountingSystem.load_from_metrics(
    weights_path='./ckpt/_ckpt_epoch_69.ckpt',
    tags_csv='./default/version_25/meta_tags.csv',
    on_gpu=True,
    map_location='cuda:1'
)
TypeError                                 Traceback (most recent call last)
in
      3     tags_csv='./default/version_25/meta_tags.csv',
      4     on_gpu=True,
----> 5     map_location='cuda:1'
      6 )

/opt/conda/lib/python3.6/site-packages/pytorch_lightning/root_module/root_module.py in load_from_metrics(cls, weights_path, tags_csv, on_gpu, map_location)
    132
    133         # load the state_dict on the model automatically
--> 134         model = cls(hparams)
    135         model.load_state_dict(checkpoint['state_dict'])
    136

TypeError: __init__() takes 1 positional argument but 2 were given
Still same error..

And this is the meta_tags.csv file.
Is it normal that this file has nothing in it?

In metric.csv, there are training logs
no. the file should have something. it might have to do with you using ipynb. i believe the logger doesn't support that atm
fit and load without ipynb
we have plenty of tests that save and load models. so it's likely the ipynb
I see.. then I can't save a model from a Jupyter notebook..
I'll test with a .py file and reopen this issue if there is a problem.
I'm sad to reopen this issue..
I got the same error when running from a .py file.
And this time I only used the tutorial code plus the checkpoint and early-stopping callbacks.
The path settings also look okay to me.

meta_tags.csv is still empty.
This is the part I changed from the tutorial code:
model = CoolSystem()

checkpoint_callback = ModelCheckpoint(
    filepath='base_ckpt',
    save_best_only=True,
    verbose=True,
    monitor='avg_val_loss',
    mode='auto'
)

early_stopping = EarlyStopping(
    monitor='avg_val_loss',
    patience=5,
    verbose=True,
    mode='auto'
)

exp = Experiment(save_dir=os.getcwd(), version=0)
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1, gpus=[0],
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
trainer.fit(model)

pretrained_model = CoolSystem.load_from_metrics(
    weights_path='./base_ckpt/_ckpt_epoch_6.ckpt',
    tags_csv='./default/version_0/meta_tags.csv',
    on_gpu=True,
    map_location=None
)
Traceback (most recent call last):
  File "/tmp/pycharm_project_164/lightning_basecode.py", line 89, in <module>
    map_location=None
  File "/opt/conda/lib/python3.6/site-packages/pytorch_lightning/root_module/root_module.py", line 134, in load_from_metrics
    model = cls(hparams)
TypeError: __init__() takes 1 positional argument but 2 were given
enable auto_save to true or do
exp.save()
otherwise, tags won't be saved
so...
exp = Experiment(save_dir=os.getcwd(), version=0)
exp.save() # <------- here
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1, gpus=[0], checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
trainer.fit(model)
maybe auto_save makes sense to default to true?
exp.save()
or
exp = Experiment(save_dir=os.getcwd(), autosave=True, version=0)
Neither of them works for me.
I dug into the Experiment class in test-tube to figure out where the tags fail to be saved, but I couldn't work it out..
meta_tags.csv is still empty

It'd be nice if the meta tags were saved automatically by default.
ok. is this exactly the code you're using? i'll run it locally to see what's happening.
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import pytorch_lightning as pl

class CoolSystem(pl.LightningModule):

    def __init__(self):
        super(CoolSystem, self).__init__()
        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        # REQUIRED
        x, y = batch
        y_hat = self.forward(x)
        return {'loss': F.cross_entropy(y_hat, y)}

    def validation_step(self, batch, batch_nb):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': F.cross_entropy(y_hat, y)}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        # REQUIRED
        # can return multiple optimizers and learning_rate schedulers
        return torch.optim.Adam(self.parameters(), lr=0.02)

    @pl.data_loader
    def tng_dataloader(self):
        # REQUIRED
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def test_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from test_tube import Experiment

model = CoolSystem()

checkpoint_callback = ModelCheckpoint(
    filepath='base_ckpt',
    save_best_only=True,
    verbose=True,
    monitor='avg_val_loss',
    mode='auto'
)

early_stopping = EarlyStopping(
    monitor='avg_val_loss',
    patience=5,
    verbose=True,
    mode='auto'
)

exp = Experiment(save_dir=os.getcwd(), version=1)
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1, gpus=[0],
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
trainer.fit(model)
exp.save()

pretrained_model = CoolSystem.load_from_metrics(
    weights_path='./base_ckpt/_ckpt_epoch_6.ckpt',
    tags_csv='./default/version_0/meta_tags.csv',
    on_gpu=True,
    map_location=None
)
Yes, this is the full code I'm using.
I just added the callbacks based on the tutorial code.
I see it now...
meta_tags.csv is not empty
(ddt) mbp:$ cat test/version_0/meta_tags.csv
key,value
A few things are wrong with your code:
this works for me:
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import pytorch_lightning as pl

class CoolSystem(pl.LightningModule):

    def __init__(self, hparams=None):
        super(CoolSystem, self).__init__()
        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        # REQUIRED
        x, y = batch
        y_hat = self.forward(x)
        return {'loss': F.cross_entropy(y_hat, y)}

    def validation_step(self, batch, batch_nb):
        # OPTIONAL
        x, y = batch
        y_hat = self.forward(x)
        return {'val_loss': F.cross_entropy(y_hat, y)}

    def validation_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        # REQUIRED
        # can return multiple optimizers and learning_rate schedulers
        return torch.optim.Adam(self.parameters(), lr=0.02)

    @pl.data_loader
    def tng_dataloader(self):
        # REQUIRED
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def val_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

    @pl.data_loader
    def test_dataloader(self):
        # OPTIONAL
        return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32)

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from test_tube import Experiment

model = CoolSystem()

checkpoint_callback = ModelCheckpoint(
    filepath='base_ckpt',
    save_best_only=True,
    verbose=True,
    monitor='avg_val_loss',
    mode='auto'
)

early_stopping = EarlyStopping(
    monitor='avg_val_loss',
    patience=5,
    verbose=True,
    mode='auto'
)

exp = Experiment(save_dir=os.getcwd(), version=0)
trainer = Trainer(experiment=exp, max_nb_epochs=1000, train_percent_check=0.1,
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stopping)
# trainer.fit(model)
exp.save()

pretrained_model = CoolSystem.load_from_metrics(
    weights_path='./base_ckpt/_ckpt_epoch_2.ckpt',
    tags_csv='./default/version_0/meta_tags.csv',
    on_gpu=True,
    map_location=None
)
print('model loaded...')
@Soo95 but your user experience highlights a good thing we need to fix.
We need to make load_from_metrics use an OPTIONAL tags_csv in the case where your model has no hparam arguments.
Expand on the model loading documentation
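In the meantime, a note for anyone hitting the same TypeError: the traceback shows that load_from_metrics calls cls(hparams), so your LightningModule's __init__ has to accept an hparams argument (that is the one-line difference in the working snippet above). A minimal sketch of the pattern, where 'hidden_dim' is a purely hypothetical hyperparameter:

from argparse import Namespace
import torch
import pytorch_lightning as pl

class CoolSystem(pl.LightningModule):

    def __init__(self, hparams=None):
        super(CoolSystem, self).__init__()
        # hparams is the Namespace that load_from_metrics rebuilds from meta_tags.csv;
        # fall back to defaults so CoolSystem() still works without it
        if hparams is None:
            hparams = Namespace(hidden_dim=10)  # hypothetical hyperparameter
        self.hparams = hparams
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)

With an __init__ like this, CoolSystem.load_from_metrics(weights_path=..., tags_csv=...) can construct the model with the restored hyperparameters before loading the state dict.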
Hello! Good day.
It appears I'm having a similar issue. I have the following model:
class Predictor(pl.LightningModule):

    def __init__(self,
                 nb_layers,
                 nb_lstm_units=100,
                 input_dim=10,
                 batch_size=256,
                 bilstm=False,
                 dropout=0.2,
                 hidden_size=100,
                 encoder_discrete=[],
                 embedding_dim=10):
        super(Predictor, self).__init__()
        self.input_dim = input_dim
        self.nb_lstm_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.batch_size = batch_size
        self.bilstm = bilstm
        self.dropout = dropout
        self.hidden_size = hidden_size
        self.unique_length_encoders = encoder_discrete
        self.embedding_dim = embedding_dim
        self.loss = RMSELoss()
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.1)
        self.__build_model()

    def __build_model(self):
        self.embedding = nn.ModuleList([nn.Embedding(num_embeddings=length + 1,
                                                     embedding_dim=self.embedding_dim,
                                                     padding_idx=0) for length in self.unique_length_encoders])
        # Design LSTM
        self.lstm = torch.nn.LSTM(input_size=self.input_dim,
                                  hidden_size=self.nb_lstm_units,
                                  num_layers=self.nb_lstm_layers,
                                  batch_first=True,
                                  dropout=self.dropout if self.dropout and self.nb_lstm_layers > 1 else 0,
                                  bidirectional=self.bilstm).to(DEVICE)
        self.dense1 = torch.nn.Linear(self.nb_lstm_units * 2 if self.bilstm else self.nb_lstm_units,
                                      1).to(DEVICE)
        self.init_weigths()

    def init_hidden(self):
        hidden = torch.randn(self.nb_lstm_layers * 2 if self.bilstm else self.nb_lstm_layers,
                             self.batch_size, self.nb_lstm_units).to(DEVICE)
        cell = torch.randn(self.nb_lstm_layers * 2 if self.bilstm else self.nb_lstm_layers,
                           self.batch_size, self.nb_lstm_units).to(DEVICE)
        return (hidden, cell)

    def init_weigths(self):
        for name, param in self.lstm.named_parameters():
            if 'weight_hh' in name:
                torch.nn.init.orthogonal_(param)
            elif 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                torch.nn.init.zeros_(param)
        torch.nn.init.xavier_uniform_(self.dense1.weight)
        print('weigths initializer: done!')

    def forward(self, X_discrete, X_continuous, X_lengths):
        (self.hidden, self.cell) = self.init_hidden()
        for i, emb in enumerate(self.embedding):
            if i == 0:
                X_discrete_emb = emb(X_discrete[:, :, i])
            else:
                X_discrete_emb = torch.cat((X_discrete_emb, emb(X_discrete[:, :, i])), 2)
        X = torch.cat((X_discrete_emb, X_continuous), 2)
        X = pack_padded_sequence(X, X_lengths, batch_first=True, enforce_sorted=False)
        X, (self.hidden, self.cell) = self.lstm(X, (self.hidden, self.cell))
        X, _ = pad_packed_sequence(X, batch_first=True, padding_value=0)
        hidden = self.hidden.view(self.nb_lstm_layers, 2, -1,
                                  self.nb_lstm_units)[-1] if self.bilstm else self.hidden[-1]
        hidden = hidden.contiguous()
        hidden = hidden.view(-1, self.nb_lstm_units * 2 if self.bilstm else self.nb_lstm_units)
        hidden = self.dense1(hidden)
        return hidden

    def my_loss(self, y_hat, y):
        return self.loss(y_hat, y)

    def training_step(self, batch, batch_nb):
        (x_discrete, x_continuous, lengths, y) = batch
        y_hat = self.forward(x_discrete, x_continuous, lengths)
        loss = self.my_loss(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        (x_discrete, x_continuous, lengths, y) = batch
        y_hat = self.forward(x_discrete, x_continuous, lengths)
        return {'val_loss': self.my_loss(y_hat, y)}

    def validation_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.0001, weight_decay=5e-4, amsgrad=True)]

    @pl.data_loader
    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=my_collate)

    @pl.data_loader
    def val_dataloader(self):
        return DataLoader(val_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=my_collate)
I have already trained the model, but I wasn't able to run inference because there wasn't enough GPU memory.
So I restarted my kernel and decided to load the model again for inference with the following few lines of code:
model = Predictor.load_from_metrics(
    weights_path='/lightning_logs/version_50/checkpoints/_ckpt_epoch_50.ckpt',
    tags_csv="/lightning_logs/version_50/meta_tags.csv",
)
I'm getting the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-68-aa919fee611f> in <module>
1 model = HitsPredictor.load_from_metrics(
2 weights_path='/home/daniel/trivago_challenge2/lightning_logs/version_50/checkpoints/_ckpt_epoch_50.ckpt',
----> 3 tags_csv="/home/daniel/trivago_challenge2/lightning_logs/version_50/meta_tags_old.csv",
4 )
5
/opt/conda/envs/nlp/lib/python3.7/site-packages/pytorch_lightning/root_module/root_module.py in load_from_metrics(cls, weights_path, tags_csv)
152
153 # load the state_dict on the model automatically
--> 154 model = cls(hparams)
155 model.load_state_dict(checkpoint['state_dict'])
156
<ipython-input-67-161a6c83168a> in __init__(self, nb_layers, nb_lstm_units, input_dim, batch_size, bilstm, dropout, hidden_size, encoder_discrete, embedding_dim)
46
47 # Function that builds the actual model
---> 48 self.__build_model()
49
50 def __build_model(self):
<ipython-input-67-161a6c83168a> in __build_model(self)
67 batch_first=True,
68 # Appliying dropout only if we have more than 1 layer in the network
---> 69 dropout = self.dropout if self.dropout and self.nb_lstm_layers > 1 else 0,
70 # Bidirectional if i choose so in the input.
71 bidirectional = self.bilstm).to(DEVICE)
TypeError: '>' not supported between instances of 'Namespace' and 'int'
From what I have been testing, it seems that my meta_tags.csv file only has the headers key and value, and it didn't save anything about the input parameters that I chose at the beginning of training. So the first question is: why didn't it save them? And the second one: is there any way I can load those parameters so I can use the model? Training this model took almost 18 hours, and I don't have time to train it again right now.
Any help would be much appreciated.
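A possible workaround, not an official API, just a sketch based on what the traceback exposes: the checkpoint file itself holds a 'state_dict' key (see model.load_state_dict(checkpoint['state_dict']) in the stack trace), so the already-trained weights can be restored by rebuilding Predictor with the same constructor arguments used for training and loading the state dict manually. The argument values below are just the constructor defaults from the snippet above; replace them with whatever was actually used:

import torch

# rebuild the module exactly as it was configured for the 18-hour run
model = Predictor(nb_layers=2,              # hypothetical value; this argument has no default
                  nb_lstm_units=100,
                  input_dim=10,
                  batch_size=256,
                  bilstm=False,
                  dropout=0.2,
                  hidden_size=100,
                  encoder_discrete=[],
                  embedding_dim=10)

# map_location='cpu' avoids allocating GPU memory while loading the checkpoint
checkpoint = torch.load('/lightning_logs/version_50/checkpoints/_ckpt_epoch_50.ckpt',
                        map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
model.eval()

This sidesteps the empty meta_tags.csv entirely, but the longer-term fix is the same as above: give __init__ a single hparams argument so load_from_metrics can call cls(hparams).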