I'm training an LSTM for character-level text generation. Initially I initialize the hidden and cell states with zeros using torch.zeros. Unfortunately, these tensors are placed on the CPU by default, so I get the following error while training:
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu
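The mismatch is easy to reproduce in isolation: moving a module to the GPU moves its parameters, but a freshly created torch.zeros tensor stays on the CPU. A minimal sketch (assuming a CUDA device is available; the sizes are illustrative):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2).cuda()
x = torch.randn(5, 1, 8).cuda()   # input lives on cuda:0
hidden = torch.zeros(2, 1, 16)    # created on the CPU by default
cell = torch.zeros(2, 1, 16)
out, _ = lstm(x, (hidden, cell))  # raises the RuntimeError above

Here is my model: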
import torch
import torch.nn as nn
from torch.optim import Adam, lr_scheduler
import pytorch_lightning as pl

class RNN(pl.LightningModule):
    lr = 0.0005

    def __init__(self, input_size, hidden_size, embeding_size, n_categories, n_layers, output_size, p):
        super().__init__()
        self.criterion = nn.CrossEntropyLoss()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embeding = nn.Embedding(input_size + n_categories, embeding_size)
        self.lstm = nn.LSTM(embeding_size + n_categories, hidden_size, n_layers, dropout=p)
        self.out_fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)

    def forward(self, batch_of_category, batch_of_letter, hidden, cell):
        # letter-level operations
        embeding = self.dropout(self.embeding(batch_of_letter))
        category_plus_letter = torch.cat((batch_of_category, embeding), 1)
        # sequence_length = 1
        category_plus_letter = category_plus_letter.unsqueeze(1)
        out, (hidden, cell) = self.lstm(category_plus_letter, (hidden, cell))
        out = self.out_fc(out)
        out = out.squeeze(1)
        return out, (hidden, cell)

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), self.lr)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        item_dict = batch
        loss = 0
        batch_of_category = item_dict["category_tensors"]
        # we loop over letters, one time step at a time;
        # the .cuda() calls below are the workaround this issue is about
        hidden = torch.zeros(self.n_layers, 1, self.hidden_size).cuda()
        cell = torch.zeros(self.n_layers, 1, self.hidden_size).cuda()
        for t in range(item_dict["input_tensors"].size(1)):
            batch_of_letter = item_dict["input_tensors"][:, t]
            output, (hidden, cell) = self(batch_of_category, batch_of_letter, hidden, cell)
            loss += self.criterion(output, item_dict["target_tensors"][:, t])
        loss = loss / (t + 1)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        cell = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return hidden, cell
A sample batch from the datamodule looks like this (categories, names, one-hot category tensor, input tensor, target tensor):

(['Russian', 'English', 'Russian', 'English'],
['Piskarenkov', 'Clarkson', 'Pochkaev', 'Woods'],
tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
tensor([[42, 9, 19, 11, 1, 18, 5, 14, 11, 15, 22],
[29, 12, 1, 18, 11, 19, 15, 14, 0, 0, 0],
[42, 15, 3, 8, 11, 1, 5, 22, 0, 0, 0],
[49, 15, 15, 4, 19, 0, 0, 0, 0, 0, 0]]),
tensor([[ 9, 19, 11, 1, 18, 5, 14, 11, 15, 22, 59],
[12, 1, 18, 11, 19, 15, 14, 59, 0, 0, 0],
[15, 3, 8, 11, 1, 5, 22, 59, 0, 0, 0],
[15, 15, 4, 19, 59, 0, 0, 0, 0, 0, 0]]))
dm = NamesDatamodule(1)
rnn_model = RNN(input_size=ds.n_tokens,
                hidden_size=256,
                embeding_size=128,
                n_layers=2,
                n_categories=ds.n_categories,
                output_size=ds.n_tokens,
                p=0.3)
trainer = Trainer(max_epochs=3,
                  logger=None,
                  gpus=1,
                  early_stop_callback=False,
                  checkpoint_callback=False)
trainer.fit(rnn_model, dm)
Expected behavior: hidden state tensors should automatically be placed on the same device as the module.
Environment: Google Colab.
Additional context: the problem can be worked around by adding .cuda() to the variables, but I don't think that should be necessary.
Hi! Thanks for your contribution! Great first issue!
Try changing it to

hidden = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)
cell = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)
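You could also fold this into the existing init_hidden helper so the states are created on the right device in the first place (a sketch; passing device= to torch.zeros avoids the extra CPU-to-GPU copy that .to() implies):

def init_hidden(self, batch_size):
    # self.device is provided by LightningModule and tracks where the module lives
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size, device=self.device)
    cell = torch.zeros(self.n_layers, batch_size, self.hidden_size, device=self.device)
    return hidden, cell

# then, in training_step:
hidden, cell = self.init_hidden(1)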
@rohitgr7 Yeah, this fixes the problem, but I'm not entirely sure it will also work if I use more than a single machine to train the model.
self.device will always give you the device for the current process (DDP) or the current batch (DP) being executed.
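If you want code that stays device-agnostic even outside a LightningModule, a common pattern is to derive new tensors from one that is already on the right device, e.g. via Tensor.new_zeros (a sketch; the helper name is made up for illustration):

def make_states(self, reference, batch_size):
    # `reference` should be a float tensor from the current batch
    # (e.g. the one-hot category tensor); new_zeros inherits its
    # device and dtype automatically.
    shape = (self.n_layers, batch_size, self.hidden_size)
    return reference.new_zeros(shape), reference.new_zeros(shape)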
Ok, I can close the issue. Thanks for your help :)