I started using PyTorch Lightning (PL) mainly for the ease of training my model on multiple GPUs. I'm basically using some Transformers from Hugging Face.
import hydra
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.core.lightning import LightningModule
from torchtext import data
from transformers import AutoTokenizer, BertModel

from source.loss.NPairsLoss import NPairsLoss
# Encoder is a small project-specific wrapper around the Hugging Face model (not shown here).


class JointEncoder(LightningModule):
    """Encodes the code and the docstring into the same embedding space."""

    def __init__(self, config, a_encoder, b_encoder):
        super(JointEncoder, self).__init__()
        self.config = config
        self.a_encoder = a_encoder
        self.b_encoder = b_encoder
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.pad_index = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        self.loss_fn = NPairsLoss()

    def forward(self, x1, x2):
        x1 = self.a_encoder(x1)
        x2 = self.b_encoder(x2)
        return x1, x2

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-6, betas=(0.9, 0.999),
                                eps=1e-08, weight_decay=0, amsgrad=True)

    def training_step(self, batch, batch_idx):
        x1, x2 = batch.sentence1, batch.sentence2
        predict = self(x1, x2)
        target = torch.arange(x1.size()[0])
        loss = self.loss_fn(predict, target)
        return {'loss': loss}

    def test_step(self, batch, batch_idx):
        x1, x2 = batch.sentence1, batch.sentence2
        predict = self(x1, x2)
        target = torch.arange(x1.size()[0])
        loss = self.loss_fn(predict, target)
        return {'test_loss': loss}

    def validation_step(self, batch, batch_idx):
        x1, x2 = batch.sentence1, batch.sentence2
        predict = self(x1, x2)
        target = torch.arange(x1.size()[0])
        loss = self.loss_fn(predict, target)
        return {'val_loss': loss}

    def train_dataloader(self):
        train_dataset = data.TabularDataset(
            path=hydra.utils.to_absolute_path(self.config.dataset.train_path),
            format="json",
            fields=self.get_fields())
        return data.BucketIterator(
            dataset=train_dataset,
            batch_size=self.config.train.batch_size
        )

    def test_dataloader(self):
        test_dataset = data.TabularDataset(
            path=hydra.utils.to_absolute_path(self.config.dataset.test_path),
            format="json",
            fields=self.get_fields()
        )
        return data.BucketIterator(
            dataset=test_dataset,
            batch_size=self.config.test.batch_size
        )

    def val_dataloader(self):
        val_dataset = data.TabularDataset(
            path=hydra.utils.to_absolute_path(self.config.dataset.val_path),
            format="json",
            fields=self.get_fields()
        )
        return data.BucketIterator(
            dataset=val_dataset,
            batch_size=self.config.val.batch_size
        )

    def get_fields(self):
        text_field = data.Field(
            use_vocab=False,
            tokenize=self.tokenizer.encode,
            batch_first=True,
            fix_length=self.config.preprocessing.max_length,
            pad_token=self.pad_index
        )
        return {'sentence1': ('sentence1', text_field),
                'sentence2': ('sentence2', text_field)}


@hydra.main(config_path="configs/config.yaml")
def dev_run(cfg):
    a_encoder = Encoder(encoder=BertModel.from_pretrained('bert-base-uncased'))
    b_encoder = Encoder(encoder=BertModel.from_pretrained('bert-base-uncased'))
    print(cfg.pretty())
    model = JointEncoder(config=cfg, a_encoder=a_encoder, b_encoder=b_encoder)
    trainer = Trainer(max_epochs=3, gpus=[0, 1])
    trainer.fit(model)


if __name__ == "__main__":
    dev_run()
This is the nvidia-smi output on the machine:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01 Driver Version: 440.33.01 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX TIT... Off | 00000000:02:00.0 Off | N/A |
| 31% 51C P0 84W / 250W | 0MiB / 6083MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 GeForce GTX TIT... Off | 00000000:03:00.0 Off | N/A |
| 31% 50C P0 85W / 250W | 0MiB / 6083MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 GeForce GTX TIT... Off | 00000000:83:00.0 Off | N/A |
| 29% 46C P0 81W / 250W | 0MiB / 6083MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 GeForce GTX TIT... Off | 00000000:84:00.0 Off | N/A |
| 0% 50C P0 68W / 250W | 0MiB / 6083MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
But I'm getting the following error:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
fn(i, *args)
File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_data_parallel.py", line 527, in ddp_train
model = model.configure_ddp(model, device_ids)
File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 900, in configure_ddp
find_unused_parameters=True
File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 285, in __init__
self.broadcast_bucket_size)
File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 496, in _distributed_broadcast_coalesced
dist._broadcast_coalesced(self.process_group, tensors, buffer_size)
RuntimeError: CUDA error: no kernel image is available for execution on the device
I consider myself a beginner in deep learning, so it would be of great help if someone could give me some direction on how to overcome this error.
This is a PyTorch issue:
https://github.com/pytorch/pytorch/issues/31285
Maybe this helps someone with the same problem: just use the PyTorch Docker image available on Docker Hub.
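For reference, that error usually means the installed PyTorch binary does not ship kernels for the GPU's compute capability (the GTX TITAN cards above are fairly old). A quick sanity check from Python, assuming a reasonably recent PyTorch build (get_arch_list is not present in very old versions):

    import torch

    # CUDA version the installed PyTorch wheel was built against.
    print("torch:", torch.__version__, "| built with CUDA:", torch.version.cuda)

    # Compute capability of each visible GPU.
    for i in range(torch.cuda.device_count()):
        print(i, torch.cuda.get_device_name(i), torch.cuda.get_device_capability(i))

    # Architectures the binary actually ships kernels for (newer PyTorch only).
    if hasattr(torch.cuda, "get_arch_list"):
        print("compiled for:", torch.cuda.get_arch_list())

If the capability reported for the TITANs is not covered by the compiled architectures, a build that targets it (for example the official Docker image suggested above, or building from source) is the usual fix.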
After resolving the above issue, I still have the following error:
Validation sanity check: 0it [00:00, ?it/s]
Traceback (most recent call last):
...
RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select
Is this happening because I am using a pre-trained BERT from Hugging Face, which is not implemented as a LightningModule, and for that reason it is not being allocated on the GPU?
Note: the model runs perfectly when not using the GPU.
sounds like that model hardcoded a device... which makes it hard to run on GPUs. maybe it's your code though since i doubt HF hardcoded a device.
@sshleifer any thoughts?
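One quick way to narrow down whether it is the batch or the module that stays on the CPU (just a debug sketch added here for illustration, not part of the original code) is to print the devices at the top of training_step:

    def training_step(self, batch, batch_idx):
        x1, x2 = batch.sentence1, batch.sentence2
        # If Lightning moved the module but not the torchtext batch, these will differ:
        print("batch:", x1.device, "| model:", next(self.parameters()).device)
        ...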
@williamFalcon and @sshleifer,
The above-mentioned behavior can be reproduced in the executable example on Colab.
Thank you in advance for your help.
I had to specify device="cuda" in the train_dataloader method:
def train_dataloader(self):
    train_dataset = data.TabularDataset(
        path=configs["train_path"],
        format="json",
        fields=self.get_fields())
    return data.BucketIterator(
        dataset=train_dataset,
        device="cuda",
        batch_size=configs["train_batch_size"]
    )

def get_fields(self):
    text_field = data.Field(
        use_vocab=False,
        tokenize=self.tokenizer.encode,
        batch_first=True,
        fix_length=configs["preprocessing_max_length"],
        pad_token=self.pad_index
    )
    return {'sentence1': ('x1', text_field),
            'sentence2': ('x2', text_field)}
Is this the correct approach? From what I had seen in the documentation, this kind of manual device handling should no longer be necessary with PyTorch Lightning.
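Passing device="cuda" works because torchtext then builds the batch tensors directly on the GPU, but it ties the dataloader to a single device. A more Lightning-idiomatic sketch, assuming a PL version that exposes the transfer_batch_to_device hook, is to keep the iterator device-agnostic and tell Lightning how to move the torchtext Batch (using the sentence1/sentence2 field names from the original code):

    def transfer_batch_to_device(self, batch, device):
        # torchtext's Batch is neither a tensor nor a plain tuple, so Lightning
        # cannot move it automatically; move the fields we actually use.
        batch.sentence1 = batch.sentence1.to(device)
        batch.sentence2 = batch.sentence2.to(device)
        return batch

Separately, target = torch.arange(x1.size()[0]) in the *_step methods is created on the CPU; depending on what NPairsLoss does with it, creating it with device=x1.device may be needed to avoid a similar mismatch.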