Pytorch-lightning: Multi GPU Training: No kernel image is available for execution on the device

Created on 24 Jun 2020 · 7 Comments · Source: PyTorchLightning/pytorch-lightning

I started using PL thinking about the ease of training my model using multiple GPUs. I'm basically using some Transformers from Huggingface.

LightningModule

import hydra
import torch
from pytorch_lightning.core.lightning import LightningModule
from torchtext import data
from transformers import AutoTokenizer
from source.loss.NPairsLoss import NPairsLoss


class JointEncoder(LightningModule):
    """Encodes the code and docstring into the same space of embeddings.

    Two encoders are applied independently, one to each sentence of a pair,
    and the resulting pair of outputs is scored with ``NPairsLoss``.

    Args:
        config: Configuration object; this class reads
            ``dataset.{train,val,test}_path``,
            ``{train,val,test}.batch_size`` and
            ``preprocessing.max_length`` from it.
        a_encoder: Module applied to ``sentence1``.
        b_encoder: Module applied to ``sentence2``.
    """

    def __init__(self, config, a_encoder, b_encoder):
        super().__init__()
        self.config = config
        self.a_encoder = a_encoder
        self.b_encoder = b_encoder
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        # Id of the tokenizer's padding token; used to pad every sequence.
        self.pad_index = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        self.loss_fn = NPairsLoss()

    def forward(self, x1, x2):
        """Encode ``x1`` with ``a_encoder`` and ``x2`` with ``b_encoder``.

        Returns:
            A tuple ``(a_encoder(x1), b_encoder(x2))``.
        """
        x1 = self.a_encoder(x1)
        x2 = self.b_encoder(x2)
        return x1, x2

    def configure_optimizers(self):
        """Return the Adam (AMSGrad variant) optimizer over all parameters."""
        return torch.optim.Adam(
            self.parameters(),
            lr=1e-6,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=0,
            amsgrad=True,
        )

    def _shared_step(self, batch):
        """Encode one batch and return its loss (shared by train/val/test)."""
        x1, x2 = batch.sentence1, batch.sentence2
        predict = self(x1, x2)
        # Allocate the targets on the same device as the inputs. Without an
        # explicit ``device`` the tensor is created on the default device,
        # which mismatches the inputs when the batch lives on a GPU.
        target = torch.arange(x1.size(0), device=x1.device)
        return self.loss_fn(predict, target)

    def training_step(self, batch, batch_idx):
        """Return ``{'loss': loss}`` for one training batch."""
        return {'loss': self._shared_step(batch)}

    def test_step(self, batch, batch_idx):
        """Return ``{'test_loss': loss}`` for one test batch."""
        return {'test_loss': self._shared_step(batch)}

    def validation_step(self, batch, batch_idx):
        """Return ``{'val_loss': loss}`` for one validation batch."""
        return {'val_loss': self._shared_step(batch)}

    def _build_iterator(self, path, batch_size):
        """Load the JSON dataset at ``path`` and wrap it in a BucketIterator."""
        dataset = data.TabularDataset(
            # The configured path is resolved through hydra's helper.
            path=hydra.utils.to_absolute_path(path),
            format="json",
            fields=self.get_fields(),
        )
        return data.BucketIterator(dataset=dataset, batch_size=batch_size)

    def train_dataloader(self):
        """Return the iterator over ``config.dataset.train_path``."""
        return self._build_iterator(
            self.config.dataset.train_path,
            self.config.train.batch_size,
        )

    def test_dataloader(self):
        """Return the iterator over ``config.dataset.test_path``."""
        return self._build_iterator(
            self.config.dataset.test_path,
            self.config.test.batch_size,
        )

    def val_dataloader(self):
        """Return the iterator over ``config.dataset.val_path``."""
        return self._build_iterator(
            self.config.dataset.val_path,
            self.config.val.batch_size,
        )

    def get_fields(self):
        """Return the torchtext field mapping for the JSON datasets.

        Both ``sentence1`` and ``sentence2`` share a single field that
        tokenizes with ``self.tokenizer.encode`` (no vocabulary is built) and
        uses ``config.preprocessing.max_length`` as ``fix_length`` and
        ``self.pad_index`` as the pad token.
        """
        text_field = data.Field(
            use_vocab=False,
            tokenize=self.tokenizer.encode,
            batch_first=True,
            fix_length=self.config.preprocessing.max_length,
            pad_token=self.pad_index,
        )
        return {'sentence1': ('sentence1', text_field),
                'sentence2': ('sentence2', text_field)}

entry point:

@hydra.main(config_path="configs/config.yaml")
def dev_run(cfg):
    """Build a JointEncoder from two pretrained BERT encoders and fit it.

    Args:
        cfg: Configuration object; it is printed via ``cfg.pretty()`` and
            handed to ``JointEncoder`` as its ``config``.
    """
    # Each side of the pair gets its own, separately loaded encoder.
    a_encoder, b_encoder = (
        Encoder(encoder=BertModel.from_pretrained('bert-base-uncased'))
        for _ in range(2)
    )

    print(cfg.pretty())

    joint_encoder = JointEncoder(config=cfg, a_encoder=a_encoder, b_encoder=b_encoder)
    # Train for three epochs on GPUs 0 and 1.
    Trainer(max_epochs=3, gpus=[0, 1]).fit(joint_encoder)


# Script entry point. dev_run is called without arguments here, so its cfg
# parameter is presumably supplied by the @hydra.main decorator.
if __name__ == "__main__":
    dev_run()

nvidia-smi

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX TIT...  Off  | 00000000:02:00.0 Off |                  N/A |
| 31%   51C    P0    84W / 250W |      0MiB /  6083MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX TIT...  Off  | 00000000:03:00.0 Off |                  N/A |
| 31%   50C    P0    85W / 250W |      0MiB /  6083MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX TIT...  Off  | 00000000:83:00.0 Off |                  N/A |
| 29%   46C    P0    81W / 250W |      0MiB /  6083MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  GeForce GTX TIT...  Off  | 00000000:84:00.0 Off |                  N/A |
|  0%   50C    P0    68W / 250W |      0MiB /  6083MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

But I'm getting the following error:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 20, in _wrap
    fn(i, *args)
  File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_data_parallel.py", line 527, in ddp_train
    model = model.configure_ddp(model, device_ids)
  File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 900, in configure_ddp
    find_unused_parameters=True
  File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 285, in __init__
    self.broadcast_bucket_size)
  File "/home/celso/projects/semantic_code_search/venv/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 496, in _distributed_broadcast_coalesced
    dist._broadcast_coalesced(self.process_group, tensors, buffer_size)
RuntimeError: CUDA error: no kernel image is available for execution on the device

I consider myself a beginner in Deep Learning, so it would be of great help if someone could give me some directions on how to overcome these errors.

bug / fix help wanted

Source

Ceceu

All 7 comments

Hi! Thanks for your contribution! Great first issue!

github-actions[bot] on 24 Jun 2020

😄1

This is a PyTorch issue:
https://github.com/pytorch/pytorch/issues/31285

williamFalcon on 24 Jun 2020

👍1

Maybe it helps someone with the same problem.
Just use the PyTorch Docker image available on Dockerhub.

Ceceu on 24 Jun 2020

After resolving the above issue, I still have the following error:

Validation sanity check: 0it [00:00, ?it/s]
Traceback (most recent call last):
  ...
RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select

Is this happening because I am using a pre-trained BERT from Huggingface, which is not implemented as a LightningModule, and for that reason is not being moved to the GPU?

Note: the model runs perfectly when not using GPU.

Ceceu on 24 Jun 2020

Sounds like that model hardcoded a device, which makes it hard to run on GPUs. Maybe it’s your code, though, since I doubt HF hardcoded a device.

@sshleifer any thoughts?

williamFalcon on 24 Jun 2020

👍1

@williamFalcon and @sshleifer,

The above mentioned behavior can be reproduced in the executable example in Colab.

Thank you in advance for your help.

Ceceu on 25 Jun 2020

I had to specify device="cuda" in train_dataloader method:

def train_dataloader(self):
    """Return a BucketIterator over the JSON training set.

    The iterator is created with ``device="cuda"``.
    NOTE(review): the device is hard-coded; confirm this is appropriate when
    training on more than one GPU.
    """
    train_path = configs["train_path"]
    json_fields = self.get_fields()
    training_set = data.TabularDataset(path=train_path, format="json", fields=json_fields)

    iterator_options = {
        "dataset": training_set,
        "device": "cuda",
        "batch_size": configs["train_batch_size"],
    }
    return data.BucketIterator(**iterator_options)

def get_fields(self):
    """Return the field mapping for the JSON datasets.

    The JSON keys ``sentence1`` and ``sentence2`` are mapped to the names
    ``x1`` and ``x2`` respectively; both share one ``data.Field``.
    """
    shared_field = data.Field(
        use_vocab=False,
        tokenize=self.tokenizer.encode,
        batch_first=True,
        fix_length=configs["preprocessing_max_length"],
        pad_token=self.pad_index,
    )
    # (JSON key, target name) pairs, all bound to the same field object.
    renames = (('sentence1', 'x1'), ('sentence2', 'x2'))
    return {json_key: (target, shared_field) for json_key, target in renames}

Is this a correct approach? From what I had seen in the documentation, this type of operation would no longer be necessary with PyTorchLightning.

Ceceu on 26 Jun 2020

Was this page helpful?

0 / 5 - 0 ratings