Fastai: RuntimeError: cuda runtime error (11) : invalid argument at THCGeneral.cpp:405

Created on 5 Feb 2019 · 4Comments · Source: fastai/fastai

Description

After installing with conda and cloning the course v3 repo, the notebook runs fine until learn.fit_one_cycle(4), where it stops with the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input-16-495233eaf2b4> in <module>
----> 1 learn.fit_one_cycle(4)

~/anaconda3/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, wd, callbacks, **kwargs)
     19     callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor,
     20                                         pct_start=pct_start, **kwargs))
---> 21     learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
     22 
     23 def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, **kwargs:Any):

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    164         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    165         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 166             callbacks=self.callbacks+callbacks)
    167 
    168     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     92     except Exception as e:
     93         exception = e
---> 94         raise e
     95     finally: cb_handler.on_train_end(exception)
     96 

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     82             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     83                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     85                 if cb_handler.on_batch_end(loss): break
     86 

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     16     if not is_listy(xb): xb = [xb]
     17     if not is_listy(yb): yb = [yb]
---> 18     out = model(*xb)
     19     out = cb_handler.on_loss_begin(out)
     20 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
    318     def forward(self, input):
    319         return F.conv2d(input, self.weight, self.bias, self.stride,
--> 320                         self.padding, self.dilation, self.groups)
    321 
    322 

RuntimeError: cuda runtime error (11) : invalid argument at /opt/conda/conda-bld/pytorch_1544176307774/work/aten/src/THC/THCGeneral.cpp:405

Install

$ python -c 'import fastai.utils.collect_env; fastai.utils.collect_env.show_install(1)'

=== Software === 
python version  : 3.7.0
fastai version  : 1.0.36
torch version   : 1.0.0
nvidia driver   : 415.25
torch cuda ver  : 9.0.176
torch cuda is   : available
torch cudnn ver : 7401
torch cudnn is  : enabled

=== Hardware === 
nvidia gpus     : 1
torch available : 1
  - gpu0        : 7949MB | GeForce RTX 2080

=== Environment === 
platform        : Linux-4.15.0-45-generic-x86_64-with-debian-buster-sid
distro          : Ubuntu 18.04 Bionic Beaver
conda env       : base
python          : /home/tyoc213/anaconda3/bin/python
sys.path        : 
/home/tyoc213/anaconda3/lib/python37.zip
/home/tyoc213/anaconda3/lib/python3.7
/home/tyoc213/anaconda3/lib/python3.7/lib-dynload
/home/tyoc213/anaconda3/lib/python3.7/site-packages
/home/tyoc213/github.com/fastai
/home/tyoc213/anaconda3/lib/python3.7/site-packages/IPython/extensions

Mon Feb  4 22:39:23 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 415.25       Driver Version: 415.25       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce RTX 2080    Off  | 00000000:01:00.0  On |                  N/A |
| 32%   27C    P8     7W / 225W |   1769MiB /  7949MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1032      G   /usr/lib/xorg/Xorg                           259MiB |
|    0      1254      G   /usr/bin/gnome-shell                         160MiB |
|    0      1364      G   /opt/teamviewer/tv_bin/TeamViewer              6MiB |
|    0      3040      G   ...13/.local/share/Steam/ubuntu12_32/steam    19MiB |
|    0      3050      G   ./steamwebhelper                               6MiB |
|    0      9697      G   ...quest-channel-token=4273371138546245145   106MiB |
|    0     20636      G   /usr/lib/firefox/firefox                       6MiB |
|    0     26596      C   /home/tyoc213/anaconda3/bin/python          1191MiB |
+-----------------------------------------------------------------------------+

Screenshots
imagen

And I think the driver is OK because I can run

./gpu_burn 
Run length not specified in the command line.  Burning for 10 secs
GPU 0: GeForce RTX 2080 (UUID: GPU-33fdbebd-0ba1-a286-5f25-4e8773e06ab7)
Initialized device 0 with 7949 MB of memory (5979 MB available, using 5381 MB of it), using FLOATS
20.0%  proc'd: 668 (8726 Gflop/s)   errors: 0   temps: 30 C 
    Summary at:   lun feb  4 23:28:05 CST 2019

30.0%  proc'd: 1002 (8702 Gflop/s)   errors: 0   temps: 30 C 
    Summary at:   lun feb  4 23:28:06 CST 2019

50.0%  proc'd: 2004 (8742 Gflop/s)   errors: 0   temps: 30 C 
    Summary at:   lun feb  4 23:28:08 CST 2019

60.0%  proc'd: 2672 (8694 Gflop/s)   errors: 0   temps: 43 C 
    Summary at:   lun feb  4 23:28:09 CST 2019

80.0%  proc'd: 3674 (8702 Gflop/s)   errors: 0   temps: 43 C 
    Summary at:   lun feb  4 23:28:11 CST 2019

100.0%  proc'd: 4676 (8714 Gflop/s)   errors: 0   temps: 43 C 
    Summary at:   lun feb  4 23:28:13 CST 2019

100.0%  proc'd: 5344 (8705 Gflop/s)   errors: 0   temps: 45 C 
Killing processes.. done

Tested 1 GPUs:
    GPU 0: OK

Source

tyoc213

Most helpful comment

It's a pytorch/cuda issue, and not fastai's. I'd say try installing cuda10 pytorch version, since you are using a cutting edge NVIDIA driver:

conda install -c pytorch pytorch cuda100

It seems to work for people: https://discuss.pytorch.org/t/cuda-runtime-error-11/30080/8.

If the problem persists, see if you can run a basic, plain pytorch example, e.g. https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html (there is a notebook at the bottom of the page); if you can't, sort it out as a pytorch issue.

If you can and indeed there is a problem with fastai please reopen this ticket.

And I recommend googling any such errors first before opening an Issue. Often, you will find that it was both reported and solved.

stas00 on 5 Feb 2019

👍4 😄1

All 4 comments

It's a pytorch/cuda issue, and not fastai's. I'd say try installing cuda10 pytorch version, since you are using a cutting edge NVIDIA driver:

conda install -c pytorch pytorch cuda100

It seems to work for people: https://discuss.pytorch.org/t/cuda-runtime-error-11/30080/8.

If you can and indeed there is a problem with fastai please reopen this ticket.

And I recommend googling any such errors first before opening an Issue. Often, you will find that it was both reported and solved.

stas00 on 5 Feb 2019

👍4 😄1

Thanks @stas00, I got past that point. (Probably a stupid question, but is it possible to tell a pytorch error from a fastai one?) If I run the whole notebook I get:

RuntimeError: CUDA out of memory. Tried to allocate 4.32 GiB (GPU 0; 7.76 GiB total capacity; 1.71 GiB already allocated; 4.32 GiB free; 48.06 MiB cached)

learn.lr_find()
learn.recorder.plot()

It seems that running the lines manually works: the second time I run the cell I don't get the error. (I don't know why it fails the first time; anyway, it seems somewhat correct.)

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-30-c7a9c29f9dd1> in <module>
----> 1 learn.lr_find()
      2 learn.recorder.plot()

~/anaconda3/lib/python3.7/site-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
     29     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     30     a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 31     learn.fit(a, start_lr, callbacks=[cb], **kwargs)
     32 
     33 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
    164         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    165         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 166             callbacks=self.callbacks+callbacks)
    167 
    168     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     92     except Exception as e:
     93         exception = e
---> 94         raise e
     95     finally: cb_handler.on_train_end(exception)
     96 

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     82             for xb,yb in progress_bar(data.train_dl, parent=pbar):
     83                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     85                 if cb_handler.on_batch_end(loss): break
     86 

~/anaconda3/lib/python3.7/site-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     16     if not is_listy(xb): xb = [xb]
     17     if not is_listy(yb): yb = [yb]
---> 18     out = model(*xb)
     19     out = cb_handler.on_loss_begin(out)
     20 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/container.py in forward(self, input)
     90     def forward(self, input):
     91         for module in self._modules.values():
---> 92             input = module(input)
     93         return input
     94 

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torchvision/models/resnet.py in forward(self, x)
     74         residual = x
     75 
---> 76         out = self.conv1(x)
     77         out = self.bn1(out)
     78         out = self.relu(out)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
    318     def forward(self, input):
    319         return F.conv2d(input, self.weight, self.bias, self.stride,
--> 320                         self.padding, self.dilation, self.groups)
    321 
    322 

RuntimeError: CUDA out of memory. Tried to allocate 4.32 GiB (GPU 0; 7.76 GiB total capacity; 1.71 GiB already allocated; 4.32 GiB free; 48.06 MiB cached)

tyoc213 on 5 Feb 2019

- You're running an old fastai version.
- You're not using a dedicated card, so your memory gets wasted by other processes. Switch those to the iGPU if you have 2 GPUs: https://askubuntu.com/questions/1061551/how-to-configure-igpu-for-xserver-and-nvidia-gpu-for-cuda-work
- Or consider using cloud GPUs (free for lessons); the course site explains how. You will need 8GB RAM available for most lessons.
- Look up "CUDA out of memory" on forums.fast.ai. You basically need to reduce bs (batch size) and sometimes other parameters when you have little RAM.

As for telling a pytorch error from a fastai one, you will learn over time which is which. fastai is a library running on top of pytorch, so the line is thin.

Use forums and you will find lots of threads discussing this.

stas00 on 5 Feb 2019

👍3