A runtime error occurs when calling loss.backward() on a loss computed with nn.CrossEntropyLoss().
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. (view at ../aten/src/ATen/native/TensorShape.cpp:1329)
Steps to reproduce the behavior:
class SimpleNet(nn.Module):
    """Small classifier: offset conv -> deformable conv -> linear layer.

    A plain ``nn.Conv2d`` produces the offset map that is passed, together
    with the input, to ``DeformConv2d``; the result is flattened and fed to
    a single fully connected layer.

    Args:
        in_channels: Channels of the input tensor.
        num_classes: Output size of the final linear layer.
        kernel_size: Kernel size of the deformable convolution; also sets
            the number of offset channels (``2 * kernel_size ** 2``).
        stride: Stride shared by both convolutions.
        dilation: Dilation shared by both convolutions; also used as the
            padding of the offset convolution.
        groups: ``groups`` argument of the deformable convolution.
        offset_groups: Multiplier applied to the offset channel count.

    Note:
        The deformable convolution always has 16 output channels and
        ``padding=1``, and the linear layer expects ``16 * 32 * 32``
        features; these values are hard-coded.
        # NOTE(review): presumably written for 32x32 inputs with
        # kernel_size=3 -- confirm against the caller.
    """

    def __init__(self, in_channels, num_classes, kernel_size=1, stride=1,
                 dilation=1, groups=1, offset_groups=1):
        super().__init__()

        # Two offset values per kernel position.
        offset_channels = 2 * kernel_size ** 2

        # The offset branch uses a fixed 3x3 kernel, independent of
        # ``kernel_size``.
        self.conv2d_offset = nn.Conv2d(
            in_channels=in_channels,
            out_channels=offset_channels * offset_groups,
            kernel_size=3,
            stride=stride,
            padding=dilation,
            dilation=dilation,
        )

        # Output channels (16) and padding (1) are fixed here.
        self.conv2d = DeformConv2d(
            in_channels=in_channels,
            out_channels=16,
            kernel_size=kernel_size,
            stride=stride,
            padding=1,
            dilation=dilation,
            groups=groups,
            bias=False,
        )

        self.fc = nn.Linear(in_features=16 * 32 * 32, out_features=num_classes)

    def forward(self, x):
        """Return the linear-layer output for input ``x``.

        Prints the shape of the flattened feature tensor as a side effect.
        """
        features = self.conv2d(x, self.conv2d_offset(x))
        flat = features.view(-1, 16 * 32 * 32)
        print("X:", flat.shape)
        return self.fc(flat)
# Training loop used to reproduce the report: forward pass, loss, backward
# pass and optimizer step for every batch, logging the loss every 10 batches.
for idx, (data, label) in enumerate(train_loader):
    out = model(data.to(device=device))
    print("OUT:", out.shape)
    print("LABEL:", label.shape)

    optimizer.zero_grad()
    loss = criterion(out, label.to(device=device))

    # Keep a detached copy in the history list: appending ``loss`` itself
    # would keep every iteration's autograd graph reachable through the list.
    losses.append(loss.detach())

    loss.backward()  #### <-- ERROR HERE ####
    optimizer.step()

    if idx % 10 == 0:
        print(f"Loss: {loss}")
#### ERROR STACK ####
Traceback (most recent call last):
File "deformable.py", line 72, in <module>
loss.backward()
File "/opt/conda/lib/python3.6/site-packages/torch/tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.6/site-packages/torch/autograd/__init__.py", line 99, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. (view at ../aten/src/ATen/native/TensorShape.cpp:1329)
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x6c (0x7f18a962d36c in /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: at::native::view(at::Tensor const&, c10::ArrayRef<long>) + 0x31b (0x7f18ff38939b in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x31f415b (0x7f18aca4015b in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0x32591b7 (0x7f18acaa51b7 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x2afc996 (0x7f19012bf996 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0xe5d0c7 (0x7f18ff6200c7 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #6: at::Tensor::view(c10::ArrayRef<long>) const + 0xff (0x7f18728859df in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #7: <unknown function> + 0xdd08e (0x7f18728a808e in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #8: DeformConv2d_backward_cuda(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, std::pair<int, int>, std::pair<int, int>, std::pair<int, int>, int, int) + 0x1fc (0x7f18728a9075 in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #9: DeformConv2d_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, std::pair<int, int> const&, std::pair<int, int> const&, std::pair<int, int> const&, int, int) + 0x1e2 (0x7f1872822422 in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #10: DeformConv2dFunction::backward(torch::autograd::AutogradContext*, std::vector<at::Tensor, std::allocator<at::Tensor> >) + 0x4e9 (0x7f1872840de9 in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #11: torch::autograd::CppNode<DeformConv2dFunction>::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x13c (0x7f187284441c in /opt/conda/lib/python3.6/site-packages/torchvision/_C.so)
frame #12: <unknown function> + 0x2bcbb1c (0x7f190138eb1c in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x169b (0x7f190138bb8b in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x579 (0x7f190138ca89 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::Engine::thread_init(int) + 0x49 (0x7f1901384569 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::python::PythonEngine::thread_init(int) + 0x48 (0x7f1904631608 in /opt/conda/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #17: <unknown function> + 0xc819d (0x7f19072e719d in /opt/conda/bin/../lib/libstdc++.so.6)
frame #18: <unknown function> + 0x76db (0x7f193ccc86db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #19: clone + 0x3f (0x7f193c9f188f in /lib/x86_64-linux-gnu/libc.so.6)
Expected behavior: loss.backward() completes with no runtime error.
Please copy and paste the output from our
environment collection script
(or fill out the checklist below manually).
You can get the script and run it with:
wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py
PyTorch version: 1.5.0a0+8f84ded
Is debug build: No
CUDA used to build PyTorch: 10.2
OS: Ubuntu 18.04.4 LTS
GCC version: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
CMake version: version 3.14.0
Python version: 3.6
Is CUDA available: Yes
CUDA runtime version: 10.2.89
GPU models and configuration: GPU 0: GeForce GTX 1050 Ti with Max-Q Design
Nvidia driver version: 440.100
cuDNN version: /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.5
Versions of relevant libraries:
[pip] msgpack-numpy==0.4.3.2
[pip] numpy==1.18.1
[pip] pytorch-transformers==1.1.0
[pip] torch==1.5.0a0+8f84ded
[pip] torchtext==0.4.0
[pip] torchvision==0.6.0a0
[conda] magma-cuda101 2.5.1 1 local
[conda] mkl 2019.1 144
[conda] mkl-include 2019.1 144
[conda] msgpack-numpy 0.4.3.2 py36_0
[conda] nomkl 3.0 0
[conda] numpy 1.18.1 py36h94c655d_0
[conda] numpy-base 1.18.1 py36h2f8d375_1
[conda] pytorch-transformers 1.1.0 pypi_0 pypi
[conda] torch 1.5.0a0+8f84ded pypi_0 pypi
[conda] torchtext 0.4.0 pypi_0 pypi
[conda] torchvision 0.6.0a0 pypi_0 pypi
EDIT:
# Data pipeline for the example: CIFAR10 training split, images converted to
# tensors only, served in shuffled batches of BATCH_SIZE.
transform = transforms.Compose([
    transforms.ToTensor(),
])
train_dataset = datasets.cifar.CIFAR10(
    root="./",
    train=True,
    transform=transform,
    download=True,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
The simple CIFAR10 dataset setup above is what the example uses :)
Hi,
Thanks for the report, can you provide a smaller and self-contained example that reproduces the issue? It will make it much easier to spot and fix the issue.
Sorry, I accidentally hard-coded the out_channels value as 16 in the snippet above. My apologies for the mistake.
Closing issue.
@charlie4284 do you mean that this was a user error? Still, I think it would be good to understand it a bit better so that the error message can be improved.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import ops
class BasicDeformConv2d(nn.Module):
    """Conv -> pool -> deformable conv -> pool -> linear, for reproduction.

    Pipeline: ``conv1`` (5x5, 6 channels) + ReLU + 2x2 max-pool, then an
    offset convolution feeding ``ops.DeformConv2d`` (4x4, 16 channels) +
    ReLU + 2x2 max-pool, then a linear layer producing 10 outputs.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Accepted but not read anywhere in this class.
        kernel_size: Accepted but not read anywhere in this class; the
            offset and deformable convolutions both use a fixed 4x4 kernel.
        stride: Stride of the offset and deformable convolutions.
        dilation: Dilation of the offset and deformable convolutions; also
            used as their padding.
        groups: ``groups`` argument of the deformable convolution.
        offset_groups: Multiplier applied to the offset channel count.

    Note:
        The linear layer expects ``16 * 6 * 6`` features.
        # NOTE(review): that matches 32x32 inputs with the default stride
        # and dilation -- confirm for other settings.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 dilation=1, groups=1, offset_groups=1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, 6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # The offset tensor needs a channel count that is a multiple of
        # 2 * weight.size(2) * weight.size(3) of the deformable conv,
        # i.e. 2 * kH * kW for its 4x4 kernel.
        offset_channels = 2 * 4 * 4

        self.conv2d_offset = nn.Conv2d(
            in_channels=6,
            out_channels=offset_channels * offset_groups,
            kernel_size=4,
            stride=stride,
            padding=dilation,
            dilation=dilation,
        )
        self.conv2d = ops.DeformConv2d(
            in_channels=6,
            out_channels=16,
            kernel_size=4,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            groups=groups,
            bias=False,
        )

        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc = nn.Linear(in_features=16 * 6 * 6, out_features=10)

    def forward(self, x):
        """Return the 10-way linear-layer output for input ``x``."""
        stem = self.pool(F.relu(self.conv1(x)))
        offset = self.conv2d_offset(stem)
        deformed = self.pool2(F.relu(self.conv2d(stem, offset)))
        # Reporter's note: x.view(-1, 16*6*6) also doesn't work here.
        flat = deformed.reshape(-1, 16 * 6 * 6)
        return self.fc(flat)
# Self-contained reproduction: a single optimisation step on random data.
model = BasicDeformConv2d(3, 1, dilation=1, kernel_size=3)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()

# One random batch of 32 RGB 32x32 images.
test_in = torch.rand((32, 3, 32, 32))
# Draw integer class labels in [0, 10) to match the model's 10 logits.
# torch.rand(32).to(torch.long) would truncate every value in [0, 1) to 0,
# giving an all-zero label vector.
test_out = torch.randint(0, 10, (32,))

out = model(test_in)
optimizer.zero_grad()
loss = criterion(out, test_out)
loss.backward()
optimizer.step()
I was able to replicate the issue again today, so it seems it wasn't a user error after all. Sorry, I was quite tired after work yesterday and wasn't able to provide detailed information!
Would this code snippet be of any help?
@charlie4284 I've tried your snippet on my laptop (CPU) with a very recent torchvision (from master) and didn't manage to reproduce the error.
Can you try updating torchvision and see if you still face the error?
Works like magic! Thank you!
Some background info:
I was using the official Pytorch image for nvidia-docker from NVIDIA NGC (nvcr.io/nvidia/pytorch:20.07-py3) which contained torchvision version 0.6.0a0(pip).
Inside the container, I upgraded the torchvision dependency with `pip install torchvision --upgrade`.
The current working torchvision version is as follows:
torchvision==0.7.0
Works with both CPU and CUDA.