The HTC paper reports a result with a SENet-154 backbone. However, when I change the backbone from ResNet-50 to SENet-154, training raises an OOM error even on a 24 GB M40. Can you share some memory usage information for training with SENet-154?
The batch size is set to 2, and the error is not raised at the beginning of the training procedure.
@jichilen Is it ok to share your SENet-154 model file here for us to try?
Changing the batch size to 1 may help.
senet.py
from __future__ import print_function, division, absolute_import
from collections import OrderedDict
import math
from ..registry import BACKBONES
import torch.nn as nn
from torch.utils import model_zoo
from mmcv.runner import load_checkpoint
import logging
"""
https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/senet.py
"""
"""
ResNet code gently borrowed from
https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
"""
# Only `SENet` is actually defined in this module. The factory helpers that
# used to be exported (`senet154`, `se_resnet50`, ...) are commented out or
# absent, and listing undefined names in `__all__` makes
# `from senet import *` raise AttributeError.
__all__ = ['SENet']
# Metadata describing the released ImageNet checkpoint for SENet-154:
# download URL plus the input preprocessing the weights were trained with.
pretrained_settings = dict(
    senet154=dict(
        imagenet=dict(
            url='http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth',
            input_space='RGB',
            input_size=[3, 224, 224],
            input_range=[0, 1],
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            num_classes=1000,
        ),
    ),
)
class SEModule(nn.Module):
    """Squeeze-and-Excitation gate.

    The input is average-pooled to 1x1, passed through two 1x1 convolutions
    (`channels -> channels // reduction -> channels`) with a ReLU in between
    and a sigmoid at the end, and the result rescales the input
    channel-wise.
    """

    def __init__(self, channels, reduction):
        super(SEModule, self).__init__()
        squeezed = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Conv2d(channels, squeezed, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(squeezed, channels, kernel_size=1, padding=0)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return `x` multiplied by its per-channel gate."""
        gate = self.avg_pool(x)
        gate = self.relu(self.fc1(gate))
        gate = self.sigmoid(self.fc2(gate))
        return x * gate
class Bottleneck(nn.Module):
    """Common `forward()` for every SE bottleneck variant.

    Subclasses are expected to define `conv1`/`bn1`, `conv2`/`bn2`,
    `conv3`/`bn3`, `relu`, `se_module` and `downsample` (which may be
    `None`).
    """

    def forward(self, x):
        # Three conv-bn stages; the last one is not followed by a ReLU
        # because the activation is applied after the residual addition.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Project the shortcut only when a downsample branch is configured.
        shortcut = x if self.downsample is None else self.downsample(x)

        # The SE gate is applied to the residual branch before the addition.
        out = self.se_module(out) + shortcut
        return self.relu(out)
class SEBottleneck(Bottleneck):
    """Bottleneck block used by SENet154.

    The 1x1 reduction outputs `planes * 2` channels and the grouped 3x3
    convolution outputs `planes * 4` channels; the stride is applied in
    the 3x3 convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, groups, reduction, stride=1,
                 downsample=None):
        super(SEBottleneck, self).__init__()
        mid_channels = planes * 2
        out_channels = planes * 4

        self.conv1 = nn.Conv2d(inplanes, mid_channels, kernel_size=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(mid_channels)

        self.conv2 = nn.Conv2d(mid_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, groups=groups,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)

        self.relu = nn.ReLU(inplace=True)
        self.se_module = SEModule(out_channels, reduction=reduction)
        self.downsample = downsample
        self.stride = stride
class SEResNetBottleneck(Bottleneck):
    """ResNet bottleneck augmented with a Squeeze-and-Excitation module.

    Following the Caffe implementation, the stride is applied in `conv1`
    rather than in `conv2` (torchvision's ResNet strides in `conv2`).
    """
    expansion = 4

    def __init__(self, inplanes, planes, groups, reduction, stride=1,
                 downsample=None):
        super(SEResNetBottleneck, self).__init__()
        out_channels = planes * 4

        # Strided 1x1 reduction (Caffe-style placement of the stride).
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False,
                               stride=stride)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1,
                               groups=groups, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, out_channels, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)

        self.relu = nn.ReLU(inplace=True)
        self.se_module = SEModule(out_channels, reduction=reduction)
        self.downsample = downsample
        self.stride = stride
class SEResNeXtBottleneck(Bottleneck):
    """ResNeXt type-C bottleneck augmented with a Squeeze-and-Excitation
    module.

    The width of the grouped 3x3 convolution is derived from `planes`,
    `base_width` and `groups`; the stride is applied in that convolution.
    """
    expansion = 4

    def __init__(self, inplanes, planes, groups, reduction, stride=1,
                 downsample=None, base_width=4):
        super(SEResNeXtBottleneck, self).__init__()
        channels_per_group = math.floor(planes * (base_width / 64))
        width = channels_per_group * groups
        out_channels = planes * 4

        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False,
                               stride=1)
        self.bn1 = nn.BatchNorm2d(width)

        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride,
                               padding=1, groups=groups, bias=False)
        self.bn2 = nn.BatchNorm2d(width)

        self.conv3 = nn.Conv2d(width, out_channels, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)

        self.relu = nn.ReLU(inplace=True)
        self.se_module = SEModule(out_channels, reduction=reduction)
        self.downsample = downsample
        self.stride = stride
# Maps the `block` name given in a config to the bottleneck class it selects.
bottleneck_dic = {
    cls.__name__: cls
    for cls in (SEBottleneck, SEResNetBottleneck, SEResNeXtBottleneck)
}
@BACKBONES.register_module
class SENet(nn.Module):
    """SENet-family backbone that returns multi-scale feature maps.

    The ImageNet classification head (average pool, dropout and
    `last_linear`) is intentionally not built: `forward()` returns the
    outputs of `layer1` ... `layer4` instead of class logits.
    """

    def __init__(self, block, layers, groups, reduction, dropout_p=0.2,
                 inplanes=128, input_3x3=True, downsample_kernel_size=3,
                 downsample_padding=1, num_classes=1000, frozen_stages=-1,
                 norm_eval=False):
        """
        Parameters
        ----------
        block (str or class): Bottleneck to use, given either as a key of
            `bottleneck_dic` (the form used in configs) or as the class
            itself.
            - For SENet154: 'SEBottleneck'
            - For SE-ResNet models: 'SEResNetBottleneck'
            - For SE-ResNeXt models: 'SEResNeXtBottleneck'
        layers (list of ints): Number of residual blocks for 4 layers of the
            network (layer1...layer4).
        groups (int): Number of groups for the 3x3 convolution in each
            bottleneck block.
            - For SENet154: 64
            - For SE-ResNet models: 1
            - For SE-ResNeXt models: 32
        reduction (int): Reduction ratio for Squeeze-and-Excitation modules.
            - For all models: 16
        dropout_p (float or None): Drop probability of the classification
            head. Accepted for config compatibility but unused, because the
            head is not built.
        inplanes (int): Number of input channels for layer1.
            - For SENet154: 128
            - For SE-ResNet models: 64
            - For SE-ResNeXt models: 64
        input_3x3 (bool): If `True`, use three 3x3 convolutions instead of
            a single 7x7 convolution in layer0.
            - For SENet154: True
            - For SE-ResNet models: False
            - For SE-ResNeXt models: False
        downsample_kernel_size (int): Kernel size for downsampling
            convolutions in layer2, layer3 and layer4.
            - For SENet154: 3
            - For SE-ResNet models: 1
            - For SE-ResNeXt models: 1
        downsample_padding (int): Padding for downsampling convolutions in
            layer2, layer3 and layer4.
            - For SENet154: 1
            - For SE-ResNet models: 0
            - For SE-ResNeXt models: 0
        num_classes (int): Number of outputs of the classification head.
            Accepted for config compatibility but unused, because the head
            is not built.
        frozen_stages (int): Stages to freeze (put in eval mode and stop
            gradients). -1 freezes nothing, 0 freezes layer0 only, and
            `k >= 1` additionally freezes layer1 ... layer`k`.
        norm_eval (bool): If `True`, every `nn.BatchNorm2d` is kept in eval
            mode while training, so its running statistics are not updated.
            The default `False` keeps the plain `nn.Module` behaviour.

        Raises
        ------
        KeyError: if `block` is a string that is not in `bottleneck_dic`.
        ValueError: if `frozen_stages` is outside [-1, 4].
        """
        super(SENet, self).__init__()
        # Configs pass the block by name; also accept the class directly.
        if isinstance(block, str):
            block = bottleneck_dic[block]
        if not -1 <= frozen_stages <= 4:
            raise ValueError(
                'frozen_stages must be in [-1, 4], but is {}'.format(
                    frozen_stages))
        self.frozen_stages = frozen_stages
        self.norm_eval = norm_eval
        self.inplanes = inplanes

        if input_3x3:
            layer0_modules = [
                ('conv1', nn.Conv2d(3, 64, 3, stride=2, padding=1,
                                    bias=False)),
                ('bn1', nn.BatchNorm2d(64)),
                ('relu1', nn.ReLU(inplace=True)),
                ('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1,
                                    bias=False)),
                ('bn2', nn.BatchNorm2d(64)),
                ('relu2', nn.ReLU(inplace=True)),
                ('conv3', nn.Conv2d(64, inplanes, 3, stride=1, padding=1,
                                    bias=False)),
                ('bn3', nn.BatchNorm2d(inplanes)),
                ('relu3', nn.ReLU(inplace=True)),
            ]
        else:
            layer0_modules = [
                ('conv1', nn.Conv2d(3, inplanes, kernel_size=7, stride=2,
                                    padding=3, bias=False)),
                ('bn1', nn.BatchNorm2d(inplanes)),
                ('relu1', nn.ReLU(inplace=True)),
            ]
        # To preserve compatibility with Caffe weights `ceil_mode=True`
        # is used instead of `padding=1`.
        layer0_modules.append(('pool', nn.MaxPool2d(3, stride=2,
                                                    ceil_mode=True)))
        self.layer0 = nn.Sequential(OrderedDict(layer0_modules))

        # layer1 keeps the resolution and always uses a 1x1 shortcut
        # projection; layer2-4 use stride 2 and the configurable projection.
        self.layer1 = self._make_layer(
            block,
            planes=64,
            blocks=layers[0],
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=1,
            downsample_padding=0
        )
        self.layer2 = self._make_layer(
            block,
            planes=128,
            blocks=layers[1],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding
        )
        self.layer3 = self._make_layer(
            block,
            planes=256,
            blocks=layers[2],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding
        )
        self.layer4 = self._make_layer(
            block,
            planes=512,
            blocks=layers[3],
            stride=2,
            groups=groups,
            reduction=reduction,
            downsample_kernel_size=downsample_kernel_size,
            downsample_padding=downsample_padding
        )

        self._freeze_stages()

    def init_weights(self, pretrained=None):
        """Load pretrained weights from a checkpoint path or URL.

        With `pretrained=None` the default PyTorch initialisation is kept.

        Raises
        ------
        TypeError: if `pretrained` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            # strict=False: the checkpoint still contains the classification
            # head (`last_linear`), which this backbone does not define.
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is not None:
            raise TypeError('pretrained must be a str or None')

    def _make_layer(self, block, planes, blocks, groups, reduction, stride=1,
                    downsample_kernel_size=1, downsample_padding=0):
        """Build one stage of `blocks` bottlenecks and update
        `self.inplanes` to the stage's output channel count."""
        out_channels = planes * block.expansion
        downsample = None
        # The shortcut needs a projection whenever the first block changes
        # the resolution or the channel count.
        if stride != 1 or self.inplanes != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=downsample_kernel_size, stride=stride,
                          padding=downsample_padding, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, groups, reduction, stride,
                       downsample)]
        self.inplanes = out_channels
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes, groups, reduction))
        return nn.Sequential(*stage)

    def _freeze_stages(self):
        """Put the frozen stages in eval mode and stop their gradients."""
        if self.frozen_stages < 0:
            return
        frozen = ['layer0']
        frozen.extend('layer{}'.format(i)
                      for i in range(1, self.frozen_stages + 1))
        for name in frozen:
            stage = getattr(self, name)
            stage.eval()
            for param in stage.parameters():
                param.requires_grad = False

    def train(self, mode=True):
        """Switch train/eval mode while honouring `frozen_stages` and
        `norm_eval`. Returns `self`, like `nn.Module.train`."""
        super(SENet, self).train(mode)
        if mode:
            # The call above put every submodule back in train mode, so the
            # frozen stages have to be switched to eval mode again.
            self._freeze_stages()
            if self.norm_eval:
                for m in self.modules():
                    # eval() only changes the behaviour of BatchNorm here
                    if isinstance(m, nn.BatchNorm2d):
                        m.eval()
        return self

    def features(self, x):
        """Return `(layer4 output, [layer1, layer2, layer3, layer4 outputs])`."""
        outputs = []
        x = self.layer0(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
            outputs.append(x)
        return x, outputs

    def forward(self, x):
        """Return the list of feature maps produced by layer1 ... layer4."""
        _, outputs = self.features(x)
        return outputs
# def initialize_pretrained_model(model, num_classes, settings):
# assert num_classes == settings['num_classes'], \
# 'num_classes should be {}, but is {}'.format(
# settings['num_classes'], num_classes)
# model.load_state_dict(model_zoo.load_url(settings['url']))
# model.input_space = settings['input_space']
# model.input_size = settings['input_size']
# model.input_range = settings['input_range']
# model.mean = settings['mean']
# model.std = settings['std']
# def senet154(num_classes=1000, pretrained='imagenet'):
# model = SENet(SEBottleneck, [3, 8, 36, 3], groups=64, reduction=16,
# dropout_p=0.2, num_classes=num_classes)
# if pretrained is not None:
# settings = pretrained_settings['senet154'][pretrained]
# initialize_pretrained_model(model, num_classes, settings)
# return model
config.py
pretrained='./senet154.pth',#http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth
backbone=dict(
type='SENet',
block='SEBottleneck', layers=[3, 8, 36, 3], groups=64, reduction=16,
dropout_p=0.2, num_classes=2
),
@Joker316701882
@jichilen Thank you.
I'll check it!
@thangvubk I don't have enough M40s. If I change the batch size to 1, will I suffer from the drawbacks of batch normalization?
Not really. But you should change the learning rate based on the number of GPUs and images per GPU. Please refer to the README for details.
Thank you, i'll try it.
@jichilen
I think you should implement the _freeze_stages() and train() functions in senet.py.
_freeze_stages():
`
def _freeze_stages(self):
    """Disable gradients for layer0 and for layer1 ... layer`frozen_stages`.

    NOTE: only `requires_grad` is changed here; the modules are not switched
    to eval mode by this function.
    """
    if self.frozen_stages < 0:
        return
    stage_names = ['layer0']
    stage_names += ['layer{}'.format(idx)
                    for idx in range(1, self.frozen_stages + 1)]
    for stage_name in stage_names:
        stage = getattr(self, stage_name)
        for weight in stage.parameters():
            weight.requires_grad = False
`
and train():
`
def train(self, mode=True):
    """Switch train/eval mode, keeping BatchNorm in eval mode if requested.

    When `mode` is true and `self.norm_eval` is set, every `nn.BatchNorm2d`
    submodule is put back in eval mode after the regular mode switch.

    Returns `self`, matching `nn.Module.train`, so that calls such as
    `model.train().cuda()` keep working.
    """
    super(SENet, self).train(mode)
    if mode and self.norm_eval:
        for m in self.modules():
            # trick: eval() only has an effect on BatchNorm here
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
    return self
`
Especially train().
If you don't implement this function, your BatchNorm statistics will be updated at run time during training. In this case, both imgs_per_gpu=2 and imgs_per_gpu=1 will hurt performance.
@Joker316701882
Oh, thank you very much.
About the function _freeze_stages(), i have another question.
In commit c899cdf1e4971ede74686315b80782ee671ac330(bug fix for freezing parameters)
def _freeze_stages(self):
    """Freeze the stem and put the first `frozen_stages` stages in eval mode.

    With `frozen_stages >= 0`, `norm1` is set to eval mode and the
    parameters of `conv1` and `norm1` stop requiring gradients. Each of
    layer1 ... layer`frozen_stages` is then switched to eval mode.
    """
    if self.frozen_stages >= 0:
        self.norm1.eval()
        for stem_part in (self.conv1, self.norm1):
            for weight in stem_part.parameters():
                weight.requires_grad = False
    for stage_idx in range(1, self.frozen_stages + 1):
        getattr(self, 'layer{}'.format(stage_idx)).eval()
Is this change for the same reason?
@jichilen
Not really. The purpose of train() is to set the backbone's BN layers to eval mode, while _freeze_stages() tells the model not to train the specific stages you specify. Given resnet.py, I think train() should be implemented in every backbone, including SENet154, because in my experiments fixing the BN statistics always gives better results than letting BN learn (8 GPUs with 2 imgs per GPU in my setting). So in SENet, we need the train() function to set BN to eval mode.
Here is the important thing:
If we set BN to eval mode, then the commit you showed us is not necessary, because norm1 is already in eval mode. The reason @hellock made this commit is the case where you don't set BN to eval mode (i.e. you want all BN layers to learn) but still want to freeze some stages: then it is necessary to explicitly set the BN layers in the frozen stages to eval mode (which was not done before this commit).
And yes, I should update my _freeze_stages() function to:
def _freeze_stages(self):
    """Freeze layer0 and layer1 ... layer`frozen_stages`.

    Each frozen stage is switched to eval mode and its parameters stop
    requiring gradients. Nothing happens when `frozen_stages` is negative.
    """
    if self.frozen_stages < 0:
        return
    stage_names = ['layer0']
    stage_names += ['layer{}'.format(idx)
                    for idx in range(1, self.frozen_stages + 1)]
    for stage_name in stage_names:
        stage = getattr(self, stage_name)
        stage.eval()
        for weight in stage.parameters():
            weight.requires_grad = False
@Joker316701882
Thank you for your patient explanation, i have a clearer understanding of BN now.
@jichilen Have you ever tried the DPN107 backbone? I modified the configs following your SENet example, but the results on the val set are all zero. Is there anything else I should change?
I copied the code from here and finished a simple SENet154 backbone; you can try mmdetection_with_SENet154.
@runzeer Would you please show the model code of DPN107 backbone for us to try and check :-)
Most helpful comment
senet.py
config.py
@Joker316701882