When the VOC dataset was used, training ran fine. But the loss becomes nan when I use my own dataset.
I used to train my data with the v1.x mmdet, and it ran normally.
Now I use the latest mmdet, and the loss becomes nan.
_There is loss log._
2020-06-14 10:16:55,236 - mmdet - INFO - Epoch [1][50/4675] lr: 0.00049, eta: 15:41:30, time: 1.008, data_time: 0.051, memory: 7237, loss_rpn_cls: 0.4373, loss_rpn_bbox: 0.0584, s0.loss_cls: 1.6474, s0.acc: 82.6758, s0.loss_bbox: 0.0144, s1.loss_cls: 0.8500, s1.acc: 78.6055, s1.loss_bbox: 0.0047, s2.loss_cls: 0.4462, s2.acc: 74.5586, s2.loss_bbox: 0.0006, loss: 3.4589
2020-06-14 10:17:45,189 - mmdet - INFO - Epoch [1][100/4675] lr: 0.00099, eta: 15:36:33, time: 0.999, data_time: 0.005, memory: 7237, loss_rpn_cls: 0.2068, loss_rpn_bbox: 0.0503, s0.loss_cls: 0.5536, s0.acc: 98.2969, s0.loss_bbox: 0.0259, s1.loss_cls: 0.2599, s1.acc: 98.8789, s1.loss_bbox: 0.0066, s2.loss_cls: 0.1278, s2.acc: 99.0723, s2.loss_bbox: 0.0012, loss: 1.2322
2020-06-14 10:18:31,520 - mmdet - INFO - Epoch [1][150/4675] lr: 0.00149, eta: 15:11:50, time: 0.927, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 55.3535, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 55.5742, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 55.6484, s2.loss_bbox: nan, loss: nan
2020-06-14 10:19:12,284 - mmdet - INFO - Epoch [1][200/4675] lr: 0.00199, eta: 14:33:09, time: 0.815, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 1.3333, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 1.3333, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 1.3333, s2.loss_bbox: nan, loss: nan
2020-06-14 10:19:52,919 - mmdet - INFO - Epoch [1][250/4675] lr: 0.00249, eta: 14:09:11, time: 0.813, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 0.6000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 0.6000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 0.6000, s2.loss_bbox: nan, loss: nan
2020-06-14 10:20:34,484 - mmdet - INFO - Epoch [1][300/4675] lr: 0.00299, eta: 13:55:52, time: 0.831, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 4.1667, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 4.1667, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 4.1667, s2.loss_bbox: nan, loss: nan
2020-06-14 10:21:15,492 - mmdet - INFO - Epoch [1][350/4675] lr: 0.00349, eta: 13:44:41, time: 0.820, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 1.3333, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 1.3333, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 1.3333, s2.loss_bbox: nan, loss: nan
2020-06-14 10:21:56,436 - mmdet - INFO - Epoch [1][400/4675] lr: 0.00399, eta: 13:35:59, time: 0.819, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 5.0000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 5.0000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 5.0000, s2.loss_bbox: nan, loss: nan
2020-06-14 10:22:37,299 - mmdet - INFO - Epoch [1][450/4675] lr: 0.00449, eta: 13:28:53, time: 0.817, data_time: 0.005, memory: 7237, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 0.0000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 0.0000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 0.0000, s2.loss_bbox: nan, loss: nan
_There is config_
sys.platform: linux
Python: 3.7.7 (default, May 7 2020, 21:25:33) [GCC 7.3.0]
CUDA available: True
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 10.1, V10.1.105
GPU 0,1: Tesla P100-PCIE-12GB
GCC: gcc (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609
PyTorch: 1.5.0
PyTorch compiling details: PyTorch built with:
TorchVision: 0.6.0a0+82fd1c8
OpenCV: 4.2.0
MMCV: 0.5.9
MMDetection: 2.0.0+unknown
MMDetection Compiler: GCC 5.4
2020-06-14 10:15:57,474 - mmdet - INFO - Distributed training: False
2020-06-14 10:15:57,855 - mmdet - INFO - Config:
model = dict(
type='CascadeRCNN',
pretrained='open-mmlab://resnext101_32x4d',
backbone=dict(
type='ResNeXt',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
groups=32,
base_width=4),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),
roi_head=dict(
type='CascadeRoIHead',
num_stages=3,
stage_loss_weights=[1, 0.5, 0.25],
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[0.05, 0.05, 0.1, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0.0, 0.0, 0.0, 0.0],
target_stds=[0.033, 0.033, 0.067, 0.067]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
]))
train_cfg = dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.7,
min_pos_iou=0.7,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)
])
test_cfg = dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100))
dataset_type = 'WeldDataset'
data_root = '/7T_DISK/zhr/data/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=False),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1000, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=False),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type='WeldDataset',
ann_file=
'/7T_DISK/zhr/data/weldDataSet_v9.0/ImageSets/Main/trainval.txt',
img_prefix='/7T_DISK/zhr/data/weldDataSet_v9.0/',
pipeline=[
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=False),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]),
val=dict(
type='WeldDataset',
ann_file='/7T_DISK/zhr/data/weldDataSet_v9.0/ImageSets/Main/test.txt',
img_prefix='/7T_DISK/zhr/data/weldDataSet_v9.0/',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1000, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=False),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]),
test=dict(
type='WeldDataset',
ann_file='/7T_DISK/zhr/data/weldDataSet_v9.0/ImageSets/Main/test.txt',
img_prefix='/7T_DISK/zhr/data/weldDataSet_v9.0/',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1000, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=False),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]))
evaluation = dict(interval=1, metric='mAP')
optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[8, 11])
total_epochs = 12
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
work_dir = 'checkpoints/cascade_rcnn_x101_32x4d_1x_weld_v10.0_20200613'
gpu_ids = [0]`
Hi @zuhaoran ,
If you are converting models and datasets from the v1.0 version, you might need to check the box conversions here. Some old conversions (e.g., the bbox transform) in v1.0 could cause the loss to become NaN in v2.0.
Thanks @ZwwWayne ,
I use the model from v2.10 version. What text I need to change?
Here is an annotation:
<annotation>
<folder>weld2020</folder>
<filename>0001.png</filename>
<size>
<width>880</width>
<height>457</height>
<depth>3</depth>
</size>
<segmented>1</segmented>
<object>
<name>circular</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>432</xmin>
<ymin>348</ymin>
<xmax>532</xmax>
<ymax>440</ymax>
</bndbox>
</object>
</annotation>
Here is my dataset file, mmdet/datasets/weld.py. It was adapted from voc.py.
`from mmdet.core import eval_map, eval_recalls
from .builder import DATASETS
from .xml_style import XMLDataset
@DATASETS.register_module()
class WeldDataset(XMLDataset):
    """Pascal-VOC-style dataset for weld defect detection.

    Annotations are VOC-format XML files; evaluation supports the
    'mAP' and 'recall' metrics, mirroring the stock VOC dataset class.
    """

    CLASSES = ('crack', 'lpenetration', 'lfusion', 'circular', 'stripy')
    # CLASSES_ad = ('cr', 'lp', 'lf', 'ci', 'st')

    def __init__(self, **kwargs):
        super(WeldDataset, self).__init__(**kwargs)

    def evaluate(self,
                 results,
                 metric='mAP',
                 logger=None,
                 proposal_nums=(100, 300, 1000),
                 iou_thr=0.5,
                 scale_ranges=None):
        """Evaluate detection results.

        Args:
            results (list): Per-image detection results.
            metric (str | list[str]): 'mAP' or 'recall'. A list must
                contain exactly one entry.
            logger: Logger passed through to the eval functions.
            proposal_nums (tuple[int]): Proposal numbers used for recall.
            iou_thr (float | list[float]): IoU threshold(s).
            scale_ranges: Unused; kept for API compatibility.

        Returns:
            dict: Metric name mapped to its value.
        """
        if not isinstance(metric, str):
            assert len(metric) == 1
            metric = metric[0]
        if metric not in ('mAP', 'recall'):
            raise KeyError('metric {} is not supported'.format(metric))

        annotations = [self.get_ann_info(i) for i in range(len(self))]
        eval_results = {}
        if metric == 'mAP':
            # eval_map expects a single float threshold.
            assert isinstance(iou_thr, float)
            mean_ap, _ = eval_map(
                results,
                annotations,
                scale_ranges=None,
                iou_thr=iou_thr,
                dataset=self.CLASSES,
                logger=logger)
            eval_results['mAP'] = mean_ap
        elif metric == 'recall':
            gt_bboxes = [ann['bboxes'] for ann in annotations]
            if isinstance(iou_thr, float):
                iou_thr = [iou_thr]
            recalls = eval_recalls(
                gt_bboxes, results, proposal_nums, iou_thr, logger=logger)
            for i, num in enumerate(proposal_nums):
                for j, iou in enumerate(iou_thr):
                    eval_results['recall@{}@{}'.format(num, iou)] = recalls[i, j]
            if recalls.shape[1] > 1:
                ar = recalls.mean(axis=1)
                for i, num in enumerate(proposal_nums):
                    eval_results['AR@{}'.format(num)] = ar[i]
        return eval_results
`
It is my data set config, which changed from voc0712.py
`# dataset settings
# Dataset settings for the weld defect dataset (VOC-style XML annotations),
# adapted from configs/_base_/datasets/voc0712.py.
dataset_type = 'WeldDataset'
data_root = '/7T_DISK/zhr/data/'
# to_rgb=False keeps images in the loader's BGR channel order.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=False)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0),
    # BUGFIX: keyword-unpack the normalization config with ** (not *).
    # `*img_norm_cfg` passes the dict's KEYS positionally and raises a
    # TypeError when the config is loaded.
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1000, 600),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            # BUGFIX: same ** fix as in train_pipeline.
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'weldDataSet_v9.0/ImageSets/Main/trainval.txt',
        img_prefix=data_root + 'weldDataSet_v9.0/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'weldDataSet_v9.0/ImageSets/Main/test.txt',
        img_prefix=data_root + 'weldDataSet_v9.0/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'weldDataSet_v9.0/ImageSets/Main/test.txt',
        img_prefix=data_root + 'weldDataSet_v9.0/',
        pipeline=test_pipeline))
evaluation = dict(interval=1, metric='mAP')
`
Hi @zuhaoran ,
For now I have not observed a bug in your code. You may also try adding gradient clipping to your model. In MMDetection v1.x gradient clipping was a default option, but we do not use it by default for speed. Also, check the learning rate: 0.02 is for batch size 16 on 8 GPUs.
Thanks @ZwwWayne
Can you give me an example to set gradient clip?
@ZwwWayne
I have changed the schedule_1x.py
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
The nan still appears.
2020-06-14 17:57:03,135 - mmdet - INFO - workflow: [('train', 1)], max: 12 epochs
2020-06-14 17:58:07,697 - mmdet - INFO - Epoch [1][50/9350] lr: 0.00100, eta: 1 day, 16:13:15, time: 1.291, data_time: 0.046, memory: 4140, loss_rpn_cls: 0.2407, loss_rpn_bbox: 0.0520, s0.loss_cls: 0.6504, s0.acc: 95.0664, s0.loss_bbox: 0.0124, s1.loss_cls: 0.3260, s1.acc: 94.9141, s1.loss_bbox: 0.0039, s2.loss_cls: 0.1794, s2.acc: 92.7070, s2.loss_bbox: 0.0004, loss: 1.4654
2020-06-14 17:59:09,184 - mmdet - INFO - Epoch [1][100/9350] lr: 0.00116, eta: 1 day, 15:14:51, time: 1.230, data_time: 0.003, memory: 4140, loss_rpn_cls: 0.2045, loss_rpn_bbox: 0.0578, s0.loss_cls: 0.2052, s0.acc: 98.5273, s0.loss_bbox: 0.0244, s1.loss_cls: 0.0908, s1.acc: 99.0664, s1.loss_bbox: 0.0056, s2.loss_cls: 0.0437, s2.acc: 99.2383, s2.loss_bbox: 0.0007, loss: 0.6327
2020-06-14 18:00:11,717 - mmdet - INFO - Epoch [1][150/9350] lr: 0.00133, eta: 1 day, 15:07:45, time: 1.251, data_time: 0.003, memory: 4140, loss_rpn_cls: 0.2378, loss_rpn_bbox: 0.0525, s0.loss_cls: 0.1292, s0.acc: 98.3008, s0.loss_bbox: 0.0219, s1.loss_cls: 0.0557, s1.acc: 98.7539, s1.loss_bbox: 0.0069, s2.loss_cls: 0.0261, s2.acc: 98.9453, s2.loss_bbox: 0.0012, loss: 0.5311
2020-06-14 18:01:13,258 - mmdet - INFO - Epoch [1][200/9350] lr: 0.00150, eta: 1 day, 14:54:20, time: 1.231, data_time: 0.003, memory: 4140, loss_rpn_cls: 0.1725, loss_rpn_bbox: 0.0373, s0.loss_cls: 0.1727, s0.acc: 98.5625, s0.loss_bbox: 0.0226, s1.loss_cls: 0.0734, s1.acc: 99.0664, s1.loss_bbox: 0.0065, s2.loss_cls: 0.0346, s2.acc: 99.2383, s2.loss_bbox: 0.0013, loss: 0.5209
2020-06-14 18:02:08,820 - mmdet - INFO - Epoch [1][250/9350] lr: 0.00166, eta: 1 day, 14:01:18, time: 1.111, data_time: 0.003, memory: 4140, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 43.2812, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 43.4922, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 43.5391, s2.loss_bbox: nan, loss: nan
2020-06-14 18:02:59,441 - mmdet - INFO - Epoch [1][300/9350] lr: 0.00183, eta: 1 day, 12:54:56, time: 1.012, data_time: 0.003, memory: 4141, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 2.0000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 2.0000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 2.0000, s2.loss_bbox: nan, loss: nan
2020-06-14 18:03:49,653 - mmdet - INFO - Epoch [1][350/9350] lr: 0.00200, eta: 1 day, 12:05:08, time: 1.004, data_time: 0.003, memory: 4141, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 4.0000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 4.0000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 4.0000, s2.loss_bbox: nan, loss: nan
2020-06-14 18:04:41,088 - mmdet - INFO - Epoch [1][400/9350] lr: 0.00216, eta: 1 day, 11:33:14, time: 1.029, data_time: 0.003, memory: 4141, loss_rpn_cls: nan, loss_rpn_bbox: nan, s0.loss_cls: nan, s0.acc: 2.0000, s0.loss_bbox: nan, s1.loss_cls: nan, s1.acc: 2.0000, s1.loss_bbox: nan, s2.loss_cls: nan, s2.acc: 2.0000, s2.loss_bbox: nan, loss: nan
From my experience, if intersection of gt_bbox and image has zero size, the loss will be NaN in 2.0 version.
If the gradient clip does not work, you might need to check whether the data is clean, e.g., do not have out-of-image bbox.
@ZwwWayne @thisisi3 thank you for your suggestions.
Hi,@ZwwWayne I met same problem, when using cascade-rcnn.Last week,with same dataset,same configs , I trained successfully with v2.0. After upgrading mmdetection to v2.1,loss will become nan after some epochs.I have tried diffrent lr and check my dataset,loss will still become nan.
Hi @cdzhu ,
Similar suggestions as given above. 1. You may also need to add gradient clip in your model. 2. If the gradient clip does not work, you might need to check whether the data is clean, e.g., do not have out-of-image box. 3. If you implement you own dataset or augmentation, check the code whether the box has zero area.
A demo for checking whether the annotations have out-of-image boxes:
import xml.etree.ElementTree as ET
import os
import cv2
import mmcv
from PIL import Image
import numpy as np
# Directory containing the VOC-style XML annotation files to check.
xml_root = "./data"
# Directory the (possibly fixed) annotation files are written back to.
new_xml_root = "./data"
# Directory containing the corresponding images.
image_root = "./data"
# Sorted for a deterministic processing order; assumes xml_root contains
# only XML files — TODO confirm.
xml_name_list = sorted(os.listdir(xml_root))
def print_all_classes():
    """Print each annotation file name, then the list of every object
    class name found across all annotation files."""
    collected = []
    for xml_name in xml_name_list:
        print(f"{xml_name}")
        tree = ET.parse(os.path.join(xml_root, xml_name))
        for obj in tree.getroot().findall("object"):
            collected.append(obj.find("name").text)
    print(collected)
def check_hw():
    """Check that each annotation's <size> width/height matches the image.

    A mismatch usually means width and height were transposed when the
    XML was written; such boxes can fall outside the image and are a
    known cause of NaN losses. Prints and collects offending file names.
    """
    transposed_names = []
    for xml_name in xml_name_list:
        xml_path = os.path.join(xml_root, xml_name)
        root = ET.parse(xml_path).getroot()
        size = root.find("size")
        width = int(size.find("width").text)
        height = int(size.find("height").text)
        image_path = os.path.join(image_root, xml_name[:-4] + ".jpg")
        img = cv2.imread(image_path, flags=cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread returns None (it does not raise) on a missing or
            # unreadable file; skip instead of crashing on img.shape.
            print(f"{image_path} could not be read, skipped.")
            continue
        h, w, _ = img.shape
        if height != h or width != w:
            print(width, w, height, h)
            # BUGFIX: message typo "tranposed" -> "transposed".
            print(f"{xml_name}'s h, w is transposed.")
            transposed_names.append(xml_name)
    print(transposed_names)
def check_bbox():
    """Clamp or drop invalid bounding boxes in the annotation files.

    - Removes <object> entries whose box has zero or negative area
      (xmin >= xmax or ymin >= ymax) — a known cause of NaN losses.
    - Clamps boxes extending past the image border back inside it.

    The (possibly modified) XML tree of every file is rewritten to
    ``new_xml_root``.
    """
    os.makedirs(new_xml_root, exist_ok=True)
    for xml_name in xml_name_list:
        xml_path = os.path.join(xml_root, xml_name)
        tree = ET.parse(xml_path)
        root = tree.getroot()
        # Read the image once per file (the original re-read the same image
        # for every <object>), and skip files whose image cannot be loaded
        # (cv2.imread returns None instead of raising).
        image_path = os.path.join(image_root, xml_name[:-4] + ".jpg")
        img = cv2.imread(image_path, flags=cv2.IMREAD_COLOR)
        if img is None:
            print(f"{image_path} could not be read, skipped.")
            continue
        h, w, _ = img.shape
        # findall() returns a list, so removing children while looping is safe.
        for obj in root.findall("object"):
            bnd_box = obj.find("bndbox")
            bbox = [
                int(float(bnd_box.find("xmin").text)),
                int(float(bnd_box.find("ymin").text)),
                int(float(bnd_box.find("xmax").text)),
                int(float(bnd_box.find("ymax").text)),
            ]
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                # Degenerate box: drop the whole object.
                print("bbox[0] >= bbox[2] or bbox[1] >= bbox[3]", bbox, xml_name)
                root.remove(obj)
            elif bbox[3] > h or bbox[2] > w:
                # Out-of-image box: clamp to the image border.
                bnd_box.find("xmax").text = str(min(w, bbox[2]))
                bnd_box.find("ymax").text = str(min(h, bbox[3]))
                print("bbox[3] > h or bbox[2] > w", bbox, h, w, xml_name)
        tree.write(os.path.join(new_xml_root, xml_name))


if __name__ == "__main__":
    # Guard the entry point so importing this module does not trigger the scan.
    check_bbox()
Hi, I had the same problem. Would u please tell me how u solve it?
@tyj1996 you can check you dataset
if bbox.xmin == 0: bbox.xmin = 1
if bbox.ymin == 0: bbox.ymin = 1
if bbox.xmax == width: bbox.xmax = width - 1
if bbox.ymax == height: bbox.ymax = height - 1
@tyj1996 And if xmin > xmax or ymin > ymax,
the (xmin, ymin) values should be swapped with (xmax, ymax).
Most helpful comment
Hi,@ZwwWayne I met same problem, when using cascade-rcnn.Last week,with same dataset,same configs , I trained successfully with v2.0. After upgrading mmdetection to v2.1,loss will become nan after some epochs.I have tried diffrent lr and check my dataset,loss will still become nan.