🐛 Bug
Validation step on a custom dataset fails because num_classes is wrongly initialized here:
https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/evaluation/mean_ap.py#L342
To Reproduce
- Follow steps in Custom Dataset preparation
- Distributed training on a single GPU (Link to model config gist)
./tools/dist_train.sh configs/my_config_faster_rcnn_r101_fpn_1x.py 1 --validate
Error message (during the validate step after the first epoch):
Traceback (most recent call last):
File "./tools/train.py", line 90, in <module>
main()
File "./tools/train.py", line 86, in main
logger=logger)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/apis/train.py", line 58, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/apis/train.py", line 99, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmcv/runner/runner.py", line 356, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmcv/runner/runner.py", line 272, in train
self.call_hook('after_train_epoch')
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmcv/runner/runner.py", line 229, in call_hook
getattr(hook, fn_name)(self)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/core/evaluation/eval_hooks.py", line 65, in after_train_epoch
self.evaluate(runner, results)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/core/evaluation/eval_hooks.py", line 110, in evaluate
print_summary=True)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/core/evaluation/mean_ap.py", line 327, in eval_map
print_map_summary(mean_ap, eval_results, dataset)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/mmdet-0.6.0+11e9c74-py3.6.egg/mmdet/core/evaluation/mean_ap.py", line 370, in print_map_summary
label_names[j], num_gts[i, j], results[j]['num_dets'],
IndexError: tuple index out of range
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/ubuntu/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/torch/distributed/launch.py", line 235, in <module>
main()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/torch/distributed/launch.py", line 231, in main
cmd=process.args)
Expected Behavior
Printing the AP table:
| class | gts | dets | recall | precision | ap |
+----------+-----+------+--------+-----------+-------+
| class_1 | 110 | 781 | 0.091 | 0.013 | 0.002 |
| class_2 | 125 | 891 | 0.208 | 0.029 | 0.009 |
| class_3 | 98 | 1446 | 0.316 | 0.021 | 0.008 |
| class_4 | 0 | 0 | 0.000 | 0.000 | 0.000 |
| class_5 | 118 | 1578 | 0.339 | 0.025 | 0.020 |
| class_6 | 0 | 0 | 0.000 | 0.000 | 0.000 |
+----------+-----+------+--------+-----------+-------+
| mAP | | | | | 0.019 |
+----------+-----+------+--------+-----------+-------+
Moving line 342 to after label_names is initialized resolved the issue:
num_classes = len(label_names)
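In other words, the reordering looks roughly like this (an illustrative sketch, not the verbatim print_map_summary body in mean_ap.py; default_num_classes is a name assumed here for the pre-existing fallback):

import mmcv
from mmdet.core.evaluation.class_names import get_classes

def resolve_label_names(dataset, default_num_classes):
    # 'dataset' may be None, a known dataset name such as 'voc07', or
    # (for CustomDataset) a tuple of class names passed by the eval hook.
    if dataset is None:
        label_names = [str(i) for i in range(default_num_classes)]
    elif mmcv.is_str(dataset):
        label_names = get_classes(dataset)
    else:
        label_names = dataset
    # The workaround: size everything off the resolved names instead of
    # a COCO-sized default.
    num_classes = len(label_names)
    return label_names, num_classes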
Thanks for reporting! Usually L342 should work well; we will look into this bug.
BTW, L342 cannot simply be moved to after label_names is initialized, as that may cause issues at L357.
I am having exactly the same issue.
Traceback (most recent call last):
File "./tools/train.py", line 90, in
main()
File "./tools/train.py", line 86, in main
logger=logger)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 58, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 99, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 356, in run
epoch_runner(data_loaders[i], **kwargs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 272, in train
self.call_hook('after_train_epoch')
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 229, in call_hook
getattr(hook, fn_name)(self)
File "/datadrive/mmdetection/mmdet/core/evaluation/eval_hooks.py", line 65, in after_train_epoch
self.evaluate(runner, results)
File "/datadrive/mmdetection/mmdet/core/evaluation/eval_hooks.py", line 110, in evaluate
print_summary=True)
File "/datadrive/mmdetection/mmdet/core/evaluation/mean_ap.py", line 327, in eval_map
print_map_summary(mean_ap, eval_results, dataset)
File "/datadrive/mmdetection/mmdet/core/evaluation/mean_ap.py", line 370, in print_map_summary
label_names[j], num_gts[i, j], results[j]['num_dets'],
IndexError: list index out of range
Traceback (most recent call last):
File "/data/anaconda/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 235, in
main()
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 231, in main
cmd=process.args)
Thanks for looking into the fix.
Thanks for sharing the trace, @fboylu. Are you using CustomDataset as well?
I am trying to submit a PR for this and have a question for @hellock: could you share the use case for dataset being None here? If we are using a custom dataset, it is usually of type tuple.
Yes, I am using a custom dataset. Thanks.
@domarps where is your PR for this fix? I would like to take a look, thank you.
same issue using CustomDataset:
Traceback (most recent call last):
File "./tools/train.py", line 94, in
main()
File "./tools/train.py", line 90, in main
logger=logger)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/apis/train.py", line 59, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/apis/train.py", line 171, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/home/lzhang/anaconda3/envs/FCOS/lib/python3.6/site-packages/mmcv-0.2.7-py3.6.egg/mmcv/runner/runner.py", line 356, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/lzhang/anaconda3/envs/FCOS/lib/python3.6/site-packages/mmcv-0.2.7-py3.6.egg/mmcv/runner/runner.py", line 272, in train
self.call_hook('after_train_epoch')
File "/home/lzhang/anaconda3/envs/FCOS/lib/python3.6/site-packages/mmcv-0.2.7-py3.6.egg/mmcv/runner/runner.py", line 229, in call_hook
getattr(hook, fn_name)(self)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/core/evaluation/eval_hooks.py", line 65, in after_train_epoch
self.evaluate(runner, results)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/core/evaluation/eval_hooks.py", line 110, in evaluate
print_summary=True)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/core/evaluation/mean_ap.py", line 327, in eval_map
print_map_summary(mean_ap, eval_results, dataset)
File "/home/lzhang/PycharmProjects/mmdetection/mmdet/core/evaluation/mean_ap.py", line 378, in print_map_summary
label_names[j], num_gts[i, j], results[j]['num_dets'],
IndexError: tuple index out of range
@hellock @domarps
It seems that the bug can be solved. If your CustomDataset is COCO style, you should modify this line to:
if issubclass(dataset_type, datasets.yourCustomDataset):
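For context, the eval-hook registration in mmdet/apis/train.py looks roughly like this (an approximation, not verbatim upstream code), which is why a COCO-style CustomDataset otherwise falls through to the plain eval_map path:

from mmdet import datasets
from mmdet.core import CocoDistEvalmAPHook, DistEvalmAPHook

def register_eval_hook(runner, val_dataset_cfg):
    # The eval hook is chosen from the class of the validation dataset;
    # widening the issubclass check (as suggested above) routes a
    # COCO-style custom dataset through the COCO evaluation hook instead.
    dataset_type = getattr(datasets, val_dataset_cfg.type)
    if issubclass(dataset_type, datasets.CocoDataset):
        runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg))
    else:
        runner.register_hook(DistEvalmAPHook(val_dataset_cfg))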
@domarps @hellock @fboylu
Unfortunately, this does not seem to be a trivial fix. The cause of the IndexError: tuple index out of range is that num_classes has been initialized to the default of 80 thing classes from MS-COCO.
One quick way to verify this is to do the following:
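A sanity check along these lines (a sketch assuming the 0.6-era get_dataset API and the config path from the top of the thread; adjust to your setup):

from mmcv import Config
from mmdet.datasets import get_dataset

# Compare the classes the validation dataset reports against the number
# of classes configured in the box head.
cfg = Config.fromfile('configs/my_config_faster_rcnn_r101_fpn_1x.py')
val_dataset = get_dataset(cfg.data.val)
print(len(val_dataset.CLASSES))          # foreground classes in your data
print(cfg.model.bbox_head.num_classes)   # should be len(CLASSES) + 1 here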
Check that num_classes in bbox_head is set to the right number for your CustomDataset before eval_map runs. @leochangzliao - I tried it but I ran into some other issue. Did you notice any changes with respect to the above observations after this fix?
No, my code runs smoothly after modifying the code mentioned above.
@domarps
I have only 1 class, and it seems I left num_classes in bbox_head at the Pascal VOC default (21). But when I change it to 1, I get the error below, so maybe I am hitting some other issue with having just 1 class. I am not able to train without --validate when setting num_classes=1 in bbox_head. My training results are very bad anyway with num_classes=21; could that be the reason? @hellock can you comment? @domarps any ideas?
2019-05-20 15:57:51,876 - INFO - Start running, host: fboylu@fboylulinuxgpu, work_dir: /datadrive/mmdetection/work_dirs/cust_faster_rcnn_r50_fpn_1x_voc0712
2019-05-20 15:57:51,876 - INFO - workflow: [('train', 1)], max: 4 epochs
/opt/conda/conda-bld/pytorch_1556653215914/work/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensorcur_target >= 0 && cur_target < n_classes failed.
/opt/conda/conda-bld/pytorch_1556653215914/work/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensorcur_target >= 0 && cur_target < n_classes failed.
Traceback (most recent call last):
File "./tools/train.py", line 90, in
main()
File "./tools/train.py", line 86, in main
logger=logger)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 58, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 99, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 356, in run
epoch_runner(data_loaders[i], **kwargs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 262, in train
self.model, data_batch, train_mode=True, **kwargs)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 38, in batch_processor
losses = model(**data)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 50, in forward
return self.module(*inputs[0], **kwargs[0])
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/datadrive/mmdetection/mmdet/models/detectors/base.py", line 84, in forward
return self.forward_train(img, img_meta, **kwargs)
File "/datadrive/mmdetection/mmdet/models/detectors/two_stage.py", line 152, in forward_train
*bbox_targets)
File "/datadrive/mmdetection/mmdet/models/bbox_heads/bbox_head.py", line 102, in loss
4)[pos_inds, labels[pos_inds]]
RuntimeError: copy_if failed to synchronize: device-side assert triggered
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: device-side assert triggered (insert_events at /opt/conda/conda-bld/pytorch_1556653215914/work/c10/cuda/CUDACachingAllocator.cpp:564)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x45 (0x7faf23bd1dc5 in /data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #2: c10::TensorImpl::release_resources() + 0x50 (0x7faf23bc1640 in /data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/lib/libc10.so)
[frames #1 and #3-#19 omitted: symbol names lost in formatting]
frame #20: PyDict_SetItem + 0x4d2 (0x5640b8a81792 in /data/anaconda/envs/open-mmlab/bin/python)
frame #21: PyDict_SetItemString + 0x4f (0x5640b8a8223f in /data/anaconda/envs/open-mmlab/bin/python)
frame #22: PyImport_Cleanup + 0x9e (0x5640b8ab927e in /data/anaconda/envs/open-mmlab/bin/python)
frame #23: Py_FinalizeEx + 0x67 (0x5640b8b2c8a7 in /data/anaconda/envs/open-mmlab/bin/python)
frame #25: _Py_UnixMain + 0x3c (0x5640b8b44f7c in /data/anaconda/envs/open-mmlab/bin/python)
frame #26: __libc_start_main + 0xf0 (0x7faf5f9d3830 in /lib/x86_64-linux-gnu/libc.so.6)
[frames #24 and #27 omitted: symbol names lost in formatting]
/opt/conda/conda-bld/pytorch_1556653215914/work/aten/src/THCUNN/ClassNLLCriterion.cu:56: void ClassNLLCriterion_updateOutput_no_reduce_kernel(int, THCDeviceTensorcur_target >= 0 && cur_target < n_classes failed.
[the assertion line above is repeated once per failing CUDA thread]
Traceback (most recent call last):
File "./tools/train.py", line 90, in
main()
File "./tools/train.py", line 86, in main
logger=logger)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 58, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 99, in _dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 356, in run
epoch_runner(data_loaders[i], **kwargs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/runner.py", line 262, in train
self.model, data_batch, train_mode=True, **kwargs)
File "/datadrive/mmdetection/mmdet/apis/train.py", line 38, in batch_processor
losses = model(**data)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 50, in forward
return self.module(*inputs[0], **kwargs[0])
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/datadrive/mmdetection/mmdet/models/detectors/base.py", line 84, in forward
return self.forward_train(img, img_meta, **kwargs)
File "/datadrive/mmdetection/mmdet/models/detectors/two_stage.py", line 152, in forward_train
*bbox_targets)
File "/datadrive/mmdetection/mmdet/models/bbox_heads/bbox_head.py", line 102, in loss
4)[pos_inds, labels[pos_inds]]
RuntimeError: copy_if failed to synchronize: device-side assert triggered
terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: device-side assert triggered (insert_events at /opt/conda/conda-bld/pytorch_1556653215914/work/c10/cuda/CUDACachingAllocator.cpp:564)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x45 (0x7fcf0d12cdc5 in /data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #2: c10::TensorImpl::release_resources() + 0x50 (0x7fcf0d11c640 in /data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/lib/libc10.so)
[frames #1 and #3-#19 omitted: symbol names lost in formatting]
frame #20: PyDict_SetItem + 0x4d2 (0x563624165792 in /data/anaconda/envs/open-mmlab/bin/python)
frame #21: PyDict_SetItemString + 0x4f (0x56362416623f in /data/anaconda/envs/open-mmlab/bin/python)
frame #22: PyImport_Cleanup + 0x9e (0x56362419d27e in /data/anaconda/envs/open-mmlab/bin/python)
frame #23: Py_FinalizeEx + 0x67 (0x5636242108a7 in /data/anaconda/envs/open-mmlab/bin/python)
frame #25: _Py_UnixMain + 0x3c (0x563624228f7c in /data/anaconda/envs/open-mmlab/bin/python)
frame #26: __libc_start_main + 0xf0 (0x7fcf48f2e830 in /lib/x86_64-linux-gnu/libc.so.6)
[frames #24 and #27 omitted: symbol names lost in formatting]
Traceback (most recent call last):
File "/data/anaconda/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 235, in
main()
File "/data/anaconda/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 231, in main
cmd=process.args)
subprocess.CalledProcessError: Command '['/data/anaconda/envs/open-mmlab/bin/python', '-u', './tools/train.py', '--local_rank=0', 'configs/pascal_voc/my_faster_rcnn_r50_fpn_1x_voc0712.py', '--launcher', 'pytorch']' died with
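The ClassNLLCriterion assertion (cur_target >= 0 && cur_target < n_classes) means a ground-truth label index was out of range for the head's configured number of classes. Because the assert fires asynchronously, the Python traceback can point at an unrelated line; a standard PyTorch debugging step (general advice, not specific to this thread) is to force synchronous kernel launches:

import os
# Must be set before CUDA is initialized (i.e., before importing torch,
# or on the command line at process launch). Kernels then run
# synchronously, so the traceback points at the op that actually failed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'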
I think this is related to https://github.com/open-mmlab/mmdetection/issues/344, so I am using 1+1 (for background) and training seems to go fine now; --validate works as well. Just not getting good results.
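For reference, under the 1.x-era convention the relevant field looks like this (a minimal sketch; the surrounding Faster R-CNN config is elided):

# num_classes here counts the background slot, so K foreground classes
# need num_classes = K + 1.
model = dict(
    bbox_head=dict(
        num_classes=2))  # 1 foreground class + 1 background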
Thanks, I solved my issue using your approach.
I had the same error while trying to train Faster R-CNN. Initially there was no model configuration in the config file, but after adding the following, the error was fixed:
model = dict(
roi_head=dict(
bbox_head=dict(
num_classes=len(classes))))
It's important to override num_classes from where we are calling the base config...
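For example, a minimal sketch of such a derived config (the base config path and class names are placeholders):

# mmdetection 2.x convention: num_classes counts only foreground classes.
_base_ = './faster_rcnn_r50_fpn_1x_coco.py'  # placeholder base config

classes = ('class_1', 'class_2', 'class_3')  # your dataset's classes

data = dict(
    train=dict(classes=classes),
    val=dict(classes=classes),
    test=dict(classes=classes))

model = dict(
    roi_head=dict(
        bbox_head=dict(
            num_classes=len(classes))))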