When I evaluate `(b[:, 2] - b[:, 0])` in my loss function, where `b` is
`tensor([[336., 189., 345., 231.]], dtype=torch.float16)`,
I get this error:
*** RuntimeError: "add" not implemented for 'torch.HalfTensor'
It turns out that after calling apex.amp.initialize, any (torch.float16 +/- torch.float16) operation raises this error.
How can I fix it? I am confused.
This is my training code:
# Build the detector; the number of classes is taken from the training dataset.
retinanet = network.shufflenet(num_classes=dataset_train.num_classes(), pretrained=True)
# Hard-coded switch: the model is moved to the GPU when this is True.
use_gpu = True
if use_gpu:
retinanet = retinanet.cuda()
retinanet.training = True
# Adam with a learning rate of 5e-5 scaled by the batch size and a further 0.1 factor.
optimizer = optim.Adam(retinanet.parameters(), lr=5e-5 * args.batch_size * 1e-1)
# Reduces the learning rate when the metric passed to scheduler.step() plateaus (patience of 3).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
# Bounded history: only the 500 most recent entries are kept.
loss_hist = collections.deque(maxlen=500)
# Six empty lists; what each one collects is not visible in this snippet.
loss_to_be_see = [[],[],[],[],[],[]]
# NOTE(review): per the Apex documentation, opt_level "O3" runs the model in pure FP16,
# so tensors produced by the model will be torch.float16 -- confirm this is intended.
retinanet, optimizer = apex.amp.initialize(retinanet, optimizer, opt_level="O3")
# Multi-GPU wrapping, currently disabled.
#retinanet = torch.nn.DataParallel(retinanet, device_ids=[0,1,2]).cuda()
for epoch_num in range(args.epochs):
retinanet.train()
#retinanet.module.freeze_bn()
epoch_loss = []
for iter_num, data in enumerate(dataloader_train):
try:
optimizer.zero_grad()
classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']])
classification_loss = classification_loss.mean()
The error is raised in my loss function when I calculate the IoU:
def calc_iou(a, b):
    """Compute the pairwise intersection-over-union of two sets of boxes.

    Columns 0-3 of each input are read as ``(x1, y1, x2, y2)`` corner
    coordinates; any further columns are ignored.

    Args:
        a: 2-D tensor with at least four columns, one box per row (N rows).
        b: 2-D tensor with at least four columns, one box per row (M rows).
            It is moved to ``a``'s device before any arithmetic, so it may
            be passed straight from a CPU data loader.

    Returns:
        Tensor of shape ``(N, M)`` whose ``[i, j]`` entry is the IoU of
        ``a[i]`` and ``b[j]``. When both inputs are ``torch.float16`` the
        result is ``torch.float16``; otherwise the dtype follows normal
        type promotion.
    """
    # Mixed-device arithmetic raises; follow the device of `a`.
    b = b.to(a.device)

    # float16 tops out at 65504, so box areas overflow to inf, and the 1e-8
    # clamp floor below rounds to 0. Do the arithmetic in float32 and only
    # cast the final ratio (which lies in [0, 1]) back to half precision.
    both_half = a.dtype == torch.float16 and b.dtype == torch.float16
    if a.dtype == torch.float16:
        a = a.float()
    if b.dtype == torch.float16:
        b = b.float()

    area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # Broadcast (N, 1) against (M,) to get every pairwise overlap extent.
    iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0])
    ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1])

    # Negative extents mean the boxes do not overlap on that axis.
    iw = torch.clamp(iw, min=0)
    ih = torch.clamp(ih, min=0)
    intersection = iw * ih

    union = torch.unsqueeze(area_a, dim=1) + area_b - intersection
    # Guard against division by zero for degenerate (zero-area) boxes.
    union = torch.clamp(union, min=1e-8)

    iou = intersection / union
    return iou.half() if both_half else iou
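OK, I got it.
The error message names 'torch.HalfTensor', which is the CPU tensor type, so the float16 tensor was still on the CPU.
I should move every tensor to the GPU with tensor.cuda() first.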
Most helpful comment
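OK, I got it.
The error message names 'torch.HalfTensor', which is the CPU tensor type, so the float16 tensor was still on the CPU.
I should move every tensor to the GPU with tensor.cuda() first.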