Hey guys,
we implemented full 3D support for mxnet (3D convolution, 3D pooling, 3D batchnorm) for the cudnn layers. The only problem is the input. We currently use a custom workaround, imglistiter, which is pretty slow. We are not that familiar with your recordIO data structure, so if you help us add 3D support at the recordIO level, we can push our layers into mxnet to enable 3D networks.
This is a long-missing feature. Thanks very much for your contribution.
We are working on float 16 cudnn layers. It should be pushed to the main repo in a few days. Could you merge it in first and submit a PR?
As to IO, It depends on your use case.
Are you sequentially storing your data and using random access for shuffling?
Or are you duplicating data at each time frame?
Also, see if this helps:
class Mov3dStack(mx.io.DataIter):
    """MXNet DataIter yielding batches of stacked video frames from an LMDB store.

    Frames live in named LMDB sub-databases ('l' = left, 'r' = right, 'flow',
    and optionally 'depth'), keyed by zero-padded 9-digit frame indices.  Each
    sample stacks `data_frames` consecutive left frames (and `flow_frames`
    flow frames) around a centre index; the label is either the right-eye
    frame ('l1_label') or, in depth mode, a flattened depth map
    ('softmax_label').

    NOTE(review): this is Python 2 code (`print` statement, integer `/`
    division throughout) -- do not run under Python 3 without porting.
    """

    def __init__(self, path, data_shape, batch_size, scale,
                 mean_file=None, test_mode=False, output_depth=False, data_frames=1, flow_frames=1,
                 source=None, upsample=1, base_shape=None, stride=1, no_left0=False, right_whiten=False):
        """Open the LMDB environment, load the sample index list and means.

        path         -- LMDB directory; `mean_file` defaults to path+'/mean.npz'.
        data_shape   -- (width, height) of the input crop: in next() the
                        clip_rect spans data_shape[0] along x and data_shape[1]
                        along y, while batch shapes use (data_shape[1],
                        data_shape[0]) as (H, W).
        scale        -- pair; only (scale[1]-scale[0])/2 is used, as a crop
                        margin in depth mode.
        source       -- optional LMDB key holding a comma-separated index
                        list; otherwise 'valid_idx' (test_mode) or 'train_idx'.
        upsample     -- >1 switches left0/label production to decoding frames
                        straight from the source .mkv movies via OpenCV.
        stride       -- frame-index step between stacked frames.
        no_left0     -- suppress the full-resolution 'left0' input.
        right_whiten -- subtract the right mean from the label batch.
        """
        self.data_shape = data_shape
        self.batch_size = batch_size
        self.scale = scale
        self.test_mode = test_mode
        self.output_depth = output_depth
        self.data_frames = data_frames
        self.flow_frames = flow_frames
        self.upsample = upsample
        self.base_shape = base_shape
        self.stride = stride
        self.no_left0 = no_left0
        self.right_whiten = right_whiten
        # Optional fixed crop origin; when None, crop_img chooses one per
        # sample (see next()).
        self.fix_p = None
        # Read-only LMDB environment; readahead disabled, presumably because
        # access is random after shuffling.
        self.env = lmdb.open(path, map_size=1<<40, max_dbs=5, readonly=True, readahead=False)
        self.ldb = self.env.open_db('l')     # left-eye frames
        self.fdb = self.env.open_db('flow')  # optical-flow frames
        if output_depth:
            self.ddb = self.env.open_db('depth')
            # Integer division under Python 2.
            self.margin = (scale[1] - scale[0])/2
        else:
            self.margin = 0
        self.rdb = self.env.open_db('r')     # right-eye frames (labels)
        self.cur = 0  # cursor into self.idx (sample position, not batch count)
        with self.env.begin() as txn:
            # The sample index list is stored as one comma-separated string.
            if source:
                self.idx = [int(i) for i in txn.get(source).split(',')]
            elif self.test_mode:
                self.idx = [int(i) for i in txn.get('valid_idx').split(',')]
            else:
                self.idx = [int(i) for i in txn.get('train_idx').split(',')]
            if self.upsample > 1:
                # Upsampled mode decodes frames directly from the movie files.
                self.caps = [cv2.VideoCapture('/archive/mov3d/mkv/submitted/%s.mkv'%p) for p in txn.get('prefix_list').split(',')]
        # Declared batch layouts; shapes are (N, C, H, W) with
        # H = data_shape[1], W = data_shape[0].
        self.provide_data = []
        if data_frames > 0:
            self.provide_data.append(('left', (batch_size, 3*data_frames, data_shape[1], data_shape[0])))
        if flow_frames > 0:
            self.provide_data.append(('flow', (batch_size, 2*flow_frames, data_shape[1], data_shape[0])))
        if not no_left0:
            self.provide_data.append(('left0', (batch_size, 3, data_shape[1]*upsample, data_shape[0]*upsample)))
        self.provide_label = [('l1_label', (batch_size, 3, data_shape[1]*upsample, data_shape[0]*upsample))]
        if self.output_depth:
            # Depth mode replaces the image label with a flattened depth map.
            self.provide_label = [('softmax_label', (batch_size, data_shape[1]*data_shape[0]))]
        # Zero means as a fallback; overwritten from the mean file below.
        self.left_mean = np.zeros((3, data_shape[1], data_shape[0]))
        self.right_mean = np.zeros((3, data_shape[1], data_shape[0]))
        self.flow_mean = np.zeros((2, data_shape[1], data_shape[0]))
        self.left_mean_nd = mx.nd.array(self.left_mean)
        self.left_mean_nd_1 = self.left_mean_nd.reshape((1,)+self.left_mean_nd.shape)
        self.right_mean_nd = mx.nd.array(self.right_mean)
        self.flow_mean_nd = mx.nd.array(self.flow_mean)
        # self.left_mean_t = self.left_mean.transpose((1,2,0))
        # self.flow_mean_t = self.flow_mean.transpose((1,2,0))
        if mean_file is None:
            mean_file = path+'/mean.npz'
        # load_mean is a project helper; it receives `self`, so presumably it
        # may also read iterator state -- confirm its contract.
        mean_dict = load_mean(mean_file, self, label_mean=True)
        self.left_mean = mean_dict['left']
        self.right_mean = mean_dict['l1_label']
        self.flow_mean = mean_dict['flow']
        self.left_mean_nd = mx.nd.array(self.left_mean)
        self.left_mean_nd_1 = self.left_mean_nd.reshape((1,)+self.left_mean_nd.shape)
        self.right_mean_nd = mx.nd.array(self.right_mean)
        self.flow_mean_nd = mx.nd.array(self.flow_mean)

    def reset(self):
        """Rewind to the start of an epoch; reshuffle indices when training."""
        logging.info("Mov3dStack.reset at %d"%self.cur)
        self.cur = 0
        if not self.test_mode:
            random.shuffle(self.idx)

    def seek(self, n_iter):
        """Position the cursor as if `n_iter` batches had been consumed (wraps)."""
        self.cur = (n_iter*self.batch_size)%len(self.idx)

    def next(self):
        """Assemble and return the next mx.io.DataBatch.

        Raises StopIteration when fewer than batch_size samples remain;
        partial batches are discarded, not padded.
        """
        # Project helper: splits a packed stereo frame into its two halves --
        # confirm against the `parse` module.
        from parse import split
        # Flat (N*frames, C, H, W) buffer; reshaped to (N, frames*C, H, W) below.
        ndleft = mx.nd.zeros((self.batch_size*self.data_frames, 3, self.data_shape[1], self.data_shape[0]))
        if self.upsample > 1:
            # HWC float buffer filled from decoded movie frames.
            left0 = np.zeros((self.batch_size, self.data_shape[1]*self.upsample, self.data_shape[0]*self.upsample, 3), dtype=np.float32)
        else:
            ndleft0 = mx.nd.zeros((self.batch_size, 3, self.data_shape[1], self.data_shape[0]))
        if self.flow_frames > 0:
            ndflow = mx.nd.zeros((self.batch_size*self.flow_frames, 2, self.data_shape[1], self.data_shape[0]))
        right = np.zeros((self.batch_size, self.data_shape[1]*self.upsample, self.data_shape[0]*self.upsample, 3), dtype=np.float32)
        if self.output_depth:
            depth = np.zeros((self.batch_size, self.data_shape[1]*self.data_shape[0]), dtype=np.float32)
        with self.env.begin() as txn:
            for i in range(self.batch_size):
                if self.cur >= len(self.idx):
                    # Ran out of samples mid-batch: back `i` up so it counts
                    # only filled slots (checked after the loop).
                    i -= 1
                    break
                idx = self.idx[self.cur]
                if self.upsample > 1:
                    # Keys appear to encode movie number and frame position as
                    # idx = movie*1000000 + f, with f further decoded via
                    # integer division below -- exact scheme inferred, confirm.
                    nidx = int(idx)
                    mov = nidx/1000000
                    nframe = nidx%1000000
                    nframe = nframe/10000*3*24*60 + nframe%10000
                    # Only seek when the capture is not already positioned
                    # there (seeking in video files is expensive).
                    if self.caps[mov].get(cv2.CAP_PROP_POS_FRAMES) != nframe:
                        print 'seek', nframe
                        self.caps[mov].set(cv2.CAP_PROP_POS_FRAMES, nframe)
                    ret, frame = self.caps[mov].read()
                    assert ret
                    # Centre-crop to 800 rows, then split into left/right
                    # halves (vert=True -- presumably top/bottom stereo).
                    margin = (frame.shape[0] - 800)/2
                    lframe, rframe = split(frame, reshape=self.base_shape, vert=True, clip=(0, margin, 960, margin+800))
                # Crop origin for this sample; crop_img fills it in when None
                # and the same p is reused for every image of the sample so
                # all crops align.
                p = self.fix_p
                if self.output_depth:
                    sd = txn.get('%09d'%idx, db=self.ddb)
                    assert sd is not None
                    _, dimg = mx.recordio.unpack_img(sd, -1)  # -1: decode unchanged
                    dimg, p = crop_img(dimg, p, self.data_shape, self.margin, test=self.test_mode)
                    depth[i] = dimg.flat
                if self.upsample > 1:
                    rimg, p = crop_img(rframe, p, (self.data_shape[0]*self.upsample, self.data_shape[1]*self.upsample), 0, test=self.test_mode, grid=self.upsample)
                    right[i] = rimg
                else:
                    sr = txn.get('%09d'%idx, db=self.rdb)
                    assert sr is not None
                    _, rimg = mx.recordio.unpack_img(sr, 1)  # 1: decode as color
                    rimg, p = crop_img(rimg, p, self.data_shape, 0, test=self.test_mode)
                    right[i] = rimg
                # Stack data_frames left frames centred on idx, stepping by
                # self.stride in key space.
                for j in range(max(1, self.data_frames)):
                    sl = txn.get('%09d'%(idx+(j-self.data_frames/2)*self.stride), db=self.ldb)
                    if sl is None:
                        # Missing neighbour frame: leave that slice zeroed.
                        pass
                    else:
                        _, s = mx.recordio.unpack(sl)
                        # Decode + crop + mean-subtract directly into the
                        # batch slot i*data_frames+j.
                        mx.nd.imdecode(s, clip_rect=(p[0], p[1], p[0] + self.data_shape[0], p[1] + self.data_shape[1]),
                                       out=ndleft, index=i*self.data_frames+j, channels=3, mean=self.left_mean_nd)
                if self.upsample > 1:
                    limg, p = crop_img(lframe, p, (self.data_shape[0]*self.upsample, self.data_shape[1]*self.upsample), 0, test=self.test_mode, grid=self.upsample)
                    left0[i] = limg
                else:
                    # left0 = centre frame of the stack with the mean added
                    # back (i.e. the un-whitened image).
                    start = i*max(1, self.data_frames)+max(1, self.data_frames)/2
                    ndleft0[i:(i+1)] = ndleft[start:(start+1)] + self.left_mean_nd_1
                # Same stacking for optical-flow frames (2 channels each).
                for j in range(self.flow_frames):
                    sf = txn.get('%09d'%(idx+(j-self.flow_frames/2)*self.stride), db=self.fdb)
                    if sf is None:
                        pass
                    else:
                        _, s = mx.recordio.unpack(sf)
                        mx.nd.imdecode(s, clip_rect=(p[0], p[1], p[0] + self.data_shape[0], p[1] + self.data_shape[1]),
                                       out=ndflow, index=i*self.flow_frames+j, channels=2, mean=self.flow_mean_nd)
                self.cur += 1
        # Fold the frame dimension into channels:
        # (N*F, C, H, W) -> (N, F*C, H, W).
        data = []
        if self.data_frames > 0:
            ndleft = ndleft.reshape((self.batch_size, self.data_frames*3, self.data_shape[1], self.data_shape[0]))
            data.append(ndleft)
        if self.flow_frames > 0:
            ndflow = ndflow.reshape((self.batch_size, self.flow_frames*2, self.data_shape[1], self.data_shape[0]))
            data.append(ndflow)
        if self.upsample > 1:
            data.append(mx.nd.array(left0.transpose((0, 3, 1, 2))))  # HWC -> CHW
        elif not self.no_left0:
            data.append(ndleft0)
        right = right.transpose((0, 3, 1, 2))  # HWC -> CHW
        if self.right_whiten:
            right -= self.right_mean
        i += 1  # number of slots actually filled
        pad = self.batch_size - i
        if pad:
            # Partial batch at epoch end: drop it and signal end of epoch.
            raise StopIteration
        if self.output_depth:
            #return mx.io.DataBatch(data, [mx.nd.array(right), mx.nd.array(depth)], pad, None)
            return mx.io.DataBatch(data, [mx.nd.array(depth)], pad, None)
        else:
            return mx.io.DataBatch(data, [mx.nd.array(right)], pad, None)
Perfect, we wait for your float16 update and merge our layers in.
We are currently working on 3D medical data (similar to the dsb2 data), not movies. Thanks for your example, this is what we were looking for. We will build an iterator based on lmdb to store and load Nd input images.
If you are working on medical data instead of videos, you may be able to do it by simply splitting 3D images into 2D images and storing them sequentially. You can then reshape the output of ImageRecordIter from (batch_size*depth, channels, height, width) to (batch_size, depth, channels, height, width).
@piiswrong @Liquidburner hi thanks for the info and the code, is there any plan to include 3d convolution and 3d deconvolution soon?
For your inspiration on the data layer .
3D convolution seems merged. 3D deconvolution would be another big plus.
@jingpengwu When I run 3d convolution, the error "Volume convolution is not implmented in mshadow" occured. Is there an example for 3d convolution in mxnet? Thanks.
@rongrongxiangxin I only checked the code, have not got a chance to really test it!
@rongrongxiangxin , the 3d convolution seems to require cudnn to be active ( see #4301 ).
Is it true that 3D CNN requires an active cudnn? Does that imply a GPU is required?
This issue is closed due to lack of activity in the last 90 days. Feel free to reopen if this is still an active issue. Thanks!
Most helpful comment
This is a long-missing feature. Thanks very much for your contribution.
We are working on float 16 cudnn layers. It should be pushed to the main repo in a few days. Could you merge it in first and submit a PR?
As to IO, It depends on your use case.
Are you sequentially storing your data and using random access for shuffling?
Or are you duplicating data at each time frame?
Also, see if this helps: