Incubator-mxnet: Asynchronous Issue on CustomOP and mxnet.image.ImageDetIter

Created on 28 Feb 2018  ·  16 comments  ·  Source: apache/incubator-mxnet

Description

When I use a CustomOp and mxnet.image.ImageDetIter at the same time, it may cause this error:

Traceback (most recent call last):
   File "/home/wkcn/proj/faster-rcnn-mx/it.py", line 95, in <module>
     model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5)    , eval_metric = MyMetric()) 
   File "/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py", line 491, in fit
     next_data_batch = next(data_iter)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input

The reason is that the custom operator creates a GPU context and enters it:
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/operator.py#L790
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/operator.py#L814

While the operator is still inside that context (context.default_ctx is GPU), ImageDetIter executes the code below:
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/detection.py#L764
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/image.py#L135

Because context.default_ctx is now the GPU (the custom operator is in the GPU context and has not exited it yet), the function imdecode uses nd.array(...) to create an NDArray in the default context, namely the GPU context. This causes the error:

   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input
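
The failure is timing-dependent because in MXNet 1.1 the default context is stored on the Context class itself rather than per thread, so a with-block entered on the engine's worker thread leaks into the iterator running on the main thread. Below is a minimal sketch of the race, assuming an MXNet <= 1.1 build with at least one GPU (illustrative only, not taken from the issue):

import threading
import time
import mxnet as mx

def worker():
    # In MXNet <= 1.1, Context._default_ctx is a class variable, so entering
    # a context on this thread silently changes the default for every thread.
    with mx.Context(mx.gpu(0)):
        time.sleep(1.0)  # stay inside the context, like a running CustomOp

t = threading.Thread(target=worker)
t.start()
time.sleep(0.5)
# On the main thread, an NDArray created without an explicit ctx now lands
# on gpu(0); the same thing happens inside image.imdecode():
print(mx.nd.array([1, 2, 3]).context)  # prints gpu(0) instead of cpu(0)
t.join()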

Environment info (Required)

----------Python Info----------
('Version      :', '2.7.12')
('Compiler     :', 'GCC 5.4.0 20160609')
('Build        :', ('default', 'Dec  4 2017 14:50:18'))
('Arch         :', ('64bit', 'ELF'))
------------Pip Info-----------
('Version      :', '8.1.1')
('Directory    :', '/usr/lib/python2.7/dist-packages/pip')
----------MXNet Info-----------
('Version      :', '1.1.0')
('Directory    :', '/usr/local/lib/python2.7/dist-packages/mxnet')
('Commit Hash   :', '07a83a0325a3d782513a04f47d711710972cb144')
----------System Info----------
('Platform     :', 'Linux-4.7.3-coreos-r3-x86_64-with-Ubuntu-16.04-xenial')
('system       :', 'Linux')
('node         :', 'phlrr3110')
('release      :', '4.7.3-coreos-r3')
('version      :', '#1 SMP Thu Feb 23 02:16:16 UTC 2017')

GPU: Tesla M40 x 4

Package used (Python/R/Scala/Julia):
Python

Error Message:

Traceback (most recent call last):
   File "/home/wkcn/proj/faster-rcnn-mx/it.py", line 95, in <module>
     model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5)    , eval_metric = MyMetric()) 
   File "/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py", line 491, in fit
     next_data_batch = next(data_iter)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input

Minimum reproducible example

import mxnet as mx
from mxnet import gluon
from mxnet import image
from mxnet import nd
import numpy as np
import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

root_url = ('https://apache-mxnet.s3-accelerate.amazonaws.com/'
            'gluon/dataset/pikachu/')
data_dir = './data/pikachu/'
dataset = {'train.rec': 'e6bcb6ffba1ac04ff8a9b1115e650af56ee969c8',
          'train.idx': 'dcf7318b2602c06428b9988470c731621716c393',
          'val.rec': 'd6c33f799b4d058e82f2cb5bd9a976f69d72d520'}
for k, v in dataset.items():
    gluon.utils.download(root_url+k, data_dir+k, sha1_hash=v)

T = 1
devs = [mx.gpu(i) for i in range(4)]
data_shape = 224 * T
batch_size = 20 * len(devs)
rgb_mean = np.array([1,2,3]) 

class_names = ['pikachu']
num_class = len(class_names)

def get_iterators(data_shape, batch_size):
    train_iter = image.ImageDetIter(
        batch_size=batch_size,
        data_shape=(3, data_shape, data_shape),
        path_imgrec=data_dir+'train.rec',
        path_imgidx=data_dir+'train.idx',
        shuffle=True,
        mean=True,
        rand_crop=1,
        min_object_covered=0.95,
        max_attempts=200)
    val_iter = image.ImageDetIter(
        batch_size=batch_size,
        data_shape=(3, data_shape, data_shape),
        path_imgrec=data_dir+'val.rec',
        shuffle=False,
        mean=True)
    return train_iter, val_iter, class_names, num_class

train_data, test_data, class_names, num_class = get_iterators(
    data_shape, batch_size)


class MyCustom(mx.operator.CustomOp):
    def __init__(self):
        super(MyCustom, self).__init__()
    def forward(self, is_train, req, in_data, out_data, aux):
        self.assign(out_data[0], req[0], 0)
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        self.assign(in_grad[0], req[0], 0)
        self.assign(in_grad[1], req[1], 0)

@mx.operator.register("MyCustom")
class MyCustomProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(MyCustomProp, self).__init__(need_top_grad = False)
    def list_arguments(self):
        return ["data", "label"]
    def list_outputs(self):
        return ["loss"]
    def infer_shape(self, in_shape):
        return [in_shape[0], in_shape[1]], [(1, )], []
    def infer_type(self, in_type):
        dtype = in_type[0]
        return [dtype, dtype], [dtype], []
    def create_operator(self, ctx, shapes, dtypes):
        return MyCustom()

class MyMetric(mx.metric.EvalMetric):
    def __init__(self):
        super(MyMetric, self).__init__("MyMetric")
        self.name = ['empty']
    def update(self, labels, preds):
        pass
    def get(self):
        return self.name, [0]

x = mx.sym.Variable("data")
label = mx.sym.Variable("label")
x = mx.sym.FullyConnected(data = x, num_hidden = 100)
label = mx.sym.Reshape(data = label, shape = (0, -1))
sym = mx.sym.Custom(data = x, label = label, op_type = "MyCustom")

model = mx.module.Module(context = devs, symbol = sym, data_names = ('data',), label_names = ('label',))

print ("start")
model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5), eval_metric = MyMetric()) 
'''
with mx.gpu(0):
    while 1:
        e = train_data.next()
        print ("batch")
'''

Steps to reproduce


  1. Run the code above.

What have you tried to solve it?

  1. Changing the code at https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/image.py#L135 to add the parameter ctx = context.cpu() solves the problem.

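A hedged sketch of that one-line change (simplified from mxnet/image/image.py; the real imdecode accepts extra decoding flags, and _cvimdecode is the internal decode operator):

import numpy as np
from mxnet import context
from mxnet import ndarray as nd
from mxnet.ndarray import _internal

def imdecode(buf, *args, **kwargs):
    """Decode an image from a raw byte buffer (simplified)."""
    if not isinstance(buf, nd.NDArray):
        # Pin the byte buffer to the CPU explicitly, so a GPU default
        # context leaked by a CustomOp cannot reach the CPU-only decoder.
        buf = nd.array(np.frombuffer(buf, dtype=np.uint8),
                       dtype=np.uint8, ctx=context.cpu())
    return _internal._cvimdecode(buf, *args, **kwargs)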

Bug

Most helpful comment

@anirudh2290
Hi!
The issue is fixed.
Thank you!

All 16 comments

We need to use thread-local structures to store global state like the context.
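
A hedged sketch of that thread-local pattern, roughly what PR #10833 (referenced later in this thread) does in mxnet/context.py; simplified, not the actual implementation:

import threading

class Context(object):
    # One default context per thread, instead of a single class-level
    # variable that every thread reads and writes.
    _default_ctx = threading.local()

    def __init__(self, device_type, device_id=0):
        self.device_type = device_type
        self.device_id = device_id

    def __enter__(self):
        if not hasattr(Context._default_ctx, 'value'):
            Context._default_ctx.value = Context('cpu', 0)
        self._old_ctx = Context._default_ctx.value
        Context._default_ctx.value = self
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        Context._default_ctx.value = self._old_ctx

def current_context():
    # Threads that never entered a context fall back to cpu(0).
    if not hasattr(Context._default_ctx, 'value'):
        Context._default_ctx.value = Context('cpu', 0)
    return Context._default_ctx.value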

@wkcn thanks for sharing your workaround. I added ctx = context.cpu(); however, a new problem appears:
/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py:466: UserWarning: Optimizer created manually outside Module but rescale_grad is not normalized to 1.0/batch_size/num_workers (0.25 vs. 0.0078125). Is this intended?
optimizer_params=optimizer_params)
call reset()
terminate called after throwing an instance of 'dmlc::Error'
what(): [14:37:42] src/engine/./threaded_engine.h:359: [14:37:42] src/ndarray/ndarray_function.cu:43: Check failed: to->type_flag_ == from.type_flag_ (0 vs. 3) Source and target must have the same data type when copying across devices.

Did you solve it?

@IrvingShu
It seems that the data types of the source and target ndarrays are different.
Could you upload a minimum reproducible example?

@piiswrong can you tell me how to add thread-local structures to store global state like the context? I added some custom layers such as center loss, focal loss, and ring loss, and they all hit this problem. Thank you.

@IrvingShu I am currently working on fixing this. Please see: https://github.com/anirudh2290/mxnet/blob/fix_global_ctx_ex/python/mxnet/context.py to get more context on what @piiswrong means.

@anirudh2290 thank you. Have you finished fixing it? I tried it, and it still failed.

@IrvingShu The workaround I provided should work with the minimum reproducible steps in the issue. There was an issue with the current_context() function, which has also been fixed. The workaround only covers uses of the Context class, nothing else. Please provide a minimum reproducible script if it doesn't work.

@wkcn @IrvingShu the PR is merged. Can you check with the latest master whether your issue is fixed?

@anirudh2290
Hi!
The issue is fixed.
Thank you!

@wkcn Hi, how did you solve "Check failed: to->type_flag_ == from.type_flag_ (0 vs. 3) Source and target must have the same data type when copying across devices"? At first I got the same error as this issue; then, after changing the code accordingly, the error above appeared instead. My error message is:
Traceback (most recent call last):
  File "train_softmax_fast.py", line 892, in <module>
    main()
  File "train_softmax_fast.py", line 889, in main
    train_net(args)
  File "train_softmax_fast.py", line 883, in train_net
    epoch_end_callback = epoch_cb )
  File "/usr/local/lib/python2.7/dist-packages/mxnet/module/multitaskmodulev3.py", line 269, in fit
    self.update_metric(gender_eval_metric, data_batch.label)
  File "/usr/local/lib/python2.7/dist-packages/mxnet/module/module.py", line 768, in update_metric
    self._exec_group.update_metric(eval_metric, labels)
  File "/usr/local/lib/python2.7/dist-packages/mxnet/module/executor_group.py", line 621, in update_metric
    eval_metric.update_dict(labels_, preds)
  File "/usr/local/lib/python2.7/dist-packages/mxnet/metric.py", line 304, in update_dict
    metric.update_dict(labels, preds)
  File "/usr/local/lib/python2.7/dist-packages/mxnet/metric.py", line 132, in update_dict
    self.update(label, pred)
  File "train_softmax_fast.py", line 203, in update
    print("pred_label before if is {}\n".format(mx.ndarray.array(pred_label)))
  File "/usr/local/lib/python2.7/dist-packages/mxnet/ndarray/ndarray.py", line 189, in __repr__
    return '\n%s\n<%s %s @%s>' % (str(self.asnumpy()),
  File "/usr/local/lib/python2.7/dist-packages/mxnet/ndarray/ndarray.py", line 1876, in asnumpy
    ctypes.c_size_t(data.size)))
  File "/usr/local/lib/python2.7/dist-packages/mxnet/base.py", line 149, in check_call
    raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [14:31:06] src/ndarray/ndarray_function.cu:43: Check failed: to->type_flag_ == from.type_flag_ (0 vs. 3) Source and target must have the same data type when copying across devices.

Stack trace returned 10 entries:
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x30756a) [0x7f8c83ab656a]
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x307b91) [0x7f8c83ab6b91]
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x401cd87) [0x7f8c877cbd87]
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x2628a16) [0x7f8c85dd7a16]
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x263c41a) [0x7f8c85deb41a]
[bt] (5) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x263c54b) [0x7f8c85deb54b]
[bt] (6) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x2472944) [0x7f8c85c21944]
[bt] (7) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x2478e13) [0x7f8c85c27e13]
[bt] (8) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x2479066) [0x7f8c85c28066]
[bt] (9) /usr/local/lib/python2.7/dist-packages/mxnet/libmxnet.so(+0x2473054) [0x7f8c85c22054]
How did you solve it? I have three custom AccMetrics in total. For the first two I used mx.nd.array(pred_label, ctx=mx.current_context()) as the fix; oddly, each of them only works with the fix added in a different place. The third one I cannot fix at all.

@MinZhaoLin The problem you hit seems different from this issue.
The pred_label passed into the metric is GPU data, while the label is CPU data.
You can try pred_label.as_in_context(label.context) to put pred and label in the same context.
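
For example, inside a metric's update method (a minimal sketch; the surrounding metric class is assumed):

import mxnet as mx

def update(self, labels, preds):
    for label, pred_label in zip(labels, preds):
        # Move the GPU prediction onto the label's device first, so later
        # argmax()/asnumpy() calls see pred and label in the same context.
        pred_label = pred_label.as_in_context(label.context)
        if pred_label.shape != label.shape:
            pred_label = mx.ndarray.argmax(pred_label, axis=self.axis)
        pred_label = pred_label.asnumpy().astype('int32').flatten()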

@wkcn
At first, after defining my own softmax layer, I got this issue's error: src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input.
Then, after making the change from this issue (modifying image.py), this error appeared inside my AccMetric: src/ndarray/ndarray_function.cu:43: Check failed: to->type_flag_ == from.type_flag_ (0 vs. 3) Source and target must have the same data type when copying across devices.
It occurs specifically at pred_label = pred_label.asnumpy().astype('int32').flatten(). When I try to print pred_label just before that line (it works in some places and errors in others), it sometimes runs normally, but every time pred_label calls asnumpy() it raises this error.

class GenderAccMetric(mx.metric.EvalMetric):
  def __init__(self):
    self.axis = 1
    super(GenderAccMetric, self).__init__(
        'acc', axis=self.axis,
        output_names=None, label_names=None)
    self.losses = []
    self.count = 0

  def update(self, _labels, preds):
    self.count+=1
    #print("in gender AccMetric\n")
    #print("label is {}\n".format(_labels[2])) 
    #print("preds is {}\n".format(preds[3]))  当输出label和preds时,有时会正常运行
    labels = [_labels[2]]
    _preds = [preds[3]] #use softmax output
    for label, pred_label in zip(labels, _preds):
        #print("pred_label before if is {}\n".format(pred_label)) 在这里输出也会报错
        pred_label = pred_label.as_in_context(label.context) 加上了这句话还是不行
        if pred_label.shape != label.shape:
            #pred_label = mx.ndarray.array(_pred_label, ctx=mx.current_context())其中一个AccMetric在这里进行转换就解决了问题,但剩下的两个不行
            pred_label = mx.ndarray.argmax(pred_label, axis=self.axis)
        pred_label = pred_label.asnumpy().astype('int32').flatten()
        label = label.asnumpy()
        if label.ndim==2:
          label = label[:,0]
        label = label.astype('int32').flatten()
        assert label.shape==pred_label.shape
        self.sum_metric += (pred_label.flat == label.flat).sum()
        self.num_inst += len(pred_label.flat)

So, all in all, I am very puzzled and stuck. I made the change you suggested and it still doesn't work. It feels like the error is raised whenever pred calls asnumpy().

@wkcn I am using mxnet-cu80. This is the softmax layer I defined. The key point is that when the label is all zeros it back-propagates a zero gradient; I want multi-task training with mxnet's module, with each task trained separately, because each sample only carries the label of a single task.

import mxnet as mx
import numpy as np

class Softmax(mx.operator.CustomOp):
  def forward(self, is_train, req, in_data, out_data, aux):
      x = in_data[0]
      y = mx.nd.exp(x - x.max(axis=1).reshape((x.shape[0], 1)))
      y[:] = mx.nd.divide(y, y.sum(axis=1).reshape(x.shape[0], 1))
      self.assign(out_data[0], req[0], mx.nd.array(y))

  def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
      l = in_data[1].astype('int32')
      y = out_data[0]
      if l.sum().asnumpy() == 0:
        self.assign(in_grad[0], req[0], mx.nd.zeros_like(y))
      else:
        y[np.arange(l.shape[0]), l] -= 1.0
        y = y / 160
        self.assign(in_grad[0], req[0], mx.nd.array(y))

@mx.operator.register("softmax")
class SoftmaxProp(mx.operator.CustomOpProp):
  def __init__(self):
      super(SoftmaxProp, self).__init__(need_top_grad=False)

  def list_arguments(self):
      return ['data', 'label']

  def list_outputs(self):
      return ['output']

  def infer_shape(self, in_shape):
      data_shape = in_shape[0]
      label_shape = (in_shape[0][0],)
      output_shape = in_shape[0]
      return [data_shape, label_shape], [output_shape], []

  def infer_type(self, in_type):
      return in_type, [in_type[0]], []

  def create_operator(self, ctx, shapes, dtypes):
      return Softmax()

@wkcn The problem is solved! Following "Change class variables to thread local variables" #10833 (anirudh2290's PR above), I changed all the files that needed changing, and it works now.

@MinZhaoLin Huh? Is the MXNet version you are using one that hasn't merged PR #10833?
The Softmax layer you defined should also be replaceable with the statements below:

valid = (label == 0).sum() > 0
output = valid * mx.sym.SoftmaxOutput(pred, label, ...)

@wkcn The problem is solved! Following "Change class variables to thread local variables" #10833 (anirudh2290's PR above), I changed all the files that needed changing, and it works now.

Hello, I made the same change, but when I use multiple GPUs the problem you described still appears. Did you run into that?
