Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Asynchronous Issue on CustomOP and mxnet.image.ImageDetIter #9920

@wkcn

Description

@wkcn

Description

When I use a CustomOp and mxnet.image.ImageDetIter simultaneously, the following error may occur:

Traceback (most recent call last):
   File "/home/wkcn/proj/faster-rcnn-mx/it.py", line 95, in <module>
     model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5)    , eval_metric = MyMetric()) 
   File "/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py", line 491, in fit
     next_data_batch = next(data_iter)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input

The reason is that the custom operator will create the GPU context and enter it.
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/operator.py#L790
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/operator.py#L814

At this time (while context.default_ctx is the GPU), ImageDetIter executes the code below:
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/detection.py#L764
https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/image.py#L135

Because context.default_ctx is now the GPU (the custom operator has entered the GPU context and has not exited it yet), the function imdecode uses nd.array(...) to create an NDArray in the default context, namely the GPU context. This causes the error:

   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input

Environment info (Required)

----------Python Info----------
('Version      :', '2.7.12')
('Compiler     :', 'GCC 5.4.0 20160609')
('Build        :', ('default', 'Dec  4 2017 14:50:18'))
('Arch         :', ('64bit', 'ELF'))
------------Pip Info-----------
('Version      :', '8.1.1')
('Directory    :', '/usr/lib/python2.7/dist-packages/pip')
----------MXNet Info-----------
('Version      :', '1.1.0')
('Directory    :', '/usr/local/lib/python2.7/dist-packages/mxnet')
('Commit Hash   :', '07a83a0325a3d782513a04f47d711710972cb144')
----------System Info----------
('Platform     :', 'Linux-4.7.3-coreos-r3-x86_64-with-Ubuntu-16.04-xenial')
('system       :', 'Linux')
('node         :', 'phlrr3110')
('release      :', '4.7.3-coreos-r3')
('version      :', '#1 SMP Thu Feb 23 02:16:16 UTC 2017')

GPU: Tesla M40 x 4

Package used (Python/R/Scala/Julia):
Python

Error Message:

Traceback (most recent call last):
   File "/home/wkcn/proj/faster-rcnn-mx/it.py", line 95, in <module>
     model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5)    , eval_metric = MyMetric()) 
   File "/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py", line 491, in fit
     next_data_batch = next(data_iter)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/detection.py", line 765, in next
     data = self.imdecode(s)
   File "/usr/local/lib/python2.7/dist-packages/mxnet/image/image.py", line 1223, in imdecode
     raise RuntimeError("{}, {}".format(locate(), e))
 RuntimeError: Broken image index: 32, [12:14:03] src/io/image_io.cc:186: Check failed: inputs[0].ctx().dev_mask() == Context::kCPU (2 vs. 1) Only supports cpu input

Minimum reproducible example

import mxnet as mx
from mxnet import gluon
from mxnet import image
from mxnet import nd
import numpy as np
import logging

# Configure the root logger to emit everything from DEBUG level upwards.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Location of the Pikachu detection dataset and the local directory it is
# downloaded into.
root_url = ('https://apache-mxnet.s3-accelerate.amazonaws.com/'
            'gluon/dataset/pikachu/')
data_dir = './data/pikachu/'
# Maps each dataset file name to the value passed as ``sha1_hash`` when
# downloading it.
dataset = {'train.rec': 'e6bcb6ffba1ac04ff8a9b1115e650af56ee969c8',
          'train.idx': 'dcf7318b2602c06428b9988470c731621716c393',
          'val.rec': 'd6c33f799b4d058e82f2cb5bd9a976f69d72d520'}
# Fetch every dataset file into ``data_dir``.
for k, v in dataset.items():
    gluon.utils.download(root_url+k, data_dir+k, sha1_hash=v)

# Scale factor applied to the base image size of 224.
T = 1
# Train on four GPU contexts, gpu(0) .. gpu(3).
devs = [mx.gpu(i) for i in range(4)]
data_shape = 224 * T
# 20 samples per device.
batch_size = 20 * len(devs)
# NOTE(review): rgb_mean is not referenced again in the visible script.
rgb_mean = np.array([1,2,3]) 

class_names = ['pikachu']
num_class = len(class_names)

def get_iterators(data_shape, batch_size):
    """Create the training and validation detection iterators.

    Args:
        data_shape: Edge length used for both spatial entries of the
            ``data_shape=(3, data_shape, data_shape)`` iterator argument.
        batch_size: Batch size handed to both iterators.

    Returns:
        A 4-tuple ``(train_iter, val_iter, class_names, num_class)``; the
        last two entries are the module-level ``class_names`` and
        ``num_class`` values, returned unchanged.
    """
    # Keyword arguments that the two iterators have in common.
    shared_options = dict(
        batch_size=batch_size,
        data_shape=(3, data_shape, data_shape),
        mean=True,
    )

    # The training iterator additionally uses the index file, shuffling and
    # random cropping.
    training_options = dict(
        shared_options,
        path_imgrec=data_dir + 'train.rec',
        path_imgidx=data_dir + 'train.idx',
        shuffle=True,
        rand_crop=1,
        min_object_covered=0.95,
        max_attempts=200,
    )
    training = image.ImageDetIter(**training_options)

    validation = image.ImageDetIter(
        path_imgrec=data_dir + 'val.rec',
        shuffle=False,
        **shared_options)

    return training, validation, class_names, num_class

# Build the iterators once at module level; ``class_names`` and ``num_class``
# are rebound to the same values that get_iterators passes back.
train_data, test_data, class_names, num_class = get_iterators(
    data_shape, batch_size)


class MyCustom(mx.operator.CustomOp):
    """Custom operator whose output and input gradients are all set to 0.

    It performs no real computation; it only writes the constant 0 into
    the provided output/gradient slots via ``self.assign``.
    """

    def forward(self, is_train, req, in_data, out_data, aux):
        """Assign 0 to the first output using the first write request."""
        self.assign(out_data[0], req[0], 0)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """Assign 0 to the gradients of both inputs (index 0, then 1)."""
        for slot in (0, 1):
            self.assign(in_grad[slot], req[slot], 0)
        
@mx.operator.register("MyCustom")
class MyCustomProp(mx.operator.CustomOpProp):
    """Property class registered under the op type ``"MyCustom"``.

    Declares two arguments (``data``, ``label``) and one output (``loss``)
    and creates ``MyCustom`` operator instances.
    """

    def __init__(self):
        super(MyCustomProp, self).__init__(need_top_grad=False)

    def list_arguments(self):
        """Return the argument names, in order."""
        return ["data", "label"]

    def list_outputs(self):
        """Return the output names."""
        return ["loss"]

    def infer_shape(self, in_shape):
        """Pass both input shapes through; the single output has shape (1,).

        Returns:
            ``(input_shapes, output_shapes, aux_shapes)`` with no aux shapes.
        """
        input_shapes = [in_shape[0], in_shape[1]]
        output_shapes = [(1, )]
        return input_shapes, output_shapes, []

    def infer_type(self, in_type):
        """Use the dtype of the first input for both inputs and the output.

        Returns:
            ``(input_types, output_types, aux_types)`` with no aux types.
        """
        first_dtype = in_type[0]
        return [first_dtype] * 2, [first_dtype], []

    def create_operator(self, ctx, shapes, dtypes):
        """Return a new ``MyCustom`` operator; all arguments are unused."""
        return MyCustom()

class MyMetric(mx.metric.EvalMetric):
    """Placeholder metric that ignores all updates and always reports 0."""

    def __init__(self):
        super(MyMetric, self).__init__("MyMetric")
        # Replace the name set by the base-class constructor with a
        # one-element list.
        self.name = ['empty']

    def update(self, labels, preds):
        """Ignore ``labels`` and ``preds``; nothing is accumulated."""

    def get(self):
        """Return ``(self.name, [0])``."""
        reported_values = [0]
        return self.name, reported_values

# Symbol graph: "data" goes through a 100-unit fully connected layer, "label"
# is reshaped, and both feed the registered "MyCustom" operator.
x = mx.sym.Variable("data")
label = mx.sym.Variable("label")
x = mx.sym.FullyConnected(data = x, num_hidden = 100)
# NOTE(review): shape=(0, -1) presumably keeps the first axis and flattens
# the rest -- confirm against the MXNet Reshape documentation.
label = mx.sym.Reshape(data = label, shape = (0, -1))
sym = mx.sym.Custom(data = x, label = label, op_type = "MyCustom")

# Bind the symbol to all devices listed in ``devs``.
model = mx.module.Module(context = devs, symbol = sym, data_names = ('data',), label_names = ('label',))

print ("start")
# Per the traceback in this report, the error is raised inside fit() when it
# fetches the next batch from ``train_data``.
model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 120, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5), eval_metric = MyMetric()) 
# The string literal below is not executed; it holds an alternative snippet
# that iterates ``train_data`` inside a ``with mx.gpu(0)`` block.
'''
with mx.gpu(0):
    while 1:
        e = train_data.next()
        print ("batch")
'''

Steps to reproduce

(Paste the commands you ran that produced the error.)

  1. Run the code

What have you tried to solve it?

  1. Changing the code at https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/image/image.py#L135 by adding the parameter ctx = context.cpu() there solves the problem.

The code changed

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions