Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

CUDA illegal memory access when training SSD #6592

@dtmoodie

Description

@dtmoodie

For bugs or installation issues, please provide the following information.
The more information you provide, the more likely people will be able to help you.

Environment info

Operating System:
Ubuntu 16.04

Compiler:
GCC 5.4

Package used (Python/R/Scala/Julia):
Python

Or if installed from source:
6935195

Python version and distribution:
2.7.12 (default, Nov 19 2016, 06:48:10)
[GCC 5.4.0 20160609]

Error Message:

Please paste the full error message, including stack trace.

[13:13:08] /code/mxnet/dmlc-core/include/dmlc/logging.h:304: [13:13:08] /code/mxnet/mshadow/mshadow/./stream_gpu-inl.h:49: Check failed: e == cudaSuccess CUDA: an illegal memory access was encountered

Stack trace returned 8 entries:
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f35618a4bbc]
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN7mshadow6StreamINS_3gpuEE4WaitEv+0xd8) [0x7f35618bcb08]
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(+0x60e328) [0x7f356192c328]
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet6engine14ThreadedEngine15ExecuteOprBlockENS_10RunContextEPNS0_8OprBlockE+0x87) [0x7f356190e287]
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZNSt17_Function_handlerIFvvEZZN5mxnet6engine23ThreadedEnginePerDevice13PushToExecuteEPNS2_8OprBlockEbENKUlvE1_clEvEUlvE_E9_M_invokeERKSt9_Any_data+0x78) [0x7f3561912a08]
[bt] (5) /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb8c80) [0x7f35dd244c80]
[bt] (6) /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba) [0x7f35e12a46ba]
[bt] (7) /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d) [0x7f35e0fda82d]

Minimum reproducible example

The example below is a near copy-and-paste of https://github.com/zhreshold/mxnet-ssd/blob/master/train/train_net.py, modified to work with the Mapillary dataset.

import mxnet as mx
import logging
import sys
import os
import importlib
import re
sys.path.append('/code/mxnet/example/ssd/')

from dataset.iterator import DetRecordIter
from train.metric import MultiBoxMetric
from evaluate.eval_metric import MApMetric, VOC07MApMetric
from config.config import cfg

# ---------------------------------------------------------------------------
# Data / model configuration (hard-coded stand-ins for command-line options).
# ---------------------------------------------------------------------------

# Input resolution; combined into data_shape = (3, height, width) further down.
data_width = 512
data_height = 512
# One mean value per RGB channel, handed to the data iterators.
mean_pixels = [123, 117, 104]
# RecordIO datasets and their image lists.
train_path = '/data/mxnet/ssd/training_filelist.rec'
val_path = '/data/mxnet/ssd/validation_filelist.rec'
train_list = '/data/mxnet/ssd/training_filelist.lst'
val_list = './2017-06-04/validation_filelist.lst'
# Suffix of the symbol module to import ("symbol_" + net).
net = 'vgg16_ssd_512'
# `date` was used by log_file without ever being assigned, which raises a
# NameError at import time.  The value below is inferred from the dated
# directory used by val_list -- TODO confirm it is the intended run directory.
date = '2017-06-04'
log_file = date + '/log.txt'
# Checkpoint prefix; the input size is appended to it later in the script.
prefix = './model/vgg16_reduced'
batch_size = 128
label_pad_width = -1
# Non-maximum-suppression settings forwarded to the training symbol.
nms_thresh = 0.45
nms_topk = 400
force_suppress = False
# Arguments whose names match this regex are passed as fixed_param_names.
freeze_layer_pattern = "^(conv1_|conv2_)."
# ---------------------------------------------------------------------------
# Training hyper-parameters.
# ---------------------------------------------------------------------------

# Train on GPUs 0..7.
ctx = [mx.gpu(int(i)) for i in range(8)]
# Prefix and epoch of the checkpoint loaded as pretrained weights.
pretrained = 'model/vgg16_reduced'
epoch = 1
# Speedometer logging interval, in batches.
frequent = 20
begin_epoch = 0
end_epoch = 100
momentum = 0.9
learning_rate = 0.004
# Epochs at which the learning rate is multiplied by lr_refactor_ratio.
lr_refactor_step = [20, 40, 60]
lr_refactor_ratio = 0.1
ovp_thresh = 0.5
use_difficult = False
# Number of training examples; used to convert epochs into iterations.
num_example = 20000
weight_decay = 0.0005
# A Monitor is installed when iter_monitor > 0.
iter_monitor = 100
# The pasted script had this literal broken across two lines (an unterminated
# string, i.e. a syntax error).  '.*' matches every name.
monitor_pattern = '.*'
voc07_metric = False
classes = [
    'human--person',
    'human--rider--bicyclist',
    'human--rider--motorcyclist',
    'human--rider--other-rider',
    'object--pothole',
    'object--street-light',
    'object--traffic-light',
    'object--traffic-sign--back',
    'object--traffic-sign--front',
    'object--vehicle--bicycle',
    'object--vehicle--boat',
    'object--vehicle--bus',
    'object--vehicle--car',
    'object--vehicle--caravan',
    'object--vehicle--motorcycle',
    'object--vehicle--on-rails',
    'object--vehicle--other-vehicle',
    'object--vehicle--trailer',
    'object--vehicle--truck',
    'object--vehicle--wheeled-slow',
]
num_classes = len(classes)

def get_lr_scheduler(learning_rate, lr_refactor_step, lr_refactor_ratio,
                     num_example, batch_size, begin_epoch):
    """Return the starting learning rate and a matching step scheduler.

    Parameters
    ----------
    learning_rate : float
        Base learning rate.
    lr_refactor_step : iterable of int
        Epochs at which the rate is multiplied by ``lr_refactor_ratio``.
    lr_refactor_ratio : float
        Multiplicative factor; must be > 0.  A value >= 1 disables scheduling.
    num_example : int
        Number of training examples per epoch.
    batch_size : int
        Batch size; ``num_example // batch_size`` is the epoch length in
        iterations.
    begin_epoch : int
        Epoch training starts (or resumes) from.

    Returns
    -------
    tuple
        ``(lr, lr_scheduler)`` where ``lr`` already includes every refactor
        step at or before ``begin_epoch`` and ``lr_scheduler`` is an
        ``mx.lr_scheduler.MultiFactorScheduler`` for the remaining steps, or
        ``None`` when there is nothing left to schedule.
    """
    assert lr_refactor_ratio > 0
    if lr_refactor_ratio >= 1:
        # A ratio >= 1 would never decay the rate, so no scheduler is built.
        return (learning_rate, None)

    # Materialise once: the steps are walked twice below.
    iter_refactor = list(lr_refactor_step)
    epoch_size = num_example // batch_size

    # Apply the decays that would already have happened before begin_epoch.
    lr = learning_rate
    for s in iter_refactor:
        if begin_epoch >= s:
            lr *= lr_refactor_ratio
    if lr != learning_rate:
        logging.getLogger().info("Adjusted learning rate to {} for epoch {}".format(lr, begin_epoch))

    # Remaining decay points, expressed in iterations relative to begin_epoch.
    steps = [epoch_size * (x - begin_epoch) for x in iter_refactor if x > begin_epoch]
    if not steps:
        # Every refactor step is already behind us.  MultiFactorScheduler is
        # expected to reject an empty step list, so return the decayed rate
        # without a scheduler instead of failing.
        return (lr, None)
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=lr_refactor_ratio)
    return (lr, lr_scheduler)

def convert_pretrained(name, args):
    """Adapt pretrained parameter names to the ones used by the SSD symbol.

    For checkpoints whose ``name`` contains ``'vgg16_reduced'`` the fc6/fc7
    parameters are renamed to conv6/conv7 and the fc8 parameters are dropped.
    Keys that are not present are skipped, so a dict that has already been
    converted passes through unchanged instead of raising ``KeyError``.

    Parameters
    ----------
    name : str
        Checkpoint name or prefix, used to pick the conversion.
    args : dict
        Parameter dict; modified in place.

    Returns
    -------
    dict
        The same ``args`` object.
    """
    if 'vgg16_reduced' not in name:
        return args

    renames = (
        ('fc6_bias', 'conv6_bias'),
        ('fc6_weight', 'conv6_weight'),
        ('fc7_bias', 'conv7_bias'),
        ('fc7_weight', 'conv7_weight'),
    )
    for old_key, new_key in renames:
        if old_key in args:
            args[new_key] = args.pop(old_key)

    for dropped in ('fc8_weight', 'fc8_bias'):
        args.pop(dropped, None)
    return args

# Configure the root logger at INFO level and, when log_file is set, attach a
# file handler writing to it.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
if log_file:
    # FileHandler does not create missing parent directories; make sure the
    # directory exists so a fresh run directory does not abort the script.
    log_dir = os.path.dirname(log_file)
    if log_dir and not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file)
    logger.addHandler(fh)

# Shape handed to the iterators: 3 channels, then height, then width.
data_shape = (3, data_height, data_width)

# Append the input height to the checkpoint prefix.
prefix = prefix + '_' + str(data_shape[1])

# A single scalar mean is expanded to one value per channel.
if isinstance(mean_pixels, (int, float)):
    mean_pixels = [mean_pixels] * 3
assert len(mean_pixels) == 3, "must provide all RGB mean values"

# Training iterator, configured from cfg.train.
train_iter = DetRecordIter(
    train_path, batch_size, data_shape,
    mean_pixels=mean_pixels,
    label_pad_width=label_pad_width,
    path_imglist=train_list,
    **cfg.train)

# Validation iterator is only built when a validation record file is given.
val_iter = None
if val_path:
    val_iter = DetRecordIter(
        val_path, batch_size, data_shape,
        mean_pixels=mean_pixels,
        label_pad_width=label_pad_width,
        path_imglist=val_list,
        **cfg.valid)

# Import "symbol_<net>" from the symbol directory under cfg.ROOT_DIR and build
# the training symbol.  Note that `net` is rebound from the network name to
# the symbol returned by get_symbol_train.
sys.path.append(os.path.join(cfg.ROOT_DIR, 'symbol'))
symbol_module = importlib.import_module("symbol_" + net)
# Pass the configured force_suppress value instead of a hard-coded False so
# changing the setting at the top of the script actually takes effect.
net = symbol_module.get_symbol_train(num_classes, nms_thresh=nms_thresh,
                                     force_suppress=force_suppress,
                                     nms_topk=nms_topk)

# Collect the arguments whose names match freeze_layer_pattern; they are
# passed to the Module as fixed_param_names.  An empty or blank pattern
# disables this.
if freeze_layer_pattern.strip():
    re_prog = re.compile(freeze_layer_pattern)
    fixed_param_names = [name for name in net.list_arguments() if re_prog.match(name)]
else:
    fixed_param_names = None

# Comma-separated list of the training contexts, wrapped in parentheses.
ctx_str = '(' + ','.join(str(device) for device in ctx) + ')'

start_message = "Start training with {} from pretrained model {}"
logger.info(start_message.format(ctx_str, pretrained))

# Load the pretrained checkpoint; the symbol stored with it is discarded.
_, args, auxs = mx.model.load_checkpoint(pretrained, epoch)
args = convert_pretrained(pretrained, args)

# Module over all contexts, with the matched parameters held fixed.
mod = mx.mod.Module(
    net,
    label_names=('label',),
    logger=logger,
    context=ctx,
    fixed_param_names=fixed_param_names)

# Callbacks: Speedometer every `frequent` batches, checkpoint after each epoch.
batch_end_callback = mx.callback.Speedometer(train_iter.batch_size, frequent=frequent)
epoch_end_callback = mx.callback.do_checkpoint(prefix)

# Starting learning rate and scheduler for the remaining epochs.
learning_rate, lr_scheduler = get_lr_scheduler(
    learning_rate, lr_refactor_step, lr_refactor_ratio,
    num_example, batch_size, begin_epoch)

# Keyword arguments for the SGD optimizer.
optimizer_params = dict(
    learning_rate=learning_rate,
    momentum=momentum,
    wd=weight_decay,
    lr_scheduler=lr_scheduler,
    clip_gradient=None,
    rescale_grad=1.0)

# A Monitor is only installed for a positive monitoring interval.
if iter_monitor > 0:
    monitor = mx.mon.Monitor(iter_monitor, pattern=monitor_pattern)
else:
    monitor = None

# Validation metric: the VOC07 variant of mAP or the plain one.
class_names = classes
metric_cls = VOC07MApMetric if voc07_metric else MApMetric
valid_metric = metric_cls(ovp_thresh, use_difficult, class_names, pred_idx=3)

# Run training from begin_epoch up to end_epoch.
mod.fit(
    train_iter,
    val_iter,
    # metrics
    eval_metric=MultiBoxMetric(),
    validation_metric=valid_metric,
    # optimisation
    optimizer='sgd',
    optimizer_params=optimizer_params,
    begin_epoch=begin_epoch,
    num_epoch=end_epoch,
    # parameter initialisation: pretrained weights, Xavier for the rest
    initializer=mx.init.Xavier(),
    arg_params=args,
    aux_params=auxs,
    allow_missing=True,
    # callbacks and monitoring
    batch_end_callback=batch_end_callback,
    epoch_end_callback=epoch_end_callback,
    monitor=monitor,
)

What I've tried:
Reducing the batch size to 16 and to 8; reducing the number of GPUs to 4 and to 1. Running with NaiveEngine gives:

[13:34:04] /code/mxnet/dmlc-core/include/dmlc/logging.h:304: [13:34:04] /code/mxnet/mshadow/mshadow/./stream_gpu-inl.h:49: Check failed: e == cudaSuccess CUDA: an illegal memory access was encountered

Stack trace returned 10 entries:
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7efe81e1ebbc]
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN7mshadow6StreamINS_3gpuEE4WaitEv+0xd8) [0x7efe81e36b08]
[bt] (2) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(+0x60e328) [0x7efe81ea6328]
[bt] (3) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(ZNSt17_Function_handlerIFvN5mxnet10RunContextENS0_6engine18CallbackOnCompleteEEZNS2_11NaiveEngine4PushEPNS2_3OprENS0_7ContextEibEUlS1_S3_E_E9_M_invokeERKSt9_Any_dataOS1_OS3+0x50) [0x7efe81e637b0]
[bt] (4) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet6engine11NaiveEngine9PushAsyncESt8functionIFvNS_10RunContextENS0_18CallbackOnCompleteEEENS_7ContextERKSt6vectorIPNS0_3VarESaISA_EESE_NS_10FnPropertyEiPKc+0x606) [0x7efe81e6b966]
[bt] (5) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet6engine11NaiveEngine4PushEPNS0_3OprENS_7ContextEib+0x8f) [0x7efe81e6c9cf]
[bt] (6) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet4exec13GraphExecutor6RunOpsEbmm+0x21e) [0x7efe81ea748e]
[bt] (7) /usr/local/lib/python2.7/dist-packages/mxnet-0.10.1-py2.7.egg/mxnet/libmxnet.so(MXExecutorForward+0x11) [0x7efe81e23641]
[bt] (8) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7efee3795e40]
[bt] (9) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7efee37958ab]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions