From 02f74be612a4c09ea6aec483e542e05834cf0b90 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Wed, 7 Jul 2021 10:50:11 -0700 Subject: [PATCH 1/2] [FIX] Correct initial data copy of scatter_nd The initial data copy from `data` to `out` in scatter_nd had too large bounds. --- python/tvm/topi/cuda/scatter.py | 10 ++++++---- python/tvm/topi/scatter.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index cee13d7e01a2..2d4710b8b53b 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -788,7 +788,7 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): fused_shape *= i # For now we avoid parallizing over dimensions indexed by `indices` as - # there may be repeated indices and hadling parallel accumulation can + # there may be repeated indices and handling parallel accumulation can # be hard. So we parallelize over X_M .. X_{N-1} instead. This will # work well when these dimensions are large enough to saturate memory # bandwidth, but performance will be bad when these dimensions are @@ -801,12 +801,14 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): bdim = ceil_div(fused_updates_dimension, tdim) ib.scope_attr(bx, "thread_extent", bdim) - with ib.for_range(0, ceil_div(fused_shape, bdim)) as i: - index = i * fused_updates_dimension + bx * tdim + tx + # Copy data into the output. This loop writes to the same portions of + # memory as the following loop, so we do not need a memory sync. + with ib.for_range(0, ceil_div(fused_shape, bdim * tdim), name="i") as i: + index = i * bdim * tdim + bx * tdim + tx with ib.if_scope(index < fused_shape): out[index] = data[index] - with ib.for_range(0, fused_indices_dimension) as i: + with ib.for_range(0, fused_indices_dimension, name="i") as i: j = bx * tdim + tx with ib.if_scope(j < fused_updates_dimension): offset = fused_updates_dimension diff --git a/python/tvm/topi/scatter.py b/python/tvm/topi/scatter.py index 0fe29f315b43..afb0d6633a2b 100644 --- a/python/tvm/topi/scatter.py +++ b/python/tvm/topi/scatter.py @@ -203,7 +203,7 @@ def _verify_scatter_nd_inputs(data, indices, updates): mdim = int(indices.shape[0]) assert mdim <= len(data.shape), ( f"The first dimension of the indices ({mdim}) must be less than or equal to " - f"the length of the shape of the output ({len(shape)})." + f"the length of the shape of the output ({len(data.shape)})." ) for i in range(len(indices.shape) - 1): if isinstance(indices.shape[i + 1], expr.Var) or isinstance(updates.shape[i], expr.Var): From d60c5ffe72901e8c3d2b6663d75f91eb0d7ae8dd Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 8 Jul 2021 10:01:07 -0700 Subject: [PATCH 2/2] actually fix bounds --- python/tvm/topi/cuda/scatter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index 2d4710b8b53b..652a109abe7d 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -803,9 +803,9 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): # Copy data into the output. This loop writes to the same portions of # memory as the following loop, so we do not need a memory sync. - with ib.for_range(0, ceil_div(fused_shape, bdim * tdim), name="i") as i: - index = i * bdim * tdim + bx * tdim + tx - with ib.if_scope(index < fused_shape): + with ib.for_range(0, ceil_div(fused_shape, fused_updates_dimension), name="i") as i: + index = i * fused_updates_dimension + bx * tdim + tx + with ib.if_scope(bx * tdim + tx < fused_updates_dimension): out[index] = data[index] with ib.for_range(0, fused_indices_dimension, name="i") as i: