From 5e3773ec22dbf44f6a3d771afba7e46d360b9c47 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 5 Oct 2021 16:13:44 -0500 Subject: [PATCH 01/20] [RFC][TIR] Separate physical and logical layout of buffers --- rfcs/XXXX-buffer-physical-layout.md | 399 ++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 rfcs/XXXX-buffer-physical-layout.md diff --git a/rfcs/XXXX-buffer-physical-layout.md b/rfcs/XXXX-buffer-physical-layout.md new file mode 100644 index 00000000..4a04084e --- /dev/null +++ b/rfcs/XXXX-buffer-physical-layout.md @@ -0,0 +1,399 @@ +- Feature Name: Buffer Physical Layout +- Start Date: 2021-10-05 +- RFC PR: [apache/tvm-rfcs#0000](https://github.com/apache/tvm-rfcs/pull/0000) +- GitHub Issue: TODO + +# Summary +[summary]: #summary + +This RFC introduces a hard boundary between the “logical layout” of a +mathematical tensor and the “physical layout” of a buffer in memory, +along with a specification for defining the conversion between the +two. + +# Motivation +[motivation]: #motivation + +Currently, TVM assumes that all buffers can be treated as flat memory. +That is, while a tensor may have N dimensions, the underlying buffer +allocated by the low-level codegen has a single value defining the +size, and access into that buffer is done using a single index. This +assumptions holds for most cases, such as a CPU accessing RAM, but +doesn't hold in all cases. For example, texture memory on a GPU +requires two indices to access. In addition, computations that are +semantically identical (e.g. 2-d convolution) require independent +compute definitions and schedules (e.g. `conv2d_nchw` and +`conv2d_hwcn`) based on the format of the data accepted as input. + +This RFC introduces a mechanism to specify and vary the physical +layout of buffers in memory. This will allow for target-specific +handling of non-flat memory, and will allow for code re-use across +compute definitions that differ only in memory layout. + +# Guide-level explanation +[guide-level-explanation]: #guide-level-explanation + +“Logical layout” refers to the layout of a tensor as it exists in the +tensor. All indices refer to the location of elements within an N-d +tensor, which may or may not correspond to the layout as it exists in +either host or device memory. For example, compute defintions for +image processing may be written on tensors in the NHWC format. + +“Physical layout” refers to the layout of memory as it exists within +physical memory, either on the host or on the device. For example, +the physical layout of that same image processing data may a row-major +traversal of a NCHWc layout (e.g. [cudnn's `CUDNN_TENSOR_NCHW_VECT_C` +format](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t)), +where the C axis has been split into vectorizable chunks, and +reordered. + +For logical layouts, any number of dimensions are allowed. However, +for physical layouts, the dimensionality must be supported by the +specific runtime. 1-d physical layouts correspond to flat memory, and +should be supported by all runtimes. N-d physical layouts have +runtime-specific interpretation, and may not be supported by all +runtimes. For example, for OpenCL devices that support texture +memory, a 2-d physical layout may be used to represent access into a +2-d memory space. + +To define the physical layout in a TE schedule, use the +`set_physical_layout` method of a schedule, as shown below. 
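
As a minimal sketch of the proposed API (the full NHWC example follows
below), a 2-d buffer could be stored transposed. Everything here other
than `set_physical_layout` itself is existing TE syntax, and the shapes
are placeholder values chosen for illustration.

```python
from tvm import te

A = te.placeholder((16, 64), name="A")
B = te.compute(A.shape, lambda i, j: A[i, j], name="B")
s = te.create_schedule(B.op)

# Proposed API: store B with its axes swapped, so that the first
# logical axis becomes the fastest-changing dimension in memory.
i, j = B.op.axis
s[B].set_physical_layout(j, i)
```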
The +arguments to `set_physical_layout` are either tuples of +`(logical_axis, factor)`, to indicate that the logical axis should be +split by the factor given, or as `logical_axis` to indicate that the +logical axis should be used with no additional splitting. The order +of arguments defines any reordering that may occur when generating the +physical layout. If `set_physical_layout` isn't called, then no +splits or reorderings are applied. + +For example, below defines the reordering from NHWC logical layout to +NCHWc physical layout. + +```python +# Compute definition, written in terms of NHWC logical axes +B = te.compute(A.shape, lambda n,h,w,c: A[n,h,w,c]) +s = te.create_schedule(B.op) + +# Option 1: Numeric axes +s[B].set_physical_layout(0, 3, 1, 2, (3,4)) + +# Option 2: Equivalent, named axes +n,h,w,c = B.op.axis +s[B].set_physical_layout(n, c, h, w, (c,4)) + +# Compute definition that would produce an equivalent physical layout +B_equivalent = te.compute( + [A.shape[0], A.shape[3]//4, A.shape[1], A.shape[2], 4], + lambda n, c_outer, h, w, c_inner: A[n, h, w, 4*c_outer+c_inner], +) +``` + +By default, after the splits and reorders are applied, all axes are +flattened to a single physical axis by following a row-major +traversal. This produces a 1-d physical layout, which corresponds to +flat memory. To add additional dimensions in the physical layout, +insert `te.PHYSICAL_AXIS_SEPARATOR` into the axis list in +`set_physical_layout`. These define groups of axes, where each group +is combined into a single physical axis. + +```python +B = te.compute(shape=(M,N,P,Q), ...) +m, n, p, q = B.op.axis +s = te.create_schedule(B.op) + +# Default, produces a 1-d allocation with shape (M*N*P*Q,) +s[B].set_physical_layout(m, n, p, q) + +# One separator, produces a 2-d allocation with shape (M*N, P*Q). +s[B].set_physical_layout(m, n, te.PHYSICAL_AXIS_SEPARATOR, p, q) + +# Two separators, produces a 3-d allocation with shape (M, N*P, Q). +s[B].set_physical_layout(m, te.PHYSICAL_AXIS_SEPARATOR, n, p, te.PHYSICAL_AXIS_SEPARATOR, q) + +# Can be used along with reorders and splits. +s[B].set_physical_layout(m, q, n, te.PHYSICAL_AXIS_SEPARATOR, p, (q, 4)) +``` + + +# Reference-level explanation +[reference-level-explanation]: #reference-level-explanation + +A buffer in logical layout may be allocated with `BufferRealizeNode`, +and may be interacted with using `BufferLoadNode` and +`BufferStoreNode`. A buffer in physical layout may be allocated with +`AllocateNode`, and may be interacted with using `LoadNode` and +`StoreNode`. Lowering from logical to physical layout will occur at +the StorageFlatten pass for TE-based schedules, and at the +FlattenBuffer pass for TensorIR-based schedules. + +A graph may only interact with a buffer in a single type of layout. +That is, a buffer that is allocated with BufferRealize (logical +layout) may not be accessed with Load (physical layout), and must +instead be accessed with BufferLoad (logical layout). Logical layout +and physical layout may co-exist within the same graph, so long as +they refer to different buffers. + +## Impacted TIR Nodes + +- BufferNode + - Describes a buffer, in logical layout. + + - Change: Add an `reorder_split` member variable, to describe + reorderings and axis splits that generate the physical layout from + the logical layout. These will default to a physical layout that + assigns the first logical index to the slowest-changing dimension, + and the last logical index to the fastest-changing dimension, with + no reordering. 
This default behavior reproduces the previous + behavior. + + - Change: Define which axes are to be merged by specifying + `axis_separators`. Groups of logical axes, where each group + consists of all logical axes that do not have a separator between + them, are to be merged into a single physical index. This will + default to an empty list, collapsing all logical axes/indices into + a single physical axis/index, reproducing the previous behavior. + + +- BufferRealizeNode + - Realization of a buffer, in logical layout. + - For external buffers, serves as an optional annotation. For + internal buffers, results in allocation of memory. + + +- BufferLoadNode/BufferStoreNode + - Read/write of a buffer, in logical layout. + + +- AllocateNode + - Allocation of a buffer, in physical layout. + + - Gives the N-d shape of the physical buffer + + - Change from previous behavior: Previously, all allocations were + done as a 1-d size of the physical layout, but `Array + AllocateNode::extents` held the shape of the logical layout used + to generate the `AllocateNode`. This is replaced with Replace N-d + “extents” (logical layout) with N-d “shape” (physical layout). + Because these are both of type `Array`, but have + different semantics, this change is made in two steps rather + than a single find/replace. + + - Step 1: Replace N-d `Array extents` with 1-d `PrimExpr + extent`. Any optimization passes that require knowledge of the + logical layout should be moved prior to the + StorageFlatten/FlattenBuffer pass and updated to act on the + logical layout. + + - Step 2: Replace 1-d `PrimExpr extent` N-d `Array + shape`. Any access that assumes flat memory should verify that + `shape.size()==1`. + + +- LoadNode/StoreNode + - Read/write of a buffer, in physical layout. + - Change from previous behavior: Replace 1-d `PrimExpr index` with + N-d `Array index`. + + +## Impacted tir Transformations + +- SplitReorderIndices + - A new pass that takes as input a TIR graph with buffers in logical + layout. The return from SplitReorderIndices + `buffer.reorder_splits.size()==0` for all buffers in the + graph, and represents the same computation as the input. + + - Replace the `Buffer` object in `BufferRealize`, `BufferLoad`, and + `BufferStore` nodes with updated `Buffer` objects whose shape has + all axis splits and reordering applied, and whose `reorder_splits` + is empty. + + - Rewrite `index` in BufferStore/BufferLoad nodes to follow the + updated layout. + +- FlattenBuffer/StorageFlatten + - Existing passes that convert from logical layout to physical + layout for TE schedules (StorageFlatten) or TensorIR schedules + (FlattenBuffer). + + - Use the `N-1` axis separators specified in BufferNode to convert to + an N-d physical layout. The default of having 0 axis separators + will correspond to the previous behavior of flattening to a 1-d + physical layout. + + +## Examples + +The following are intended as pseudo-code, and exclude details not +relevant to this RFC (e.g. dtype). These do not correspond with the +final version of TensorIR that implements this RFC. Numeric values +are shown unsimplified to indicate where they come from. + +This first example shows a 2-d logical buffer, which is lowered to a +1-d physical buffer. `set_physical_layout` has been used to define a +physical layout whose fastest changing dimension corresponds to the +first index in the logical layout. 
+ +```python +# Initial graph, in logical layout +x = Buffer(name="x", shape=[2,3], reorder_splits=[1,0], axis_separators=[]) +with BufferRealize(x): + val = BufferLoad(x, [10, 15]) + BufferStore(x, 7, [20, 23]) + +# After SplitReorderIndices has been applied. +x = Buffer(name="x", shape=[3,2], reorder_splits=[], axis_separators=[]) +with BufferRealize(x): + val = BufferLoad(x, index=[15, 10]) + BufferStore(x, 7, index=[23, 20]) + +# After StorageFlatten/FlattenBuffer has been applied +x = Var(name="x") +with Allocate(x, shape=[3*2]): + val = Load(x, index=[15*2 + 10]) + Store(x, 7, index=[23*2 + 10]) +``` + +The next example shows a remapping from NHWC logical layout to NCHWc +physical layout. The 4 logical axes are expanded to 5 logical axes +during the SplitReorderIndices pass, then flattened into 1 physical +axis during StorageFlatten/FlattenBuffer. + +```python +# Initial graph, in logical layout +x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=[0,3,1,2,(3,4)], axis_separators=[]) +with BufferRealize(x): + val = BufferLoad(x, [11, 37, 23, 101]) + +# After SplitReorderIndices has been applied. +x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], reorder_splits=[], axis_separators=[]) +with BufferRealize(x): + val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) + +# After StorageFlatten/FlattenBuffer has been applied +x = Var(name="x") +with Allocate(x, shape=[16 * (128/4) * 64 * 64 * 4]): + val = Load(x, index=[(128/4)*64*64*4*11 + 64*64*4*floor(101/4) + 64*4*37 + 4*23 + 101%4]) +``` + +Lastly, an example of remapping from `NHWC` logical layout to `NCHWc` +physical layout, packed into a 2-d physical layout with `NCH` in the +first physical axis and `Wc` in the second physical axis. This is the +definition used by the current `"global.texture"` definition used for +texture memory. The change applied during SplitReorderIndices is +identical to the previous example, but StorageFlatten produces a 2-d +physical index. The interpretation of this 2-d index depends on the +target-specific codegen. + +```python +# Initial graph, in logical layout +x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=[0,3,1,2,(3,4)], axis_separators=[3]) +with BufferRealize(x): + val = BufferLoad(x, [11, 37, 23, 101]) + +# After SplitReorderIndices has been applied. +x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], reorder_splits=[], axis_separators=[3]) +with BufferRealize(x): + val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) + +# After StorageFlatten/FlattenBuffer has been applied +x = Var(name="x") +with Allocate(x, shape=[16 * (128/4) * 64, 64 * 4]): + val = Load(x, index=[(128/4)*64*11 + 64*floor(101/4) + 37, 4*23 + 101%4]) +``` + + +# Drawbacks +[drawbacks]: #drawbacks + +This change may make it more difficult to reason about the memory +layout when writing the `te.compute` definition. When the physical +layout differs from the logical layout, it isn't guaranteed that +`A[i]` and `A[i+1]` will be adjacent. For example, a tensor with +`NHWC` logical layout and a `NCHWc` physical layout defined by +`set_physical_layout(n,c,h,w,(c,4))`, logical indices `(0,0,0,3)` and +`(0,0,0,4)` will not be adjacent. + + +# Rationale and alternatives +[rationale-and-alternatives]: #rationale-and-alternatives + +This design applies equally to tensors defined as a result of a +computation and to input tensors. In both cases, the +`set_physical_layout` causes all reads/writes to that buffer to obey +the specified layout. 
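
For example, a sketch (again using the proposed API, and assuming that a
placeholder's stage accepts the same call) that declares an NHWC input
parameter to be passed in NCHWc format:

```python
from tvm import te

A = te.placeholder((16, 64, 64, 128), name="A")
B = te.compute(A.shape, lambda n, h, w, c: A[n, h, w, c], name="B")
s = te.create_schedule(B.op)

# Numeric-axis form ("Option 1" above); the tuple (3, 4) splits the C
# axis by a factor of 4, giving an NCHWc layout for the input buffer.
s[A].set_physical_layout(0, 3, 1, 2, (3, 4))
```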
In the case of input tensors, it states that +the tensors passed in will be in the specified format. + +The `te.compute` function can be used to define an updated layout. +However, this introduces a new tensor that must be inlined to avoid +additional memory allocation, and cannot be used for input tensors. + + +# Prior art +[prior-art]: #prior-art + +- CuDNN has an [explicit enumeration of allowed input + formats](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t), + which are specific to image formatting. + +- The reorder/split/flatten sequences is equivalent in numpy to using + `np.reshape` to split the logical axes, then `np.transpose` to + reorder them, then `np.reshape` to merge multiple axes into the N-d + physical axes. + +# Unresolved questions +[unresolved-questions]: #unresolved-questions + +- What is appropriate terminology for size/shape/extent of physical + and logical buffers? + - I am partial to using "shape" both for the N-d logical layout + and the N-d physical layout, and have attempted to use it + consistently through this RFC. + - "size" implies a 1-d buffer, which wouldn't be appropriate for + an N-d parameter. + - "extent" would be a reasonable name, but is currently used by + `tvm::RangeNode` to indicate a range of values that may start at + a non-zero value. Since the indices for logical and physical + buffers both start at zero, using "extents" for the maximum + index would imply some offset. + +- How should loops over an array be handled when re-writing the shape? + + To avoid memory latency issues, loops should iterate over an array + sequentially sequentially when possible. Iteration that is defined + in terms of the logical layout may be inappropriate for the physical + layout. + + - Option: Do nothing, and always keep the same iteration order, as + defined in terms of the logical axes. + + This would produce valid code, but not necessarily performant + code. This can be a default behavior during development, to be + improved upon. + + - Option: Automatically detect loops that are over the full extent + of an array in sequential order of the logical layout, and rewrite + to be in sequential order of the physical layout. + + This would reduce the memory latency issues, but raises some + implementation questions. + + - If a loop body references multiple tensors with different + physical layouts, which should define the loop iteration order? + + - If a series of nested loops contains a `cache_read` or + `cache_write` stage, can these be recognized and reordered? + + - Option: Expose the `reorder_split` definition to be used as part + of a schedule definition. + + This would allow the greatest flexibility, but would make the + schedule dependent on the physical layout. + +# Future possibilities +[future-possibilities]: #future-possibilities + +- Could be used to simplify many of the `topi` schedules for image + processing. +- Could introduce variation of physical layout during `cache_read` and + `cache_write` steps, as a potential source of optimization. 
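
To make the NumPy equivalence noted under Prior Art concrete, the
following sketch (plain NumPy, no TVM APIs) reproduces the NHWC to
NCHWc worked example above and checks the flattened offset it derives:

```python
import numpy as np

N, H, W, C, c = 16, 64, 64, 128, 4
logical = np.arange(N * H * W * C).reshape(N, H, W, C)

# reshape to split the C axis, transpose to reorder into NCHWc, then
# reshape again to flatten row-major into the 1-d physical layout.
flat = logical.reshape(N, H, W, C // c, c).transpose(0, 3, 1, 2, 4).reshape(-1)

# Offset formula from the worked example, for logical index [11, 37, 23, 101].
offset = (C // c) * H * W * c * 11 + H * W * c * (101 // c) + W * c * 37 + c * 23 + 101 % c
assert flat[offset] == logical[11, 37, 23, 101]
```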
From cba84068bfb62f9b1cb2de4b07fe6a88b53c2a1a Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 5 Oct 2021 16:16:14 -0500 Subject: [PATCH 02/20] Updated with link to the PR --- ...buffer-physical-layout.md => 0039-buffer-physical-layout.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename rfcs/{XXXX-buffer-physical-layout.md => 0039-buffer-physical-layout.md} (99%) diff --git a/rfcs/XXXX-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md similarity index 99% rename from rfcs/XXXX-buffer-physical-layout.md rename to rfcs/0039-buffer-physical-layout.md index 4a04084e..5da4e007 100644 --- a/rfcs/XXXX-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -1,6 +1,6 @@ - Feature Name: Buffer Physical Layout - Start Date: 2021-10-05 -- RFC PR: [apache/tvm-rfcs#0000](https://github.com/apache/tvm-rfcs/pull/0000) +- RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) - GitHub Issue: TODO # Summary From 31194a19a233f67a667c46e0ad479aaa27a7a4f7 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 5 Oct 2021 16:18:17 -0500 Subject: [PATCH 03/20] Fixed typo in example --- rfcs/0039-buffer-physical-layout.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 5da4e007..09723470 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -251,7 +251,7 @@ with BufferRealize(x): x = Var(name="x") with Allocate(x, shape=[3*2]): val = Load(x, index=[15*2 + 10]) - Store(x, 7, index=[23*2 + 10]) + Store(x, 7, index=[23*2 + 20]) ``` The next example shows a remapping from NHWC logical layout to NCHWc From 7a3de369869443cb74010b3b77c434368727cb3b Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 6 Oct 2021 09:10:10 -0500 Subject: [PATCH 04/20] Added link to RFC#0040. --- rfcs/0039-buffer-physical-layout.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 09723470..1ca5074e 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -1,7 +1,7 @@ - Feature Name: Buffer Physical Layout - Start Date: 2021-10-05 - RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) -- GitHub Issue: TODO +- GitHub Issue: Not Yet Written # Summary [summary]: #summary @@ -183,6 +183,9 @@ they refer to different buffers. logical layout should be moved prior to the StorageFlatten/FlattenBuffer pass and updated to act on the logical layout. + + This step is described in the more detail in the related + [RFC#0040](https://github.com/apache/tvm-rfcs/pull/0040). - Step 2: Replace 1-d `PrimExpr extent` N-d `Array shape`. Any access that assumes flat memory should verify that From bf75cb5b95ef216d9e82e51472f8456d42db68b2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 6 Oct 2021 11:31:29 -0500 Subject: [PATCH 05/20] Updated version with a function to define the physical layout. 
--- rfcs/0039-buffer-physical-layout.md | 92 +++++++++++++++++++---------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 1ca5074e..0ab473a9 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -58,13 +58,11 @@ memory, a 2-d physical layout may be used to represent access into a To define the physical layout in a TE schedule, use the `set_physical_layout` method of a schedule, as shown below. The -arguments to `set_physical_layout` are either tuples of -`(logical_axis, factor)`, to indicate that the logical axis should be -split by the factor given, or as `logical_axis` to indicate that the -logical axis should be used with no additional splitting. The order -of arguments defines any reordering that may occur when generating the -physical layout. If `set_physical_layout` isn't called, then no -splits or reorderings are applied. +arguments to `set_physical_layout` is a function that accepts a list +of `tvm.tir.Var` representing a logical index, and outputs a list of +`tvm.tir.PrimExpr` giving a corresponding physical index. If +`set_physical_layout` isn't called, then no splits or reorderings are +applied. For example, below defines the reordering from NHWC logical layout to NCHWc physical layout. @@ -74,12 +72,11 @@ NCHWc physical layout. B = te.compute(A.shape, lambda n,h,w,c: A[n,h,w,c]) s = te.create_schedule(B.op) -# Option 1: Numeric axes -s[B].set_physical_layout(0, 3, 1, 2, (3,4)) +def nhwc_to_nchwc(logical_axes): + n,h,w,c = logical_axes + return [n, c//4, h, w, c%4] -# Option 2: Equivalent, named axes -n,h,w,c = B.op.axis -s[B].set_physical_layout(n, c, h, w, (c,4)) +s[B].set_physical_layout(nhwc_to_nchwc) # Compute definition that would produce an equivalent physical layout B_equivalent = te.compute( @@ -92,9 +89,9 @@ By default, after the splits and reorders are applied, all axes are flattened to a single physical axis by following a row-major traversal. This produces a 1-d physical layout, which corresponds to flat memory. To add additional dimensions in the physical layout, -insert `te.PHYSICAL_AXIS_SEPARATOR` into the axis list in -`set_physical_layout`. These define groups of axes, where each group -is combined into a single physical axis. +insert `te.PHYSICAL_AXIS_SEPARATOR` into the axis list return by the +physical layout function. These define groups of axes, where each +group is combined into a single physical axis. ```python B = te.compute(shape=(M,N,P,Q), ...) @@ -102,16 +99,16 @@ m, n, p, q = B.op.axis s = te.create_schedule(B.op) # Default, produces a 1-d allocation with shape (M*N*P*Q,) -s[B].set_physical_layout(m, n, p, q) +s[B].set_physical_layout(lambda i: i) # One separator, produces a 2-d allocation with shape (M*N, P*Q). -s[B].set_physical_layout(m, n, te.PHYSICAL_AXIS_SEPARATOR, p, q) +s[B].set_physical_layout(lambda i: [i[0], i[1], te.PHYSICAL_AXIS_SEPARATOR, i[2], i[3]]) # Two separators, produces a 3-d allocation with shape (M, N*P, Q). -s[B].set_physical_layout(m, te.PHYSICAL_AXIS_SEPARATOR, n, p, te.PHYSICAL_AXIS_SEPARATOR, q) +s[B].set_physical_layout(lambda i: [i[0], te.PHYSICAL_AXIS_SEPARATOR, i[1], i[2], te.PHYSICAL_AXIS_SEPARATOR, i[3]]) # Can be used along with reorders and splits. -s[B].set_physical_layout(m, q, n, te.PHYSICAL_AXIS_SEPARATOR, p, (q, 4)) +s[B].set_physical_layout(lambda i: [i[0], i[3]//4, i[1], te.PHYSICAL_AXIS_SEPARATOR, i[2], i[3]%4]) ``` @@ -140,11 +137,11 @@ they refer to different buffers. 
- Change: Add an `reorder_split` member variable, to describe reorderings and axis splits that generate the physical layout from - the logical layout. These will default to a physical layout that - assigns the first logical index to the slowest-changing dimension, - and the last logical index to the fastest-changing dimension, with - no reordering. This default behavior reproduces the previous - behavior. + the logical layout. This defaults to the identity function, + generating a physical layout that assigns the first logical index + to the slowest-changing dimension, and the last logical index to + the fastest-changing dimension, with no reordering. This default + behavior reproduces the previous behavior. - Change: Define which axes are to be merged by specifying `axis_separators`. Groups of logical axes, where each group @@ -239,7 +236,7 @@ first index in the logical layout. ```python # Initial graph, in logical layout -x = Buffer(name="x", shape=[2,3], reorder_splits=[1,0], axis_separators=[]) +x = Buffer(name="x", shape=[2,3], reorder_splits=(lambda i,j: j,i), axis_separators=[]) with BufferRealize(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) @@ -263,8 +260,16 @@ during the SplitReorderIndices pass, then flattened into 1 physical axis during StorageFlatten/FlattenBuffer. ```python +nhwc_to_nchwc = [ + lambda i: i[0], + lambda i: i[3]//4, + lambda i: i[1], + lambda i: i[2], + lambda i: i[3]%4, +] + # Initial graph, in logical layout -x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=[0,3,1,2,(3,4)], axis_separators=[]) +x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[]) with BufferRealize(x): val = BufferLoad(x, [11, 37, 23, 101]) @@ -289,8 +294,16 @@ physical index. The interpretation of this 2-d index depends on the target-specific codegen. ```python +nhwc_to_nchwc = [ + lambda i: i[0], + lambda i: i[3]//4, + lambda i: i[1], + lambda i: i[2], + lambda i: i[3]%4, +] + # Initial graph, in logical layout -x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=[0,3,1,2,(3,4)], axis_separators=[3]) +x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[3]) with BufferRealize(x): val = BufferLoad(x, [11, 37, 23, 101]) @@ -313,9 +326,9 @@ This change may make it more difficult to reason about the memory layout when writing the `te.compute` definition. When the physical layout differs from the logical layout, it isn't guaranteed that `A[i]` and `A[i+1]` will be adjacent. For example, a tensor with -`NHWC` logical layout and a `NCHWc` physical layout defined by -`set_physical_layout(n,c,h,w,(c,4))`, logical indices `(0,0,0,3)` and -`(0,0,0,4)` will not be adjacent. +`NHWC` logical layout and a `NCHWc` physical layout defined by `[n, +c//4, h, w, c%4]`, logical indices `(0,0,0,3)` and `(0,0,0,4)` will +not be adjacent. # Rationale and alternatives @@ -347,6 +360,25 @@ additional memory allocation, and cannot be used for input tensors. # Unresolved questions [unresolved-questions]: #unresolved-questions +- Representation of `reorder_split` + + - Option: A string, used to look up a function during lowering. + + This would work, but would make the TIR graph less + self-contained, making debugging more difficult. + + - Option: `Array dummy_index` and `Array physical_layout` + + The `physical_layout` expressions would be written in terms of + the variables in `dummy_index`. The physical index for a + load/store would be generated by subsituting the logical indices + for the `dummy_index`. 
+ + - Option: Store as `Array`. + + Each `PrimFunc` takes arguments of the logical index, and + produces a reorder/split index. + - What is appropriate terminology for size/shape/extent of physical and logical buffers? - I am partial to using "shape" both for the N-d logical layout From f64ecd98ad81955618b3b79030205a08e3de0170 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 6 Oct 2021 11:33:01 -0500 Subject: [PATCH 06/20] Updated with reference to iter_affine_map.h --- rfcs/0039-buffer-physical-layout.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 0ab473a9..abe341e4 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -123,6 +123,9 @@ and may be interacted with using `BufferLoadNode` and the StorageFlatten pass for TE-based schedules, and at the FlattenBuffer pass for TensorIR-based schedules. +Many of the utilities needed for this transformation already exist in +`iter_affine_map.h`, and are used in this implementation. + A graph may only interact with a buffer in a single type of layout. That is, a buffer that is allocated with BufferRealize (logical layout) may not be accessed with Load (physical layout), and must From 27db7a160d52b791fc4212b1ae9a5908bd6566fe Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 28 Oct 2021 15:28:53 -0500 Subject: [PATCH 07/20] Updated following TQ's suggestions. --- rfcs/0039-buffer-physical-layout.md | 509 ++++++++++++++++------------ 1 file changed, 297 insertions(+), 212 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index abe341e4..66362831 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -2,14 +2,19 @@ - Start Date: 2021-10-05 - RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) - GitHub Issue: Not Yet Written +- Related RFCs: [RFC#XXXX](TODO: Link to BufferPointer RFC) # Summary [summary]: #summary -This RFC introduces a hard boundary between the “logical layout” of a -mathematical tensor and the “physical layout” of a buffer in memory, -along with a specification for defining the conversion between the -two. +This RFC introduces layout transformations that can be applies to a +buffer during the lowering process. These transformations will be +part of the schedule, allowing the same compute definition to be used +across multiple different layouts. + +[RFC#XXXX](TODO: Link to BufferPointer RFC) is intended to make these +buffer transformations easier to write, though it isn't strictly +necessary for this change. # Motivation [motivation]: #motivation @@ -20,52 +25,52 @@ allocated by the low-level codegen has a single value defining the size, and access into that buffer is done using a single index. This assumptions holds for most cases, such as a CPU accessing RAM, but doesn't hold in all cases. For example, texture memory on a GPU -requires two indices to access. In addition, computations that are -semantically identical (e.g. 2-d convolution) require independent -compute definitions and schedules (e.g. `conv2d_nchw` and -`conv2d_hwcn`) based on the format of the data accepted as input. +requires two indices to access. + +In addition, computations that are semantically identical (e.g. 2-d +convolution) require independent compute definitions and schedules +(e.g. `conv2d_nchw` and `conv2d_hwcn`) based on the format of the data +accepted as input. 
-This RFC introduces a mechanism to specify and vary the physical -layout of buffers in memory. This will allow for target-specific -handling of non-flat memory, and will allow for code re-use across -compute definitions that differ only in memory layout. +This RFC introduces a mechanism to specify transformations to be +applied to the layout of buffers in memory, including the option to +present multi-dimensional indices to the low-level code generators. +This will allow for target-specific handling of non-flat memory, and +will allow for code re-use across compute definitions that differ only +in memory layout. # Guide-level explanation [guide-level-explanation]: #guide-level-explanation -“Logical layout” refers to the layout of a tensor as it exists in the -tensor. All indices refer to the location of elements within an N-d -tensor, which may or may not correspond to the layout as it exists in -either host or device memory. For example, compute defintions for -image processing may be written on tensors in the NHWC format. - -“Physical layout” refers to the layout of memory as it exists within -physical memory, either on the host or on the device. For example, -the physical layout of that same image processing data may a row-major -traversal of a NCHWc layout (e.g. [cudnn's `CUDNN_TENSOR_NCHW_VECT_C` -format](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t)), -where the C axis has been split into vectorizable chunks, and -reordered. - -For logical layouts, any number of dimensions are allowed. However, -for physical layouts, the dimensionality must be supported by the -specific runtime. 1-d physical layouts correspond to flat memory, and -should be supported by all runtimes. N-d physical layouts have -runtime-specific interpretation, and may not be supported by all -runtimes. For example, for OpenCL devices that support texture -memory, a 2-d physical layout may be used to represent access into a -2-d memory space. - -To define the physical layout in a TE schedule, use the -`set_physical_layout` method of a schedule, as shown below. The -arguments to `set_physical_layout` is a function that accepts a list -of `tvm.tir.Var` representing a logical index, and outputs a list of +A buffer is represented by a `tvm::tir::Buffer` object, and has some +shape associated with it. This shape is initially defined from the +buffer's shape in the compute definition. Buffers can either be +allocated within a `tvm::tir::PrimFunc` using a `tvm::tir::Allocate` +node, or can be passed in as parameters to a `PrimFunc`. Buffer +access is done using `tvm::tir::BufferLoad` and +`tvm::tir::BufferStore` for reads and writes, respectively. + +When a TIR graph is passed into the low-level code generator +`tvm::codegen::Build`, the dimensionality of each buffer must be +supported by the target code generator. Typically, this will mean +generating a 1-dimensional index representing access into flat memory. +Some code generators may attach alternative semantics for +multi-dimensional buffers (e.g. 2-d buffers to represent texture +memory on OpenCL). A low-level code generator should check the +dimensionality of the buffers it is acting on, and give a diagnostic +error for unsupported dimensionality. + +To define the layout transformation in a TE schedule, use the +`transform_layout` method of a schedule, as shown below. 
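
As a minimal sketch of the proposed method (the fuller NHWC example
follows below), a 2-d buffer could be stored transposed; only
`transform_layout` itself is new API, and the shapes are placeholder
values.

```python
from tvm import te

A = te.placeholder((128, 256), name="A")
B = te.compute(A.shape, lambda i, j: A[i, j], name="B")
s = te.create_schedule(B.op)

# Proposed API: like cache_read/cache_write, this introduces a new
# stage and returns a tensor whose layout is the transposed one.
B_transposed = s[B].transform_layout(lambda i, j: [j, i])
```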
The +arguments to `transform_layout` is a function that accepts a list of +`tvm.tir.Var` representing a logical index, and outputs a list of `tvm.tir.PrimExpr` giving a corresponding physical index. If -`set_physical_layout` isn't called, then no splits or reorderings are -applied. +`transform_layout` isn't called, then no additional layout +transformations are applied. For example, below defines the reordering from NHWC logical layout to -NCHWc physical layout. +NCHWc physical layout. Similar to `cache_read` and `cache_write`, the +`transform_layout` method introduces a new stage in the schedule. ```python # Compute definition, written in terms of NHWC logical axes @@ -76,7 +81,7 @@ def nhwc_to_nchwc(logical_axes): n,h,w,c = logical_axes return [n, c//4, h, w, c%4] -s[B].set_physical_layout(nhwc_to_nchwc) +B_nchwc = s[B].transform_layout(nhwc_to_nchwc) # Compute definition that would produce an equivalent physical layout B_equivalent = te.compute( @@ -85,13 +90,13 @@ B_equivalent = te.compute( ) ``` -By default, after the splits and reorders are applied, all axes are -flattened to a single physical axis by following a row-major -traversal. This produces a 1-d physical layout, which corresponds to -flat memory. To add additional dimensions in the physical layout, -insert `te.PHYSICAL_AXIS_SEPARATOR` into the axis list return by the -physical layout function. These define groups of axes, where each -group is combined into a single physical axis. +By default, after all explicitly specified layout transformations are +applied, all axes are flattened to a single physical axis by following +a row-major traversal. This produces a 1-d physical layout, which +corresponds to flat memory. To add additional dimensions in the +physical layout, insert `te.AXIS_SEPARATOR` into the axis +list return by the physical layout function. These define groups of +axes, where each group is combined into a single physical axis. ```python B = te.compute(shape=(M,N,P,Q), ...) @@ -99,60 +104,39 @@ m, n, p, q = B.op.axis s = te.create_schedule(B.op) # Default, produces a 1-d allocation with shape (M*N*P*Q,) -s[B].set_physical_layout(lambda i: i) +s[B].transform_layout(lambda i: i) # One separator, produces a 2-d allocation with shape (M*N, P*Q). -s[B].set_physical_layout(lambda i: [i[0], i[1], te.PHYSICAL_AXIS_SEPARATOR, i[2], i[3]]) +s[B].transform_layout(lambda i: [i[0], i[1], te.AXIS_SEPARATOR, i[2], i[3]]) # Two separators, produces a 3-d allocation with shape (M, N*P, Q). -s[B].set_physical_layout(lambda i: [i[0], te.PHYSICAL_AXIS_SEPARATOR, i[1], i[2], te.PHYSICAL_AXIS_SEPARATOR, i[3]]) +s[B].transform_layout(lambda i: [i[0], te.AXIS_SEPARATOR, i[1], i[2], te.AXIS_SEPARATOR, i[3]]) # Can be used along with reorders and splits. -s[B].set_physical_layout(lambda i: [i[0], i[3]//4, i[1], te.PHYSICAL_AXIS_SEPARATOR, i[2], i[3]%4]) +s[B].transform_layout(lambda i: [i[0], i[3]//4, i[1], te.AXIS_SEPARATOR, i[2], i[3]%4]) ``` + # Reference-level explanation [reference-level-explanation]: #reference-level-explanation -A buffer in logical layout may be allocated with `BufferRealizeNode`, -and may be interacted with using `BufferLoadNode` and -`BufferStoreNode`. A buffer in physical layout may be allocated with -`AllocateNode`, and may be interacted with using `LoadNode` and -`StoreNode`. Lowering from logical to physical layout will occur at -the StorageFlatten pass for TE-based schedules, and at the -FlattenBuffer pass for TensorIR-based schedules. +Transformation of a buffer is represented by a `BufferTransformNode`. 
+It specifies a buffer to be reshaped, and the transformation to be +applied to it. Many of the utilities needed for this transformation +already exist in `iter_affine_map.h`, and are used in the +implementation. -Many of the utilities needed for this transformation already exist in -`iter_affine_map.h`, and are used in this implementation. - -A graph may only interact with a buffer in a single type of layout. -That is, a buffer that is allocated with BufferRealize (logical -layout) may not be accessed with Load (physical layout), and must -instead be accessed with BufferLoad (logical layout). Logical layout -and physical layout may co-exist within the same graph, so long as -they refer to different buffers. +A buffer may be allocated with `AllocateNode`, and may be interacted +with using `BufferLoadNode` and `BufferStoreNode`. +`BufferRealizeNode` should only appear in TE-based schedules, and +should be converted to `AllocateNode`. `LoadNode` and `StoreNode` +are deprecated. ## Impacted TIR Nodes - BufferNode - - Describes a buffer, in logical layout. - - - Change: Add an `reorder_split` member variable, to describe - reorderings and axis splits that generate the physical layout from - the logical layout. This defaults to the identity function, - generating a physical layout that assigns the first logical index - to the slowest-changing dimension, and the last logical index to - the fastest-changing dimension, with no reordering. This default - behavior reproduces the previous behavior. - - - Change: Define which axes are to be merged by specifying - `axis_separators`. Groups of logical axes, where each group - consists of all logical axes that do not have a separator between - them, are to be merged into a single physical index. This will - default to an empty list, collapsing all logical axes/indices into - a single physical axis/index, reproducing the previous behavior. - + - Describes a N-d buffer. The layout of the buffer may be - BufferRealizeNode - Realization of a buffer, in logical layout. @@ -161,64 +145,53 @@ they refer to different buffers. - BufferLoadNode/BufferStoreNode - - Read/write of a buffer, in logical layout. + - Read/write of a buffer. + + - Change from previous behavior: Will exist throughout the lowering + process, and will be passed to the low-level code generators. + Transformations that previously created `Load` and `Store` nodes + will instead create `BufferLoad` and `BufferStore` nodes with 1-d + indices. - AllocateNode - Allocation of a buffer, in physical layout. - - - Gives the N-d shape of the physical buffer - - - Change from previous behavior: Previously, all allocations were - done as a 1-d size of the physical layout, but `Array - AllocateNode::extents` held the shape of the logical layout used - to generate the `AllocateNode`. This is replaced with Replace N-d - “extents” (logical layout) with N-d “shape” (physical layout). - Because these are both of type `Array`, but have - different semantics, this change is made in two steps rather - than a single find/replace. - - - Step 1: Replace N-d `Array extents` with 1-d `PrimExpr - extent`. Any optimization passes that require knowledge of the - logical layout should be moved prior to the - StorageFlatten/FlattenBuffer pass and updated to act on the - logical layout. - - This step is described in the more detail in the related - [RFC#0040](https://github.com/apache/tvm-rfcs/pull/0040). - - - Step 2: Replace 1-d `PrimExpr extent` N-d `Array - shape`. 
Any access that assumes flat memory should verify that - `shape.size()==1`. + + - Declares an allocation of a buffer. + + - Change from previous behavior: Previously, `AllocateNode` held the + `buffer_var`, datatype, and buffer extents directly. After + implementation of this RFC, `AllocateNode` will instead hold the + `Buffer` that is to be allocated. - LoadNode/StoreNode - - Read/write of a buffer, in physical layout. - - Change from previous behavior: Replace 1-d `PrimExpr index` with - N-d `Array index`. + - Read/write of a 1-d buffer, given a `Var` pointer to the start of + the buffer and a single index. + + - Deprecated, should instead use `BufferLoad` and `BufferStore` with + a 1-d index. ## Impacted tir Transformations -- SplitReorderIndices - - A new pass that takes as input a TIR graph with buffers in logical - layout. The return from SplitReorderIndices - `buffer.reorder_splits.size()==0` for all buffers in the - graph, and represents the same computation as the input. - - - Replace the `Buffer` object in `BufferRealize`, `BufferLoad`, and - `BufferStore` nodes with updated `Buffer` objects whose shape has - all axis splits and reordering applied, and whose `reorder_splits` - is empty. - - - Rewrite `index` in BufferStore/BufferLoad nodes to follow the - updated layout. - +- `ApplyBufferTransform` + - A new pass that takes as input a TIR graph that may have + `BufferTransform` nodes present. The return from + `ApplyBufferTransform` has all `BufferTransform` nodes removed, + with the buffers marked in them reordered as specified. + + - Rewrite `indices` in BufferStore/BufferLoad nodes based on the + specified transformation. + - FlattenBuffer/StorageFlatten + - Can be implemented as the addition of a `BufferTransform`, which + is later lowered by `ApplyBufferTransform`. + - Existing passes that convert from logical layout to physical layout for TE schedules (StorageFlatten) or TensorIR schedules (FlattenBuffer). - + - Use the `N-1` axis separators specified in BufferNode to convert to an N-d physical layout. The default of having 0 axis separators will correspond to the previous behavior of flattening to a 1-d @@ -232,25 +205,53 @@ relevant to this RFC (e.g. dtype). These do not correspond with the final version of TensorIR that implements this RFC. Numeric values are shown unsimplified to indicate where they come from. -This first example shows a 2-d logical buffer, which is lowered to a -1-d physical buffer. `set_physical_layout` has been used to define a +The first example shows a 2-d buffer with no layout transformations +explicitly specified. The generated TIR includes a `BufferTransform` +annotation to apply a row-major traversal and generate a flat 1-d +buffer. + +```python +# In TE schedule, no call to transform_layout. + +# Initial TIR graph +x = Buffer(name="x", shape=[2,3]) +BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) +with Allocate(x): + val = BufferLoad(x, [10, 15]) + BufferStore(x, 7, [20, 23]) + +# After applying the implicit flattening to 1-d +x = Var(name="x") +with Allocate(x, shape=[2*3]): + val = Load(x, index=[10*3 + 15]) + Store(x, 7, index=[20*3 + 23]) +``` + +This next example shows a 2-d logical buffer, which is lowered to a +1-d physical buffer. `transform_layout` has been used to define a physical layout whose fastest changing dimension corresponds to the first index in the logical layout. 
```python -# Initial graph, in logical layout -x = Buffer(name="x", shape=[2,3], reorder_splits=(lambda i,j: j,i), axis_separators=[]) -with BufferRealize(x): +# In TE schedule +# s[x].transform_layout(lambda i,j: [j,i]) + +# Initial TIR graph +x = Buffer(name="x", shape=[2,3]) +BufferTransform(x, lambda i,j: [j,i]) +BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) +with Allocate(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) -# After SplitReorderIndices has been applied. -x = Buffer(name="x", shape=[3,2], reorder_splits=[], axis_separators=[]) -with BufferRealize(x): +# After applying the explicit reordering +x = Buffer(name="x", shape=[3,2]) +BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) +with Allocate(x): val = BufferLoad(x, index=[15, 10]) BufferStore(x, 7, index=[23, 20]) -# After StorageFlatten/FlattenBuffer has been applied +# After applying the implicit flattening to 1-d x = Var(name="x") with Allocate(x, shape=[3*2]): val = Load(x, index=[15*2 + 10]) @@ -263,25 +264,23 @@ during the SplitReorderIndices pass, then flattened into 1 physical axis during StorageFlatten/FlattenBuffer. ```python -nhwc_to_nchwc = [ - lambda i: i[0], - lambda i: i[3]//4, - lambda i: i[1], - lambda i: i[2], - lambda i: i[3]%4, -] - -# Initial graph, in logical layout +# In TE schedule +# s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, w, c%4]) + +# Initial TIR graph x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[]) -with BufferRealize(x): +BufferTransform(x, lambda n,h,w,c: [n, c//4, h, w, c%4]) +BufferTransform(x, lambda n,C_outer,h,w,c_inner: [x.shape[4]*(x.shape[3]*(x.shape[2]*(x.shape[1]*n + C_outer) + h) + w) + c_inner] +with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) -# After SplitReorderIndices has been applied. +# After applying the explicit reordering x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], reorder_splits=[], axis_separators=[]) -with BufferRealize(x): +BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[4]*(x.shape[3]*(x.shape[2]*(x.shape[1]*n + c_outer) + h) + w) + c_inner] +with Allocate(x): val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) -# After StorageFlatten/FlattenBuffer has been applied +# After applying the implicit flattening to 1-d x = Var(name="x") with Allocate(x, shape=[16 * (128/4) * 64 * 64 * 4]): val = Load(x, index=[(128/4)*64*64*4*11 + 64*64*4*floor(101/4) + 64*4*37 + 4*23 + 101%4]) @@ -297,25 +296,27 @@ physical index. The interpretation of this 2-d index depends on the target-specific codegen. ```python -nhwc_to_nchwc = [ - lambda i: i[0], - lambda i: i[3]//4, - lambda i: i[1], - lambda i: i[2], - lambda i: i[3]%4, -] - -# Initial graph, in logical layout -x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[3]) -with BufferRealize(x): +# In TE schedule +# s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, te.AXIS_SEPARATOR, w, c%4]) + +# Initial TIR graph +x = Buffer(name="x", shape=[16,64,64,128]) + +BufferTransform(x, lambda n,h,w,c: [n, c//4, h, w, c%4]) +BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[1]*(x.shape[2]*c_outer + n) + h, + x.shape[4]*w + c_inner]) +with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) -# After SplitReorderIndices has been applied. -x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], reorder_splits=[], axis_separators=[3]) -with BufferRealize(x): +# After applying the explicit reordering. 
+x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4]) +BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[1]*(x.shape[2]*c_outer + n) + h, + x.shape[4]*w + c_inner]) +with Allocate(x): val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) -# After StorageFlatten/FlattenBuffer has been applied +# After applying the implicit flattening. The final result is 2-d, +# due to the te.AXIS_SEPARATOR used in the `.transform_layout`. x = Var(name="x") with Allocate(x, shape=[16 * (128/4) * 64, 64 * 4]): val = Load(x, index=[(128/4)*64*11 + 64*floor(101/4) + 37, 4*23 + 101%4]) @@ -329,23 +330,115 @@ This change may make it more difficult to reason about the memory layout when writing the `te.compute` definition. When the physical layout differs from the logical layout, it isn't guaranteed that `A[i]` and `A[i+1]` will be adjacent. For example, a tensor with -`NHWC` logical layout and a `NCHWc` physical layout defined by `[n, -c//4, h, w, c%4]`, logical indices `(0,0,0,3)` and `(0,0,0,4)` will -not be adjacent. +compute definition defined in `NHWC` layout and with layout +transformation to `NCHWc` defined by `[n, c//4, h, w, c%4]`, locations +`(0,0,0,3)` and `(0,0,0,4)` in the compute definition will not be +adjacent. # Rationale and alternatives [rationale-and-alternatives]: #rationale-and-alternatives -This design applies equally to tensors defined as a result of a -computation and to input tensors. In both cases, the -`set_physical_layout` causes all reads/writes to that buffer to obey -the specified layout. In the case of input tensors, it states that -the tensors passed in will be in the specified format. +- Can these design goals be met with existing features? + + The `te.compute` function can be used to define an updated layout. + However, this introduces a new tensor that must be inlined to avoid + additional memory allocation, and cannot be used for input + parameters. + + This design applies equally to tensors defined as a result of a + computation and to input tensors. In both cases, the + `transform_layout` causes all reads/writes to that buffer to obey + the specified layout. In the case of input tensors, it states that + the tensors passed in will be in the specified format. + + +- Should `BufferTransform` apply only to its body, or apply to the + entire graph it is contained in? + + Option 2 is preferred. + + - Option 1: A scope-limited `BufferTransform` would define a + transformation that should apply to any allocations, reads, or + writes that occur to the named buffer within the body of the + `BufferTransform`. However, this couldn't apply to `PrimFunc` + arguments, which are outside of the scope of any node within the + body. + + - Option 2: A `BufferTransform` that applies to all uses within the + entire graph may apply to a buffer that is declared outside of its + body. This would especially be the case for buffers passed as + `PrimFunc` arguments. + + + +- When should the `tir::transform::LowerBufferTransforms` pass be + applied? + + Applying it at the end of phase-2 in `driver_api.cc::CreatePassList` + satisfies these conditions. + + - To ensure that host and device have the same definition for buffer + layout, it should occur before the host/device split in + `MakePackedAPI`. + + - Since other transformations can make use of buffer + transformations, it should otherwise be as late as possible in the + lowering flow. (e.g. `InjectDoubleBuffer` mapping to a new buffer + shape) -The `te.compute` function can be used to define an updated layout. 
-However, this introduces a new tensor that must be inlined to avoid -additional memory allocation, and cannot be used for input tensors. + + +- Should `BufferTransform` re-use functionality of other nodes, + rather than being an independent node? + + Option 1 is preferred. + + - Option 1: Add `BufferTransform` as its own node. + + - Option 2: In TE-based schedules, `AttrStmtNode` could give the + buffer to be transformed, along with the transformation to be + applied, similar to how `buffer_bind_scope` is currently handled. + + The `BufferTransform` must also contain multiple objects that are + not derived from `PrimExpr`, the buffer to be transformed and the + mapping to be applied, while `AttrStmtNode` only allows a single + `ObjectRef` node and a `PrimExpr` value. + + - Option 3: In TensorIR-based schedules, `MatchBufferRegion` could + be extended to also include a transformation while performing the + buffer replacement. + + However, this could make it more difficult to reason about which + locations in the buffer region are being accessed. + + - Option 4: The `BufferNode` object could contain an array of + transformations that should be applied to it during the lowering + process. This would be convenient and allow for arbitrarily many + transformations. + + Wouldn't follow the TVM convention of having annotations external + to the node itself. + + +- Where should transformations to be applied to the function inputs be + specified? + + Option 1 is preferred. + + - Option 1: Any `BufferTransform` that describes a buffer in the + `PrimFuncNode::buffer_map` gets applied to that buffer. + + Would require two traversals, the first to locate all buffer + transforms, and the second to apply them. + + - Option 2: `BufferTransform` nodes listed in the `PrimFunc::attrs` + under a `"buffer_argument_transforms"` key apply to the function arguments. + + Would only need a single traversal to apply. + + Would require other passes to be aware of where a buffer was first + defined, in order to add it to the appropriate location. # Prior art @@ -359,34 +452,20 @@ additional memory allocation, and cannot be used for input tensors. `np.reshape` to split the logical axes, then `np.transpose` to reorder them, then `np.reshape` to merge multiple axes into the N-d physical axes. - + # Unresolved questions [unresolved-questions]: #unresolved-questions -- Representation of `reorder_split` - - - Option: A string, used to look up a function during lowering. - - This would work, but would make the TIR graph less - self-contained, making debugging more difficult. - - - Option: `Array dummy_index` and `Array physical_layout` - - The `physical_layout` expressions would be written in terms of - the variables in `dummy_index`. The physical index for a - load/store would be generated by subsituting the logical indices - for the `dummy_index`. - - - Option: Store as `Array`. - - Each `PrimFunc` takes arguments of the logical index, and - produces a reorder/split index. - What is appropriate terminology for size/shape/extent of physical and logical buffers? - - I am partial to using "shape" both for the N-d logical layout - and the N-d physical layout, and have attempted to use it - consistently through this RFC. + + If Allocate/BufferStore/BufferLoad each hold a reference to the + buffer they act upon, then this becomes a somewhat irrelevant + question, as there is only one `BufferNode::shape`. + + - I am partial to using "shape" both for the N-d parameters, and + have attempted to use it consistently through this RFC. 
- "size" implies a 1-d buffer, which wouldn't be appropriate for an N-d parameter. - "extent" would be a reasonable name, but is currently used by @@ -394,39 +473,45 @@ additional memory allocation, and cannot be used for input tensors. a non-zero value. Since the indices for logical and physical buffers both start at zero, using "extents" for the maximum index would imply some offset. - + + + - How should loops over an array be handled when re-writing the shape? To avoid memory latency issues, loops should iterate over an array sequentially sequentially when possible. Iteration that is defined in terms of the logical layout may be inappropriate for the physical layout. - - - Option: Do nothing, and always keep the same iteration order, as - defined in terms of the logical axes. - + + Option 3 is preferred. + + - Option 1: Do nothing, and always keep the same iteration order, + using the same iteration axes as defined in the compute + definition. + This would produce valid code, but not necessarily performant code. This can be a default behavior during development, to be improved upon. - - - Option: Automatically detect loops that are over the full extent + + - Option 2: Automatically detect loops that are over the full extent of an array in sequential order of the logical layout, and rewrite to be in sequential order of the physical layout. - + This would reduce the memory latency issues, but raises some implementation questions. - If a loop body references multiple tensors with different physical layouts, which should define the loop iteration order? - + - If a series of nested loops contains a `cache_read` or `cache_write` stage, can these be recognized and reordered? - - Option: Expose the `reorder_split` definition to be used as part + - Option 3: Expose the `reorder_split` definition to be used as part of a schedule definition. - + This would allow the greatest flexibility, but would make the - schedule dependent on the physical layout. + schedule dependent on the transformed layout, beyond the one + definition. # Future possibilities [future-possibilities]: #future-possibilities From 8fb21f7e2e5f0ce48a67e30d80126b242d624e09 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 28 Oct 2021 15:59:15 -0500 Subject: [PATCH 08/20] Updated with link to BufferPointer RFC --- rfcs/0039-buffer-physical-layout.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 66362831..f4ac40b6 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -2,7 +2,7 @@ - Start Date: 2021-10-05 - RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) - GitHub Issue: Not Yet Written -- Related RFCs: [RFC#XXXX](TODO: Link to BufferPointer RFC) +- Related RFCs: [RFC#0042](https://github.com/apache/tvm-rfcs/pull/0042) # Summary [summary]: #summary @@ -12,9 +12,9 @@ buffer during the lowering process. These transformations will be part of the schedule, allowing the same compute definition to be used across multiple different layouts. -[RFC#XXXX](TODO: Link to BufferPointer RFC) is intended to make these -buffer transformations easier to write, though it isn't strictly -necessary for this change. +[RFC#0042](https://github.com/apache/tvm-rfcs/pull/0042) is intended +to make these buffer transformations easier to write, though it isn't +strictly necessary for this change. 
# Motivation [motivation]: #motivation From a4cdd95fe3cd826fcbcd16fa0b749b41e20adc64 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 29 Oct 2021 14:57:47 -0500 Subject: [PATCH 09/20] Updated following some comments from @vinx13 --- rfcs/0039-buffer-physical-layout.md | 63 +++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index f4ac40b6..c0b98d8d 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -91,29 +91,28 @@ B_equivalent = te.compute( ``` By default, after all explicitly specified layout transformations are -applied, all axes are flattened to a single physical axis by following -a row-major traversal. This produces a 1-d physical layout, which -corresponds to flat memory. To add additional dimensions in the -physical layout, insert `te.AXIS_SEPARATOR` into the axis -list return by the physical layout function. These define groups of -axes, where each group is combined into a single physical axis. +applied, all axes are flattened to a single axis by following a +row-major traversal. This produces a 1-d buffer, which corresponds to +flat memory. To add additional dimensions in the physical layout, +insert `te.AXIS_SEPARATOR` into the axis list return by the physical +layout function. These define groups of axes, where each group is +combined into a single physical axis. ```python B = te.compute(shape=(M,N,P,Q), ...) -m, n, p, q = B.op.axis s = te.create_schedule(B.op) # Default, produces a 1-d allocation with shape (M*N*P*Q,) -s[B].transform_layout(lambda i: i) +s[B].transform_layout(lambda m,n,p,q: [m,n,p,q]) # One separator, produces a 2-d allocation with shape (M*N, P*Q). -s[B].transform_layout(lambda i: [i[0], i[1], te.AXIS_SEPARATOR, i[2], i[3]]) +s[B].transform_layout(lambda m,n,p,q: [m, n, te.AXIS_SEPARATOR, p, q]) # Two separators, produces a 3-d allocation with shape (M, N*P, Q). -s[B].transform_layout(lambda i: [i[0], te.AXIS_SEPARATOR, i[1], i[2], te.AXIS_SEPARATOR, i[3]]) +s[B].transform_layout(lambda m,n,p,q: [m, te.AXIS_SEPARATOR, n, p, te.AXIS_SEPARATOR, q]) # Can be used along with reorders and splits. -s[B].transform_layout(lambda i: [i[0], i[3]//4, i[1], te.AXIS_SEPARATOR, i[2], i[3]%4]) +s[B].transform_layout(lambda m,n,p,q: [m, q//4, n, te.AXIS_SEPARATOR, p, q%4]) ``` @@ -439,6 +438,41 @@ adjacent. Would require other passes to be aware of where a buffer was first defined, in order to add it to the appropriate location. + + +- What arguments should the function passed to `transform_layout` accept? + + In these examples, `N` is the number of dimensions of the array, + prior to the transformation. + + Option 3 is preferred. + + - Option 1: Accept a list of length `N`. Each element of the list + is a variable corresponding to a coordinate in the input tensor. + + This would be the simplest python implementation, but would + require additional configuration to have named variables in the + mapping. + + - Option 2: Accept `N` named positional arguments (`func(i,j,k)`), where each argument is + a variable corresponding to a coordinate in the input tensor. + + This follows the usual method of defining the `fcompute` function + passed to `te.compute`. This also allows the named variables to + be used as the names in TIR, improving readability. 
+ + However, this wouldn't allow utility functions that define + transformations that apply to an arbitrary number of indices, such + as a layout transformation that changes the last index, while + leaving the other `N-1` indices untouched. + + - Option 3: Accept either `N` named positional arguments + (`func(i,j,k)`), or a variable number of arguments + (`func(*indices)`). + + This follows the same convention as the `fcompute` function passed + to `te.compute`. This would allow either an explicit listing of + all indices as named arguments, or an arbitrary number of indices. # Prior art @@ -506,8 +540,11 @@ adjacent. - If a series of nested loops contains a `cache_read` or `cache_write` stage, can these be recognized and reordered? - - Option 3: Expose the `reorder_split` definition to be used as part - of a schedule definition. + - Option 3: Expose the transformed axes to be used as part of a + schedule definition. In TE, the return value from `AA = + s[A].transform_layout(...)` would be a tensor, and the transformed + axes `AA.op.axis` can then be used for the remainder of the + schedule. This would allow the greatest flexibility, but would make the schedule dependent on the transformed layout, beyond the one From 3a4f050b020c823f94b3aa3f0aeb8aec12d62990 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 1 Nov 2021 12:58:13 -0500 Subject: [PATCH 10/20] Added description of the BufferTransform node. --- rfcs/0039-buffer-physical-layout.md | 58 +++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index c0b98d8d..55f95df0 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -152,6 +152,50 @@ are deprecated. will instead create `BufferLoad` and `BufferStore` nodes with 1-d indices. +- BufferTransform + - Indicates a transformation that should be performed to modify the + specified buffer. + + - A possible structure for the `BufferTransform` node is shown below. + + ``` + class BufferTransform : public Stmt { + public: + // The buffer var to be transformed. All buffers that have + // `BufferNode::data` equal to this `buffer_var` should have their + // physical layout rewritten. + Var buffer_var; + + // The transformation to be applied to the buffer. + IndexMap layout_transformation; + + // The statement containing buffer allocations/accesses that should + // be rewritten. + Stmt body; + }; + + class IndexMap : public Object { + public: + /*! \brief Variables representing the indices prior to remapping. + * + * If initial_index is empty, then final_index should also be + * empty, and no mapping is applied. + */ + Array initial_index; + + /*! + * \brief Expressions defining the indices after remapping. + * + * These expressions should only be in terms of the initial_index, + * and must be expressible as a `tvm::arith::IterSumExpr`. The + * mapping from `initial_index` to `final_index` must be injective. + * + * If final_index is empty, then initial_index should also be + * empty, and the map is an identity function. + */ + Array final_index; + }; + ``` - AllocateNode - Allocation of a buffer, in physical layout. @@ -438,25 +482,25 @@ adjacent. Would require other passes to be aware of where a buffer was first defined, in order to add it to the appropriate location. - - + + - What arguments should the function passed to `transform_layout` accept? 
In these examples, `N` is the number of dimensions of the array, prior to the transformation. - + Option 3 is preferred. - Option 1: Accept a list of length `N`. Each element of the list is a variable corresponding to a coordinate in the input tensor. - + This would be the simplest python implementation, but would require additional configuration to have named variables in the mapping. - Option 2: Accept `N` named positional arguments (`func(i,j,k)`), where each argument is a variable corresponding to a coordinate in the input tensor. - + This follows the usual method of defining the `fcompute` function passed to `te.compute`. This also allows the named variables to be used as the names in TIR, improving readability. @@ -465,11 +509,11 @@ adjacent. transformations that apply to an arbitrary number of indices, such as a layout transformation that changes the last index, while leaving the other `N-1` indices untouched. - + - Option 3: Accept either `N` named positional arguments (`func(i,j,k)`), or a variable number of arguments (`func(*indices)`). - + This follows the same convention as the `fcompute` function passed to `te.compute`. This would allow either an explicit listing of all indices as named arguments, or an arbitrary number of indices. From 7fa99801671fe6ebf2424496a9f1850697b28f39 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 3 Nov 2021 11:41:49 -0500 Subject: [PATCH 11/20] Added details on te.AXIS_SEPARATOR --- rfcs/0039-buffer-physical-layout.md | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 55f95df0..c9364701 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -116,6 +116,15 @@ s[B].transform_layout(lambda m,n,p,q: [m, q//4, n, te.AXIS_SEPARATOR, p, q%4]) ``` +The `te.AXIS_SEPARATOR` object exists only within the API interface, +and does not have a representation within the generated TIR graph. +Instead, it is used to indicate that the `BufferTransform` inserted to +represent the row-major traversal of the N-d buffer to generates a +flat 1-d index into the underlying array shouldn't be generated. +Instead, the TIR graph will contain a `BufferTransform` that generates +a M-d index, where `M` is one greater than the number of +`te.AXIS_SEPARATOR` instances in the expression given. + # Reference-level explanation [reference-level-explanation]: #reference-level-explanation @@ -366,6 +375,7 @@ with Allocate(x, shape=[16 * (128/4) * 64, 64 * 4]): ``` + # Drawbacks [drawbacks]: #drawbacks @@ -534,6 +544,32 @@ adjacent. # Unresolved questions [unresolved-questions]: #unresolved-questions +- Should the `te.AXIS_SEPARATOR` appear in the TIR graph? + + Option 1 is preferred. + + - Option 1: The `te.AXIS_SEPARATOR` is a TE-specific concept, and + does not appear in the generated TIR graph. Instead, it changes + the `BufferTransform` node that represent the flattening of + buffers to a device-supported number of indices. + + This would be a unified way to represent all layout + transformations in the TIR graph, which may or may not change the + dimensionality of the buffer. The flattening of buffers to a + device-supported dimensionality would be handled identically to + any other layout transformation, rather than having an implicit + row-major traversal. + + - Option 2: The `te.AXIS_SEPARATOR` is represented in the TIR graph, + and alters the behavior of the `StorageFlatten` pass. 
There is no + `BufferTransform` node that represents the flattening of + + In a TIR graph without any other modifications, this would + maintain the current behavior of the `StorageFlatten` pass, which + reduces the N-d buffer to a 1-d buffer by a row-major traversal. + In a TIR graph with some additional annotation to represent the + `M` axis separators, the N-d buffer could instead be reduced to a + `M+1`-d buffer. - What is appropriate terminology for size/shape/extent of physical and logical buffers? From d02a5ce8b397ac8a2e8268c2c30a27b7f488d212 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 4 Nov 2021 11:13:30 -0500 Subject: [PATCH 12/20] Updated buffer transformations to be in the PrimFunc::attrs --- rfcs/0039-buffer-physical-layout.md | 224 ++++++++++++++-------------- 1 file changed, 112 insertions(+), 112 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index c9364701..4c14c860 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -117,23 +117,22 @@ s[B].transform_layout(lambda m,n,p,q: [m, q//4, n, te.AXIS_SEPARATOR, p, q%4]) The `te.AXIS_SEPARATOR` object exists only within the API interface, -and does not have a representation within the generated TIR graph. -Instead, it is used to indicate that the `BufferTransform` inserted to -represent the row-major traversal of the N-d buffer to generates a -flat 1-d index into the underlying array shouldn't be generated. -Instead, the TIR graph will contain a `BufferTransform` that generates -a M-d index, where `M` is one greater than the number of -`te.AXIS_SEPARATOR` instances in the expression given. +and is not part of the representation of the layout transformation +within the generated TIR graph. Instead, the TIR graph will contain +an integer list of axis separators, to be used when flattening buffers +to device-supported dimensions in the `StorageFlatten` or +`FlattenBuffer` passes. # Reference-level explanation [reference-level-explanation]: #reference-level-explanation -Transformation of a buffer is represented by a `BufferTransformNode`. -It specifies a buffer to be reshaped, and the transformation to be -applied to it. Many of the utilities needed for this transformation -already exist in `iter_affine_map.h`, and are used in the -implementation. +Transformation of a buffer is represented by the attribute +`"buffer_layout_transformations"` in the `PrimFunc` attributes. This +is a map whose keys are buffer var to be reshaped, and whose values +are the transformations to be applied. Many of the utilities +needed for this transformation already exist in `iter_affine_map.h`, +and are used in the implementation. A buffer may be allocated with `AllocateNode`, and may be interacted with using `BufferLoadNode` and `BufferStoreNode`. @@ -161,29 +160,48 @@ are deprecated. will instead create `BufferLoad` and `BufferStore` nodes with 1-d indices. -- BufferTransform - - Indicates a transformation that should be performed to modify the - specified buffer. - - A possible structure for the `BufferTransform` node is shown below. - ``` - class BufferTransform : public Stmt { - public: - // The buffer var to be transformed. All buffers that have - // `BufferNode::data` equal to this `buffer_var` should have their - // physical layout rewritten. - Var buffer_var; +- AllocateNode + - Allocation of a buffer, in physical layout. - // The transformation to be applied to the buffer. 
- IndexMap layout_transformation; + - Declares an allocation of a buffer. - // The statement containing buffer allocations/accesses that should - // be rewritten. - Stmt body; - }; + - Change from previous behavior: Previously, `AllocateNode` held the + `buffer_var`, datatype, and buffer extents directly. After + implementation of this RFC, `AllocateNode` will instead hold the + `Buffer` that is to be allocated. + + +- LoadNode/StoreNode + - Read/write of a 1-d buffer, given a `Var` pointer to the start of + the buffer and a single index. - class IndexMap : public Object { + - Deprecated, should instead use `BufferLoad` and `BufferStore` with + a 1-d index. + + +## Impacted tir Transformations + +- `ApplyBufferTransforms` + - A new pass that takes as input a TIR graph that may have buffer + transformations stored in the `PrimFunc` attributes. Returns + a TIR graph with all buffer transforms applied as specified. + + - Rewrite `indices` in BufferStore/BufferLoad nodes based on the + specified transformation. + + - The transformations are stored as a `Map>` in + the `"buffer_layout_transformations"` attribute of a primfunc. + All buffers whose `BufferNode::data` is a key in this map should + have their physical layout rewritten. If the array contains + multiple transformations, they are applied sequentially. + + A possible structure for the `IndexMap` node is shown + below. + + ``` + class IndexMapNode : public Object { public: /*! \brief Variables representing the indices prior to remapping. * @@ -206,48 +224,31 @@ are deprecated. }; ``` -- AllocateNode - - Allocation of a buffer, in physical layout. - - - Declares an allocation of a buffer. - - - Change from previous behavior: Previously, `AllocateNode` held the - `buffer_var`, datatype, and buffer extents directly. After - implementation of this RFC, `AllocateNode` will instead hold the - `Buffer` that is to be allocated. - - -- LoadNode/StoreNode - - Read/write of a 1-d buffer, given a `Var` pointer to the start of - the buffer and a single index. - - - Deprecated, should instead use `BufferLoad` and `BufferStore` with - a 1-d index. - - -## Impacted tir Transformations - -- `ApplyBufferTransform` - - A new pass that takes as input a TIR graph that may have - `BufferTransform` nodes present. The return from - `ApplyBufferTransform` has all `BufferTransform` nodes removed, - with the buffers marked in them reordered as specified. - - - Rewrite `indices` in BufferStore/BufferLoad nodes based on the - specified transformation. + - After applying the transformations, the + `"buffer_layout_transformations"` attribute should be removed. + This ensures that additional application of + `ApplyBufferTransforms` has no effect. - FlattenBuffer/StorageFlatten - - Can be implemented as the addition of a `BufferTransform`, which - is later lowered by `ApplyBufferTransform`. - Existing passes that convert from logical layout to physical layout for TE schedules (StorageFlatten) or TensorIR schedules (FlattenBuffer). - - Use the `N-1` axis separators specified in BufferNode to convert to - an N-d physical layout. The default of having 0 axis separators - will correspond to the previous behavior of flattening to a 1-d - physical layout. + - The transformations are stored as a `Map>` in + the `"buffer_axis_separators"` attribute of a primfunc. All + buffers whose `BufferNode::data` is a key in this map should be + flattened to an output buffer of dimension + `separators[buf->data].size()+1`. All other buffers should be + flattened to a 1-d output buffer. 
+ + - After flattening a buffer to an N-d output, the corresponding + value in the `"buffer_axis_separators"` attribute should be set to + `range(N-1)`. This ensures that repeated application of the + flattening passes have no additional effect. (The attribute + shouldn't be deleted entirely, as that would cause a flattened + buffer with `N` dimensions and an unflattened buffer with `N` + dimensions to have identical representations.) ## Examples @@ -258,25 +259,25 @@ final version of TensorIR that implements this RFC. Numeric values are shown unsimplified to indicate where they come from. The first example shows a 2-d buffer with no layout transformations -explicitly specified. The generated TIR includes a `BufferTransform` -annotation to apply a row-major traversal and generate a flat 1-d -buffer. +explicitly specified. The generated `PrimFunc` has no +`"buffer_layout_transformations"` attribute, and so the default +behavior is used, applying a row-major traversal to generate a flat +1-d buffer. ```python # In TE schedule, no call to transform_layout. # Initial TIR graph x = Buffer(name="x", shape=[2,3]) -BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) with Allocate(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) -# After applying the implicit flattening to 1-d +# After flattening to 1-d x = Var(name="x") with Allocate(x, shape=[2*3]): - val = Load(x, index=[10*3 + 15]) - Store(x, 7, index=[20*3 + 23]) + val = BufferLoad(x, [10*3 + 15]) + BufferStore(x, 7, [20*3 + 23]) ``` This next example shows a 2-d logical buffer, which is lowered to a @@ -289,30 +290,28 @@ first index in the logical layout. # s[x].transform_layout(lambda i,j: [j,i]) # Initial TIR graph +attrs["buffer_layout_transformations"][x] = lambda i,j: [j,i] x = Buffer(name="x", shape=[2,3]) -BufferTransform(x, lambda i,j: [j,i]) -BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) with Allocate(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) # After applying the explicit reordering x = Buffer(name="x", shape=[3,2]) -BufferTransform(x, lambda i,j: [i*x.shape[1] + j]) with Allocate(x): - val = BufferLoad(x, index=[15, 10]) - BufferStore(x, 7, index=[23, 20]) + val = BufferLoad(x, [15, 10]) + BufferStore(x, 7, [23, 20]) -# After applying the implicit flattening to 1-d +# After flattening to 1-d x = Var(name="x") with Allocate(x, shape=[3*2]): - val = Load(x, index=[15*2 + 10]) - Store(x, 7, index=[23*2 + 20]) + val = BufferLoad(x, [15*2 + 10]) + BufferStore(x, 7, [23*2 + 20]) ``` The next example shows a remapping from NHWC logical layout to NCHWc physical layout. The 4 logical axes are expanded to 5 logical axes -during the SplitReorderIndices pass, then flattened into 1 physical +during the `ApplyBufferTransforms` pass, then flattened into 1 physical axis during StorageFlatten/FlattenBuffer. ```python @@ -320,22 +319,20 @@ axis during StorageFlatten/FlattenBuffer. 
# s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, w, c%4]) # Initial TIR graph +attrs["buffer_layout_transformations"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[]) -BufferTransform(x, lambda n,h,w,c: [n, c//4, h, w, c%4]) -BufferTransform(x, lambda n,C_outer,h,w,c_inner: [x.shape[4]*(x.shape[3]*(x.shape[2]*(x.shape[1]*n + C_outer) + h) + w) + c_inner] with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) # After applying the explicit reordering x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], reorder_splits=[], axis_separators=[]) -BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[4]*(x.shape[3]*(x.shape[2]*(x.shape[1]*n + c_outer) + h) + w) + c_inner] with Allocate(x): val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) -# After applying the implicit flattening to 1-d +# After flattening to 1-d x = Var(name="x") with Allocate(x, shape=[16 * (128/4) * 64 * 64 * 4]): - val = Load(x, index=[(128/4)*64*64*4*11 + 64*64*4*floor(101/4) + 64*4*37 + 4*23 + 101%4]) + val = BufferLoad(x, index=[(128/4)*64*64*4*11 + 64*64*4*floor(101/4) + 64*4*37 + 4*23 + 101%4]) ``` Lastly, an example of remapping from `NHWC` logical layout to `NCHWc` @@ -352,26 +349,26 @@ target-specific codegen. # s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, te.AXIS_SEPARATOR, w, c%4]) # Initial TIR graph +attrs["buffer_layout_transformations"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] +attrs["buffer_axis_separators"][x] = [2] x = Buffer(name="x", shape=[16,64,64,128]) - -BufferTransform(x, lambda n,h,w,c: [n, c//4, h, w, c%4]) -BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[1]*(x.shape[2]*c_outer + n) + h, - x.shape[4]*w + c_inner]) with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) # After applying the explicit reordering. +attrs["buffer_axis_separators"][x] = [2] x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4]) -BufferTransform(x, lambda n,c_outer,h,w,c_inner: [x.shape[1]*(x.shape[2]*c_outer + n) + h, - x.shape[4]*w + c_inner]) with Allocate(x): val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) -# After applying the implicit flattening. The final result is 2-d, -# due to the te.AXIS_SEPARATOR used in the `.transform_layout`. +# After applying StorageFlatten or FlattenBuffer. The final result is +# 2-d, due to the te.AXIS_SEPARATOR used in the `.transform_layout`. +# The `"buffer_axis_separators"` attribute is set to [0], to +# distinguish this 2-d flattened buffer from a 2-d unflattened buffer. +attrs["buffer_axis_separators"][x] = [0] x = Var(name="x") with Allocate(x, shape=[16 * (128/4) * 64, 64 * 4]): - val = Load(x, index=[(128/4)*64*11 + 64*floor(101/4) + 37, 4*23 + 101%4]) + val = BufferLoad(x, index=[(128/4)*64*11 + 64*floor(101/4) + 37, 4*23 + 101%4]) ``` @@ -406,26 +403,29 @@ adjacent. the tensors passed in will be in the specified format. -- Should `BufferTransform` apply only to its body, or apply to the - entire graph it is contained in? +- Should buffer transformations be a node within a TIR graph, or an + attribute? + + Option 1 is preferred. + + - Option 1: The transformations are stored in attributes of + `PrimFunc`. - Option 2 is preferred. + This makes it clear that the transformations apply to all uses of + the buffer within the graph, and are not scoped to some region of + the TIR graph. 
- - Option 1: A scope-limited `BufferTransform` would define a - transformation that should apply to any allocations, reads, or - writes that occur to the named buffer within the body of the - `BufferTransform`. However, this couldn't apply to `PrimFunc` - arguments, which are outside of the scope of any node within the - body. + - Option 2: The transformations are stored in node that inherits + from `tir::Stmt`. - - Option 2: A `BufferTransform` that applies to all uses within the - entire graph may apply to a buffer that is declared outside of its - body. This would especially be the case for buffers passed as - `PrimFunc` arguments. + This would be easier for other passes to visit using + `StmtVisitor`, if the layout transformations require modification. + However, it would add confusion if a `Stmt` impacts buffers far + outside its own scope. -- When should the `tir::transform::LowerBufferTransforms` pass be +- When should the `tir::transform::ApplyBufferTransforms` pass be applied? Applying it at the end of phase-2 in `driver_api.cc::CreatePassList` @@ -442,12 +442,12 @@ adjacent. -- Should `BufferTransform` re-use functionality of other nodes, - rather than being an independent node? +- Should buffer transformations re-use functionality of other nodes? Option 1 is preferred. - - Option 1: Add `BufferTransform` as its own node. + - Option 1: Add buffer transformations as an attribute to the + `PrimFunc`. - Option 2: In TE-based schedules, `AttrStmtNode` could give the buffer to be transformed, along with the transformation to be From 84ca0ad4370817cb5651f95fcce7c0018b32e1a2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 4 Nov 2021 14:38:50 -0500 Subject: [PATCH 13/20] Added author list, Eric/Wuwei --- rfcs/0039-buffer-physical-layout.md | 1 + 1 file changed, 1 insertion(+) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 4c14c860..30141503 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -1,4 +1,5 @@ - Feature Name: Buffer Physical Layout +- Authors: Eric Lunderberg (@Lunderberg), Wuwei Lin (@vinx13) - Start Date: 2021-10-05 - RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) - GitHub Issue: Not Yet Written From 3fa627f59ff85697317655575f7e3d2b2885f9af Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 9 Nov 2021 09:35:16 -0600 Subject: [PATCH 14/20] Clarifying updates following comments from @areusch --- rfcs/0039-buffer-physical-layout.md | 76 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 30141503..f245d6f5 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -8,7 +8,7 @@ # Summary [summary]: #summary -This RFC introduces layout transformations that can be applies to a +This RFC introduces layout transformations that can be applied to a buffer during the lowering process. These transformations will be part of the schedule, allowing the same compute definition to be used across multiple different layouts. @@ -21,12 +21,15 @@ strictly necessary for this change. [motivation]: #motivation Currently, TVM assumes that all buffers can be treated as flat memory. -That is, while a tensor may have N dimensions, the underlying buffer -allocated by the low-level codegen has a single value defining the -size, and access into that buffer is done using a single index. 
This
-assumptions holds for most cases, such as a CPU accessing RAM, but
-doesn't hold in all cases. For example, texture memory on a GPU
-requires two indices to access.
+That is, while a rank-N tensor requires N values to describe its shape
+and N indices to identify a particular value within it, the underlying
+buffer allocated by the low-level codegen has a single value defining
+the size, and access into that buffer is done using a single index.
+This assumption holds for most cases, such as a CPU accessing RAM,
+but doesn't hold in all cases. For example, texture memory on a GPU
+requires two indices to access. These are currently handled on a
+case-by-case basis, such as using `tvm::tir::builtin::texture2d_store`
+in a `CallNode`.
 
 In addition, computations that are semantically identical (e.g. 2-d
 convolution) require independent compute definitions and schedules
@@ -34,11 +37,11 @@
 accepted as input.
 
 This RFC introduces a mechanism to specify transformations to be
-applied to the layout of buffers in memory, including the option to
-present multi-dimensional indices to the low-level code generators.
-This will allow for target-specific handling of non-flat memory, and
-will allow for code re-use across compute definitions that differ only
-in memory layout.
+applied to the layout of buffers in memory, along with a unified
+method of presenting multiple indices to the low-level code
+generators. This will allow for target-specific handling of non-flat
+memory, and will allow for code re-use across compute definitions that
+differ only in memory layout.
 
 # Guide-level explanation
 [guide-level-explanation]: #guide-level-explanation
 
@@ -52,14 +55,13 @@ access is done using `tvm::tir::BufferLoad` and
 `tvm::tir::BufferStore` for reads and writes, respectively.
 
 When a TIR graph is passed into the low-level code generator
-`tvm::codegen::Build`, the dimensionality of each buffer must be
-supported by the target code generator. Typically, this will mean
-generating a 1-dimensional index representing access into flat memory.
-Some code generators may attach alternative semantics for
-multi-dimensional buffers (e.g. 2-d buffers to represent texture
-memory on OpenCL). A low-level code generator should check the
-dimensionality of the buffers it is acting on, and give a diagnostic
-error for unsupported dimensionality.
+`tvm::codegen::Build`, the rank of each buffer must be supported by
+the target code generator. Typically, this will mean generating a
+single index representing access into flat memory. Some code
+generators may attach alternative semantics for `rank>1`
+buffers (e.g. rank-2 buffers to represent texture memory on OpenCL).
+A low-level code generator should check the rank of the buffers it is
+acting on, and give a diagnostic error for unsupported rank.
 
 To define the layout transformation in a TE schedule, use the
 `transform_layout` method of a schedule, as shown below. The
@@ -94,7 +96,7 @@ B_equivalent = te.compute(
 By default, after all explicitly specified layout transformations are
 applied, all axes are flattened to a single axis by following a
 row-major traversal. This produces a 1-d buffer, which corresponds to
-flat memory. To add additional dimensions in the physical layout,
+flat memory. To produce `rank>1` buffers in the physical layout,
 insert `te.AXIS_SEPARATOR` into the axis list return by the physical
 layout function.
These define groups of axes, where each group is combined into a single physical axis. @@ -121,8 +123,8 @@ The `te.AXIS_SEPARATOR` object exists only within the API interface, and is not part of the representation of the layout transformation within the generated TIR graph. Instead, the TIR graph will contain an integer list of axis separators, to be used when flattening buffers -to device-supported dimensions in the `StorageFlatten` or -`FlattenBuffer` passes. +to device-supported rank in the `StorageFlatten` or `FlattenBuffer` +passes. # Reference-level explanation @@ -239,7 +241,7 @@ are deprecated. - The transformations are stored as a `Map>` in the `"buffer_axis_separators"` attribute of a primfunc. All buffers whose `BufferNode::data` is a key in this map should be - flattened to an output buffer of dimension + flattened to an output buffer of rank `separators[buf->data].size()+1`. All other buffers should be flattened to a 1-d output buffer. @@ -248,8 +250,8 @@ are deprecated. `range(N-1)`. This ensures that repeated application of the flattening passes have no additional effect. (The attribute shouldn't be deleted entirely, as that would cause a flattened - buffer with `N` dimensions and an unflattened buffer with `N` - dimensions to have identical representations.) + rank-`N` buffer and an unflattened rank-`N` buffer to have + identical representations.) ## Examples @@ -497,8 +499,8 @@ adjacent. - What arguments should the function passed to `transform_layout` accept? - In these examples, `N` is the number of dimensions of the array, - prior to the transformation. + In these examples, the tensor is rank `N` prior to the + transformation. Option 3 is preferred. @@ -509,8 +511,9 @@ adjacent. require additional configuration to have named variables in the mapping. - - Option 2: Accept `N` named positional arguments (`func(i,j,k)`), where each argument is - a variable corresponding to a coordinate in the input tensor. + - Option 2: Accept `N` named positional arguments (`func(i,j,k)`), + where each argument is a variable corresponding to a coordinate in + the input tensor. This follows the usual method of defining the `fcompute` function passed to `te.compute`. This also allows the named variables to @@ -556,10 +559,10 @@ adjacent. This would be a unified way to represent all layout transformations in the TIR graph, which may or may not change the - dimensionality of the buffer. The flattening of buffers to a - device-supported dimensionality would be handled identically to - any other layout transformation, rather than having an implicit - row-major traversal. + rank of the buffer. The flattening of buffers to a + device-supported rank would be handled identically to any other + layout transformation, rather than having an implicit row-major + traversal. - Option 2: The `te.AXIS_SEPARATOR` is represented in the TIR graph, and alters the behavior of the `StorageFlatten` pass. There is no @@ -594,9 +597,8 @@ adjacent. - How should loops over an array be handled when re-writing the shape? To avoid memory latency issues, loops should iterate over an array - sequentially sequentially when possible. Iteration that is defined - in terms of the logical layout may be inappropriate for the physical - layout. + sequentially when possible. Iteration that is defined in terms of + the logical layout may be inappropriate for the physical layout. Option 3 is preferred. 
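The axis-separator flattening described in the patch above can be summarized
with a short standalone sketch. This is purely illustrative: the helper name
`flatten_with_separators` is hypothetical and not part of the proposed API,
and the grouping convention (each separator marks the last transformed axis of
a group) is inferred from the NHWC-to-NCHWc examples elsewhere in this RFC.

```python
def flatten_with_separators(shape, axis_separators):
    """Row-major flattening of a transformed shape into one extent per group.

    Hypothetical helper for illustration only. `shape` is the buffer shape
    after all explicit layout transformations; each entry of
    `axis_separators` marks the last axis of a group.
    """
    groups = []
    start = 0
    for sep in list(axis_separators) + [len(shape) - 1]:
        extent = 1
        for dim in shape[start : sep + 1]:
            extent *= dim
        groups.append(extent)
        start = sep + 1
    return groups


# Transformed NCHWc shape used in this RFC's examples: [16, 128//4, 64, 64, 4]
assert flatten_with_separators([16, 32, 64, 64, 4], []) == [16 * 32 * 64 * 64 * 4]
assert flatten_with_separators([16, 32, 64, 64, 4], [2]) == [16 * 32 * 64, 64 * 4]
```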
From bf9d46ac19e533c038eb3ea3c73bbbebb73e06cf Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 13 Dec 2021 09:40:35 -0600 Subject: [PATCH 15/20] Remove reference to RFC#0042 Since we decided against RFC#0042, the reference is no longer needed here. --- rfcs/0039-buffer-physical-layout.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index f245d6f5..2f482255 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -3,7 +3,6 @@ - Start Date: 2021-10-05 - RFC PR: [apache/tvm-rfcs#0039](https://github.com/apache/tvm-rfcs/pull/0039) - GitHub Issue: Not Yet Written -- Related RFCs: [RFC#0042](https://github.com/apache/tvm-rfcs/pull/0042) # Summary [summary]: #summary @@ -11,11 +10,9 @@ This RFC introduces layout transformations that can be applied to a buffer during the lowering process. These transformations will be part of the schedule, allowing the same compute definition to be used -across multiple different layouts. - -[RFC#0042](https://github.com/apache/tvm-rfcs/pull/0042) is intended -to make these buffer transformations easier to write, though it isn't -strictly necessary for this change. +across multiple different layouts. These transformations can produce +either flat memory buffers or multi-dimensional memory buffers to be +exposed to the low-level code generators. # Motivation [motivation]: #motivation From 07f79ec12206be2c0a4269069c00220e27ab161d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 13 Dec 2021 09:41:30 -0600 Subject: [PATCH 16/20] Updated with examples of returned loop iterators. Also, removed implementation details that didn't match the implementation. A new stage wasn't necessary, and the loop iterators could instead be updated in the existing stage, similar to `fuse()` and `split()`. --- rfcs/0039-buffer-physical-layout.md | 32 +++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index 2f482255..ab17e650 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -69,8 +69,7 @@ arguments to `transform_layout` is a function that accepts a list of transformations are applied. For example, below defines the reordering from NHWC logical layout to -NCHWc physical layout. Similar to `cache_read` and `cache_write`, the -`transform_layout` method introduces a new stage in the schedule. +NCHWc physical layout. ```python # Compute definition, written in terms of NHWC logical axes @@ -123,6 +122,35 @@ an integer list of axis separators, to be used when flattening buffers to device-supported rank in the `StorageFlatten` or `FlattenBuffer` passes. +If the tensor whose layout is being transformed is the result of +`te.compute`, then the loop iteration order over that tensor will be +rewritten to be along the updated memory layout. If the loop +iteration order is modified, these new loop iteration variables will +be returned from `transform_layout()`. + +```python +A = te.placeholder(shape=[16,64,128]) +B = te.compute(A.shape, lambda i,j,k: 2*A[i,j,k]) + +s = te.create_schedule(B.op) + +# A is an input placeholder, and doesn't have nested loops that +# generate it. Therefore, while the layout of A is rewritten along +# with any reads/writes into A, there are no loop iterators to be +# rewritten and no loop iterators are returned. 
+s[A].transform_layout(lambda i,j,k: [i*64 + j, k//4, k%4]) + +# B is a computed tensor, and is computed inside a sequence of nested +# loops. Therefore, when B's layout is rewritten, those nested loops +# are also rewritten, and the corresponding loop iterators are +# returned. +i_outer, jk_merged, i_inner = s[B].transform_layout(lambda i,j,k: [i//4, 128*j + k, i%4]) + +# The loop iterators returned by transform_layout() can be used later +# in the schedule, if the iteration order should be different from the +# layout order of the output tensor. +s[B].reorder(i_outer, i_inner, jk_merged) +``` # Reference-level explanation [reference-level-explanation]: #reference-level-explanation From 4734bf465497958383f322efd6e0c576aec18c14 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 14 Dec 2021 11:22:16 -0600 Subject: [PATCH 17/20] Updated examples of buffer flattening to have valid shape/indices. Initial example used a buffer shape of `[2,3]`, which was smaller than the indices used in the example. --- rfcs/0039-buffer-physical-layout.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index ab17e650..ed53048a 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -296,16 +296,16 @@ behavior is used, applying a row-major traversal to generate a flat # In TE schedule, no call to transform_layout. # Initial TIR graph -x = Buffer(name="x", shape=[2,3]) +x = Buffer(name="x", shape=[64,128]) with Allocate(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) # After flattening to 1-d x = Var(name="x") -with Allocate(x, shape=[2*3]): - val = BufferLoad(x, [10*3 + 15]) - BufferStore(x, 7, [20*3 + 23]) +with Allocate(x, shape=[64*128]): + val = BufferLoad(x, [10*128 + 15]) + BufferStore(x, 7, [20*128 + 23]) ``` This next example shows a 2-d logical buffer, which is lowered to a @@ -319,22 +319,22 @@ first index in the logical layout. # Initial TIR graph attrs["buffer_layout_transformations"][x] = lambda i,j: [j,i] -x = Buffer(name="x", shape=[2,3]) +x = Buffer(name="x", shape=[64,128]) with Allocate(x): val = BufferLoad(x, [10, 15]) BufferStore(x, 7, [20, 23]) # After applying the explicit reordering -x = Buffer(name="x", shape=[3,2]) +x = Buffer(name="x", shape=[128,64]) with Allocate(x): val = BufferLoad(x, [15, 10]) BufferStore(x, 7, [23, 20]) # After flattening to 1-d x = Var(name="x") -with Allocate(x, shape=[3*2]): - val = BufferLoad(x, [15*2 + 10]) - BufferStore(x, 7, [23*2 + 20]) +with Allocate(x, shape=[128*64]): + val = BufferLoad(x, [15*64 + 10]) + BufferStore(x, 7, [23*64 + 20]) ``` The next example shows a remapping from NHWC logical layout to NCHWc From de4e0395aebdf75cc1a60fb27bbf88db40c3ecd5 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 9 Feb 2022 12:04:11 -0600 Subject: [PATCH 18/20] Added discussion on buffer index conventions. --- rfcs/0039-buffer-physical-layout.md | 117 ++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index ed53048a..f184a2a3 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -558,6 +558,123 @@ adjacent. all indices as named arguments, or an arbitrary number of indices. +- What convention should be used for buffer indexing? 
+ + Previously, the interpretation of an index into a buffer depended on + whether the buffer was being accessed with + `BufferStore`/`BufferLoad` (pre-flattening) or with `Store`/`Load` + (post-flattening). Since the same data structures will be used at + all lowering stages, the indexing should have consistent semantics. + + Option 1 is preferred. + + - Option 1: When accessing a buffer, the type and offset are based on + `buffer->dtype`. + + The offset of an element is given by `index * + sizeof(buffer->dtype)`. The type of the element being accessed is + `buffer->dtype.with_lanes(index.lanes() * buffer->dtype.lanes())`. + + This is the convention used by user-defined schedules in TE, and + in BufferLoad/BufferStore objects. In this convention, scalar + loads and vectorized loads can be expressed for scalar buffers and + vectorized buffers. Accessing a buffer to return a different + datatype requires declaring an aliasing buffer that shares the + same backing array. + + ```python + @T.prim_func + def scalar_load_from_scalar_buffer(A: T.Buffer[(64,), "float32"]): + assert A[0].dtype == "float32" + + + @T.prim_func + def vector_load_from_vector_buffer(A: T.Buffer[(16,), "float32x4"]): + assert A[0].dtype == "float32x4" + + + @T.prim_func + def vector_load_from_vector_buffer(A: T.Buffer[(16,), "float32x4"]): + A_vector_2 = T.buffer_decl([32], "float32x2", data=A.data) + assert A[0].dtype == "float32x4" + assert A_vector_2[0].dtype == "float32x2" + + + @T.prim_func + def vector_load_from_scalar_buffer_option1(A: T.Buffer[(64,), "float32"]): + assert A[T.ramp(0, 1, 4)].dtype == "float32x4" + + + @T.prim_func + def vector_load_from_scalar_buffer_option2(A: T.Buffer[(64,), "float32"]): + A_vector = T.buffer_decl([16], "float32x4", data=A.data) + assert A_vector[0].dtype == "float32x4" + + + @T.prim_func + def scalar_load_from_vector_buffer(A: T.Buffer[(16,), "float32x4"]): + A_scalar = T.buffer_decl([64], "float32", data=A.data) + assert A_scalar[0].dtype == "float32" + ``` + + - Pro: The return type of `buf[0]` is always `buf.dtype`, even + when `buf.dtype` is a vectorized type. + + - Pro: No changes needed on the user-defined schedules. + + - Con: Requires updates to code generators to follow this new + convention. However, the code generators will already require + updates to support BufferLoad/BufferStore. + + - Option 2: When accessing a buffer, the type and offset are based on + `buffer->dtype.element_of()`. + + The offset of an element is given by `index * + sizeof(buffer->dtype.element_of())`. The type of the element + being accessed is `buffer->dtype.with_lanes(index.lanes())`. + + Prior to this RFC, this is the convention used by Load/Store + nodes. In this convention, scalar loads and vectorized loads can + be expressed for scalar buffers and vectorized buffers. Accessing + a buffer to return a vectorized datatype requires using a + vectorized index, even if the buffer holds a vectorized datatype. 
+ + ```python + @T.prim_func + def scalar_load_from_scalar_buffer(A: T.Buffer[(64,), "float32"]): + assert A[0].dtype == "float32" + + + @T.prim_func + def vector_load_from_vector_buffer(A: T.Buffer[(16,), "float32x4"]): + assert A[T.ramp(0, 1, 4)].dtype == "float32x4" + + + @T.prim_func + def scalar_load_from_vector_buffer(A: T.Buffer[(16,), "float32x4"]): + assert A[0].dtype == "float32" + + + @T.prim_func + def vector_load_from_scalar_buffer(A: T.Buffer[(64,), "float32"]): + assert A[T.ramp(0, 1, 4)].dtype == "float32x4" + ``` + + - Pro: The number of lanes of output can be determined solely from + the index used to access the buffer. That is, `A[0]` is + guaranteed to have one lane of output, and `A[Ramp(0, stride=1, + lanes=4)]` is guaranteed to have four lanes of output. + + - Con: Access of a buffer with scalar index does not always have + the same datatype as the buffer. If the buffer has a vectorized + datatype, then `buf[0].dtype != buf.dtype`. + + - Con: Need explicit check for vectorized types at the codegen + level. + + - Con: Requires updates to user-defined schedules. + + # Prior art [prior-art]: #prior-art From 014994b175a526fc6211eba06c142fb97b96e248 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 11 Feb 2022 14:17:40 -0600 Subject: [PATCH 19/20] Typo, forgot to end a sentence. --- rfcs/0039-buffer-physical-layout.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index f184a2a3..fbf91663 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -171,7 +171,9 @@ are deprecated. ## Impacted TIR Nodes - BufferNode - - Describes a N-d buffer. The layout of the buffer may be + - Describes a N-d buffer. This may directly represent a tensor (N-d + buffer produced by TE), a flat memory array (1-d buffer as input + to the low-level codegen), or intermediates between them. - BufferRealizeNode - Realization of a buffer, in logical layout. From b8e3c18c3d81f3a27580d42a24bbd77a79984976 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 15 Feb 2022 16:03:51 -0600 Subject: [PATCH 20/20] Typo fixup, plus integrating in the conclusions from comments. --- rfcs/0039-buffer-physical-layout.md | 70 ++++++++++++++--------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/rfcs/0039-buffer-physical-layout.md b/rfcs/0039-buffer-physical-layout.md index fbf91663..ee865e0d 100644 --- a/rfcs/0039-buffer-physical-layout.md +++ b/rfcs/0039-buffer-physical-layout.md @@ -76,11 +76,10 @@ NCHWc physical layout. B = te.compute(A.shape, lambda n,h,w,c: A[n,h,w,c]) s = te.create_schedule(B.op) -def nhwc_to_nchwc(logical_axes): - n,h,w,c = logical_axes +def nhwc_to_nchwc(n, h, w, c): return [n, c//4, h, w, c%4] -B_nchwc = s[B].transform_layout(nhwc_to_nchwc) +transformed_nchwc_axes = s[B].transform_layout(nhwc_to_nchwc) # Compute definition that would produce an equivalent physical layout B_equivalent = te.compute( @@ -155,12 +154,17 @@ s[B].reorder(i_outer, i_inner, jk_merged) # Reference-level explanation [reference-level-explanation]: #reference-level-explanation -Transformation of a buffer is represented by the attribute -`"buffer_layout_transformations"` in the `PrimFunc` attributes. This -is a map whose keys are buffer var to be reshaped, and whose values -are the transformations to be applied. Many of the utilities -needed for this transformation already exist in `iter_affine_map.h`, -and are used in the implementation. 
+For schedules written in either TE or TIR, the axis separators are stored +in `BufferNode::axis_separators`. For TIR-based schedules, the +re-indexing of a buffer is performed on demand. For TE-based schedules, +the mapping used to re-index a buffer is stored in the +`"layout_transform_map"` attribute of the `PrimFunc`, and is applied as +part of lowering. This attribute is a map whose keys are buffer var to +be reshaped, and whose values are the transformations to be applied. + +Many of the utilities needed for this transformation already exist in +`iter_affine_map.h`, and are used in the implementation. For TIR-based +schedules, the transformation primitive is appleid immediately. A buffer may be allocated with `AllocateNode`, and may be interacted with using `BufferLoadNode` and `BufferStoreNode`. @@ -222,7 +226,7 @@ are deprecated. specified transformation. - The transformations are stored as a `Map>` in - the `"buffer_layout_transformations"` attribute of a primfunc. + the `"layout_transform_map"` attribute of a primfunc. All buffers whose `BufferNode::data` is a key in this map should have their physical layout rewritten. If the array contains multiple transformations, they are applied sequentially. @@ -255,7 +259,7 @@ are deprecated. ``` - After applying the transformations, the - `"buffer_layout_transformations"` attribute should be removed. + `"layout_transform_map"` attribute should be removed. This ensures that additional application of `ApplyBufferTransforms` has no effect. @@ -265,20 +269,18 @@ are deprecated. layout for TE schedules (StorageFlatten) or TensorIR schedules (FlattenBuffer). - - The transformations are stored as a `Map>` in - the `"buffer_axis_separators"` attribute of a primfunc. All - buffers whose `BufferNode::data` is a key in this map should be - flattened to an output buffer of rank - `separators[buf->data].size()+1`. All other buffers should be - flattened to a 1-d output buffer. + - The transformations are stored in the `Buffer` object as the + `BufferNode::axis_separators`. All buffers that share the same + `BufferNode::data` should be flattened to an + output buffer of rank `axis_separators.size()+1`. All other + buffers should be flattened to a 1-d output buffer. - After flattening a buffer to an N-d output, the corresponding - value in the `"buffer_axis_separators"` attribute should be set to - `range(N-1)`. This ensures that repeated application of the - flattening passes have no additional effect. (The attribute - shouldn't be deleted entirely, as that would cause a flattened - rank-`N` buffer and an unflattened rank-`N` buffer to have - identical representations.) + value in the `axis_separators` should be set to `range(N-1)`. + This ensures that repeated application of the flattening passes + have no additional effect. (The list shouldn't be deleted + entirely, as that would cause a flattened rank-`N` buffer and an + unflattened rank-`N` buffer to have identical representations.) ## Examples @@ -290,7 +292,7 @@ are shown unsimplified to indicate where they come from. The first example shows a 2-d buffer with no layout transformations explicitly specified. The generated `PrimFunc` has no -`"buffer_layout_transformations"` attribute, and so the default +`"layout_transform_map"` attribute, and so the default behavior is used, applying a row-major traversal to generate a flat 1-d buffer. @@ -320,7 +322,7 @@ first index in the logical layout. 
# s[x].transform_layout(lambda i,j: [j,i]) # Initial TIR graph -attrs["buffer_layout_transformations"][x] = lambda i,j: [j,i] +attrs["layout_transform_map"][x] = lambda i,j: [j,i] x = Buffer(name="x", shape=[64,128]) with Allocate(x): val = BufferLoad(x, [10, 15]) @@ -349,7 +351,7 @@ axis during StorageFlatten/FlattenBuffer. # s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, w, c%4]) # Initial TIR graph -attrs["buffer_layout_transformations"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] +attrs["layout_transform_map"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] x = Buffer(name="x", shape=[16,64,64,128], reorder_splits=nhwc_to_nchwc, axis_separators=[]) with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) @@ -379,24 +381,22 @@ target-specific codegen. # s[x].transform_layout(lambda n,h,w,c: [n, c//4, h, te.AXIS_SEPARATOR, w, c%4]) # Initial TIR graph -attrs["buffer_layout_transformations"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] -attrs["buffer_axis_separators"][x] = [2] -x = Buffer(name="x", shape=[16,64,64,128]) +attrs["layout_transform_map"][x] = lambda n,h,w,c: [n, c//4, h, w, c%4] +x = Buffer(name="x", shape=[16,64,64,128], axis_separators=[2]) with Allocate(x): val = BufferLoad(x, [11, 37, 23, 101]) # After applying the explicit reordering. -attrs["buffer_axis_separators"][x] = [2] -x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4]) +x = Buffer(name="x", shape=[16, 128/4, 64, 64, 4], axis_separators=[2]) with Allocate(x): val = BufferLoad(x, index=[11, floor(101/4), 37, 23, 101%4]) # After applying StorageFlatten or FlattenBuffer. The final result is # 2-d, due to the te.AXIS_SEPARATOR used in the `.transform_layout`. -# The `"buffer_axis_separators"` attribute is set to [0], to -# distinguish this 2-d flattened buffer from a 2-d unflattened buffer. -attrs["buffer_axis_separators"][x] = [0] -x = Var(name="x") +# The `axis_separators` are set to [0], to distinguish this 2-d flattened +# buffer from a 2-d unflattened buffer. + +x = Buffer(name="x", shape=[16 * (128/4) * 64, 64*4], axis_separators=[0]) with Allocate(x, shape=[16 * (128/4) * 64, 64 * 4]): val = BufferLoad(x, index=[(128/4)*64*11 + 64*floor(101/4) + 37, 4*23 + 101%4]) ```
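
As a final illustration, the index arithmetic from the example above can be
checked with a short standalone sketch. This is not proposed API; it simply
re-derives, in plain Python, the two flattened indices for the logical access
`(n, h, w, c) = (11, 37, 23, 101)` under the NHWC-to-NCHWc transformation with
an axis separator after `h`.

```python
# Logical indices used in the example above.
n, h, w, c = 11, 37, 23, 101

# Explicitly transformed indices and shape: [n, c//4, h | w, c%4].
transformed = [n, c // 4, h, w, c % 4]
shape = [16, 128 // 4, 64, 64, 4]

# Row-major flattening within each group of axes (split after `h`).
first = (transformed[0] * shape[1] + transformed[1]) * shape[2] + transformed[2]
second = transformed[3] * shape[4] + transformed[4]

# Matches the 2-d indices produced by StorageFlatten/FlattenBuffer above.
assert first == (128 // 4) * 64 * 11 + 64 * (101 // 4) + 37
assert second == 4 * 23 + 101 % 4
```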