From e2e0072464ad265f272040b84b67b294ac2e535d Mon Sep 17 00:00:00 2001 From: Matthew Barrett Date: Mon, 1 Nov 2021 18:03:51 +0000 Subject: [PATCH 1/4] [microNPU][3] Plan generation for the cascader The cascader creates 'Plans' which describe how to schedule subgraphs. As part of the cascading algorithm, it's necessary to explore a large variety of Plans which are Pareto optimal (in terms of memory usage and performance). This is done by the Plan generation algorithm. This commit adds the TensorConfig and Plan data structures which hold information on how to schedule the tensors/operators. Additionally, it includes functions to calculate Pareto frontiers which are used to cull sub-optimal Plans. Change-Id: Ia358b2a1b29bd810df4441027752ced75812ad4e --- .../tvm/contrib/ethosu/cascader/__init__.py | 3 + .../ethosu/cascader/cascader_options.py | 45 ++ python/tvm/contrib/ethosu/cascader/pareto.py | 39 ++ python/tvm/contrib/ethosu/cascader/plan.py | 99 ++++ .../contrib/ethosu/cascader/plan_generator.py | 51 ++ .../contrib/ethosu/cascader/tensor_config.py | 118 ++++ .../ethosu/cascader/cascader_options.cc | 59 ++ .../ethosu/cascader/cascader_options.h | 71 +++ src/contrib/ethosu/cascader/pareto.cc | 140 +++++ src/contrib/ethosu/cascader/pareto.h | 73 +++ src/contrib/ethosu/cascader/plan.cc | 112 ++++ src/contrib/ethosu/cascader/plan.h | 187 ++++++ src/contrib/ethosu/cascader/plan_generator.cc | 530 ++++++++++++++++++ src/contrib/ethosu/cascader/plan_generator.h | 108 ++++ src/contrib/ethosu/cascader/tensor_config.cc | 177 ++++++ src/contrib/ethosu/cascader/tensor_config.h | 226 ++++++++ .../contrib/test_ethosu/cascader/conftest.py | 65 +++ .../contrib/test_ethosu/cascader/infra.py | 29 + .../test_ethosu/cascader/test_pareto.py | 145 +++++ .../contrib/test_ethosu/cascader/test_plan.py | 244 ++++++++ .../cascader/test_plan_generator.py | 179 ++++++ .../cascader/test_tensor_config.py | 110 ++++ 22 files changed, 2810 insertions(+) create mode 100644 python/tvm/contrib/ethosu/cascader/cascader_options.py create mode 100644 python/tvm/contrib/ethosu/cascader/pareto.py create mode 100644 python/tvm/contrib/ethosu/cascader/plan.py create mode 100644 python/tvm/contrib/ethosu/cascader/plan_generator.py create mode 100644 python/tvm/contrib/ethosu/cascader/tensor_config.py create mode 100644 src/contrib/ethosu/cascader/cascader_options.cc create mode 100644 src/contrib/ethosu/cascader/cascader_options.h create mode 100644 src/contrib/ethosu/cascader/pareto.cc create mode 100644 src/contrib/ethosu/cascader/pareto.h create mode 100644 src/contrib/ethosu/cascader/plan.cc create mode 100644 src/contrib/ethosu/cascader/plan.h create mode 100644 src/contrib/ethosu/cascader/plan_generator.cc create mode 100644 src/contrib/ethosu/cascader/plan_generator.h create mode 100644 src/contrib/ethosu/cascader/tensor_config.cc create mode 100644 src/contrib/ethosu/cascader/tensor_config.h create mode 100644 tests/python/contrib/test_ethosu/cascader/test_pareto.py create mode 100644 tests/python/contrib/test_ethosu/cascader/test_plan.py create mode 100644 tests/python/contrib/test_ethosu/cascader/test_plan_generator.py create mode 100644 tests/python/contrib/test_ethosu/cascader/test_tensor_config.py diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py index 03753b4049bb..f151fa9bc418 100644 --- a/python/tvm/contrib/ethosu/cascader/__init__.py +++ b/python/tvm/contrib/ethosu/cascader/__init__.py @@ -34,3 +34,6 @@ ) from .parts import InlinePart, EthosuPart from 
.device_config import EthosuDeviceConfig +from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig +from .plan import Plan +from .cascader_options import CascaderOptions diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py new file mode 100644 index 000000000000..83a222589deb --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Object to hold options for the NPU cascader""" +import tvm._ffi + +from tvm.runtime import Object + +from . import _ffi_api +from .tensor_config import MemoryRegion + + +@tvm._ffi.register_object("contrib.ethosu.cascader.CascaderOptions") +class CascaderOptions(Object): + """A class to hold configuration options for the cascader.""" + + def __init__( + self, + cascade_region: MemoryRegion, + max_proposals: int, + stripe_factors: int, + max_plan_size: int, + always_copy_size: int, + ): + self.__init_handle_by_constructor__( + _ffi_api.CascaderOptions, + cascade_region, + max_proposals, + stripe_factors, + max_plan_size, + always_copy_size, + ) diff --git a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py new file mode 100644 index 000000000000..985937696b2e --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/pareto.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Pareto optimisation functions for the NPU cascader.""" +from typing import List + +from tvm import Object + +from . 
import _ffi_api +from .plan import Plan + + +def get_pareto_frontier(costs: List[List[float]]) -> List[bool]: + for i, cost in enumerate(costs): + for j, value in enumerate(cost): + costs[i][j] = float(value) + + return [bool(v) for v in _ffi_api.GetParetoFrontier(costs)] + + +def thin_vector(vec: List[Object], max_size: int) -> List[Object]: + return list(_ffi_api.ThinVector(vec, max_size)) + + +def pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]: + return list(_ffi_api.ParetoCullPlans(plans, max_plans)) diff --git a/python/tvm/contrib/ethosu/cascader/plan.py b/python/tvm/contrib/ethosu/cascader/plan.py new file mode 100644 index 000000000000..1d2396a8026e --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/plan.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Plan class to hold subgraph scheduling information.""" +from typing import Dict, FrozenSet +import tvm._ffi + +from tvm.runtime import Object + +from . import _ffi_api +from .graph import Tensor, Part +from .tensor_config import TensorConfig, MemoryRegion + + +@tvm._ffi.register_object("contrib.ethosu.cascader.Plan") +class Plan(Object): + """Plan class""" + + def __init__( + self, + tensor_configs: Dict[Tensor, TensorConfig], + open_configs: FrozenSet[TensorConfig], + output_config: TensorConfig, + part_group: FrozenSet[Part], + interior_region: MemoryRegion, + memory_usage: int, + cycles: int, + ): + self.__init_handle_by_constructor__( + _ffi_api.Plan, + list(tensor_configs.values()), + list(open_configs), + output_config, + list(part_group), + interior_region, + memory_usage, + cycles, + ) + + def merge(self, other): + return _ffi_api.PlanMerge(self, other) + + def benchmark_merge(self, other, repeats): + return _ffi_api.PlanMergeBenchmark(self, other, repeats) + + @property + def tensor_configs(self): + tensor_configs = {} + for config in self._tensor_configs: + tensor_configs[config.tensor] = config + return tensor_configs + + @property + def open_configs(self): + return frozenset(self._open_configs) + + @property + def output_config(self): + return self._output_config + + @property + def part_group(self): + return frozenset(self._part_group) + + @property + def interior_region(self): + return self._interior_region + + @property + def memory_usage(self): + return self._memory_usage + + @property + def cycles(self): + return self._cycles + + def __repr__(self): + return ( + f"Plan(tensor_configs={self.tensor_configs}, " + f"open_configs={self.open_configs}, " + f"output_config={self.output_config}, " + f"part_group={self.part_group}, " + f"interior_region={self.interior_region.name}, " + f"memory_usage={self.memory_usage}, " + f"cycles={self.cycles}, " + ) diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py 
b/python/tvm/contrib/ethosu/cascader/plan_generator.py new file mode 100644 index 000000000000..b36799d903cf --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Algorithms to generate Plans for a CascaderGraph.""" +from typing import List, Dict + +from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion + +from . import _ffi_api +from .cascader_options import CascaderOptions +from .plan import Plan +from .stripe_config import StripeConfig +from .graph import CascaderGraph, Part, Tensor + + +def generate_output_stripe_configs(part: Part, stripe_factors: int) -> List[StripeConfig]: + return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors)) + + +def generate_single_plans( + part: Part, + output_stripe_configs: List[StripeConfig], + home_map: Dict[Tensor, List[MemoryRegion]], + cascade_region: MemoryRegion, +) -> List[Plan]: + return list(_ffi_api.GenerateSinglePlans(part, output_stripe_configs, home_map, cascade_region)) + + +def generate_graph_plans( + graph: CascaderGraph, + home_map: Dict[Tensor, List[MemoryRegion]], + options: CascaderOptions, +): + return _ffi_api.GenerateGraphPlans( + graph, + home_map, + options, + ) diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py new file mode 100644 index 000000000000..493c45ea0f40 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tensor config class to hold tensor scheduling information.""" +from typing import List, Union +from enum import IntEnum +import tvm._ffi +from tvm.contrib.ethosu.cascader.stripe_config import StripeConfig + +from tvm.runtime import Object + +from . 
import _ffi_api +from .stripe_config import StripeConfig +from .graph import Tensor, BufferMode + + +class TensorConfigState(IntEnum): + BOUNDARY = 0 + INTERIOR = 1 + + +@tvm._ffi.register_object("contrib.ethosu.cascader.MemoryRegion") +class MemoryRegion(Object): + """MemoryRegion class""" + + def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): + self.__init_handle_by_constructor__( + _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth + ) + + +@tvm._ffi.register_object("contrib.ethosu.cascader.TensorConfig") +class TensorConfig(Object): + """TensorConfig class""" + + def __init__( + self, + tensor: Tensor, + home_region: MemoryRegion, + state: TensorConfigState, + buffer_mode: BufferMode, + stripe_configs: List[StripeConfig], + copy_tensor: bool = False, + copy_region: Union[MemoryRegion, None] = None, + ): + if copy_region is None: + copy_region = home_region + self.__init_handle_by_constructor__( + _ffi_api.TensorConfig, + tensor, + home_region, + state, + buffer_mode, + stripe_configs, + copy_tensor, + copy_region, + ) + + def get_buffer_size(self): + return _ffi_api.TensorConfigGetBufferSize(self) + + @property + def tensor(self): + return self._tensor + + @property + def home_region(self): + return self._home_region + + @property + def state(self): + return TensorConfigState(self._state) + + @property + def buffer_mode(self): + return BufferMode(self._buffer_mode) + + @property + def stripe_configs(self): + return list(self._stripe_configs) + + @property + def copy_tensor(self): + return bool(self._copy_tensor) + + @property + def copy_region(self): + return self._copy_region + + def __hash__(self): + return self._hash + + def __eq__(self, other): + return _ffi_api.TensorConfigEqual(self, other) + + def __repr__(self): + return ( + f"TensorConfig(tensor={self.tensor}, " + f"home_region={self.home_region.name}, " + f"state={self.state.name}, " + f"buffer_mode={self.buffer_mode.name}, " + f"stripe_configs={self.stripe_configs}, " + f"copy_tensor={self.copy_tensor}, " + f"copy_region={self.copy_region.name}" + ) diff --git a/src/contrib/ethosu/cascader/cascader_options.cc b/src/contrib/ethosu/cascader/cascader_options.cc new file mode 100644 index 000000000000..fb4b07940e2c --- /dev/null +++ b/src/contrib/ethosu/cascader/cascader_options.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "cascader_options.h" + +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void CascaderOptionsNode::VisitAttrs(AttrVisitor* v) { + v->Visit("cascade_region", &cascade_region); + v->Visit("max_proposals", &max_proposals); + v->Visit("stripe_factors", &stripe_factors); + v->Visit("max_plan_size", &max_plan_size); + v->Visit("always_copy_size", &always_copy_size); +} + +CascaderOptions::CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, + int stripe_factors, int max_plan_size, int always_copy_size) { + auto n = make_object(); + n->cascade_region = std::move(cascade_region); + n->max_proposals = max_proposals; + n->stripe_factors = stripe_factors; + n->max_plan_size = max_plan_size; + n->always_copy_size = always_copy_size; + data_ = std::move(n); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.CascaderOptions") + .set_body_typed([](MemoryRegion cascade_region, int max_proposals, int stripe_factors, + int max_plan_size, int always_copy_size) { + return CascaderOptions(cascade_region, max_proposals, stripe_factors, max_plan_size, + always_copy_size); + }); + +TVM_REGISTER_NODE_TYPE(CascaderOptionsNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/cascader_options.h b/src/contrib/ethosu/cascader/cascader_options.h new file mode 100644 index 000000000000..135de784ad3c --- /dev/null +++ b/src/contrib/ethosu/cascader/cascader_options.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/cascader_options.h + * \brief Class to store configuration options for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ + +#include +#include + +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +/*! \brief Node to represent CascaderOptions */ +class CascaderOptionsNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \brief The MemoryRegion to place cascading buffer into. */ + MemoryRegion cascade_region; + /*! \brief The maximum number of Proposals to generate. */ + int max_proposals; + /*! \brief How many striping factors to try per axis. */ + int stripe_factors; + /*! \brief The maximum number of Parts in a Plan. */ + int max_plan_size; + /*! \brief The maximum size of Tensor that will always be copied into the cascade region. */ + int always_copy_size; + + static constexpr const char* _type_key = "contrib.ethosu.cascader.CascaderOptions"; + TVM_DECLARE_FINAL_OBJECT_INFO(CascaderOptionsNode, Object) +}; + +/*! 
\brief A class to hold configuration options for the cascader. */ +class CascaderOptions : public ObjectRef { + public: + CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, int stripe_factors, + int max_plan_size, int always_copy_size); + + TVM_DEFINE_OBJECT_REF_METHODS(CascaderOptions, ObjectRef, CascaderOptionsNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc new file mode 100644 index 000000000000..255719088cb0 --- /dev/null +++ b/src/contrib/ethosu/cascader/pareto.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "pareto.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "common.h" +#include "plan.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +template +std::vector GetParetoFrontier(const std::vector>& costs) { + std::vector is_optimal(costs.size(), true); + for (size_t i = 0; i < costs.size(); i++) { + if (is_optimal[i]) { + for (size_t j = 0; j < costs.size(); j++) { + if (is_optimal[j]) { + bool optimal = false; + for (size_t k = 0; k < N; k++) { + if (costs[i][k] > costs[j][k]) { + optimal = true; + break; + } + } + is_optimal[j] = optimal; + } + } + is_optimal[i] = true; + } + } + return is_optimal; +} + +template +std::vector ThinVector(const std::vector& vec, size_t max_size) { + if (max_size < 1) { + return std::vector(); + } + if (vec.size() <= max_size || vec.size() == 0) { + return vec; + } + if (max_size == 1) { + return std::vector{vec[0]}; + } + std::vector thin_vec; + float step = static_cast(vec.size()) / static_cast(max_size - 1); + for (float i = 0; i < vec.size() - 1; i += step) { + thin_vec.push_back(vec[static_cast(i)]); + } + thin_vec.push_back(vec.back()); + return thin_vec; +} + +std::vector ParetoCullPlans(std::vector plans, size_t max_plans) { + if (plans.size() <= max_plans) { + return plans; + } + std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool { + return a->GetMemoryUsage() < b->GetMemoryUsage(); + }); + std::vector> costs; + for (const auto& plan : plans) { + std::array cost = {static_cast(plan->GetMemoryUsage()), + static_cast(plan->GetCycles())}; + costs.emplace_back(cost); + } + std::vector is_optimal = GetParetoFrontier<2>(costs); + std::vector optimal_plans; + size_t i = 0; + for (bool optimal : is_optimal) { + if (optimal) { + optimal_plans.push_back(plans[i]); + } + i++; + } + if (optimal_plans.size() <= max_plans) { + return optimal_plans; + } + return ThinVector(optimal_plans, max_plans); +} + 
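For illustration (not part of the patch itself), the dominance rule implemented above can be exercised through the get_pareto_frontier helper this patch adds in pareto.py. The cost values below are invented for the example and assume a TVM build that includes this series:

    from tvm.contrib.ethosu.cascader.pareto import get_pareto_frontier

    # Hypothetical (memory_usage, cycles) costs for three candidate Plans.
    costs = [
        [4.0, 1.0],  # kept: no other point is at least as good in both dimensions
        [2.0, 3.0],  # kept
        [5.0, 2.0],  # culled: [4.0, 1.0] is no worse in both dimensions
    ]
    frontier = get_pareto_frontier(costs)
    # frontier == [True, True, False]

ParetoCullPlans applies the same test to each Plan's (memory_usage, cycles) pair after sorting by memory usage, then uses ThinVector to sample the surviving Plans down to max_plans.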
+TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetParetoFrontier") + .set_body_typed([](Array> tcosts) { + std::vector> costs; + for (const auto& tcost : tcosts) { + ICHECK_EQ(tcost.size(), 2); + std::array point = {static_cast(tcost[0]->value), + static_cast(tcost[1]->value)}; + costs.push_back(point); + } + Array is_optimal; + for (bool opt : GetParetoFrontier<2>(costs)) { + is_optimal.push_back(Bool(opt)); + } + return is_optimal; + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ThinVector") + .set_body_typed([](Array vec, int max_size) { + std::vector vvec(vec.begin(), vec.end()); + return Array(ThinVector(vvec, max_size)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ParetoCullPlans") + .set_body_typed([](Array plans, int max_size) { + std::vector vplans(plans.begin(), plans.end()); + return Array(ParetoCullPlans(vplans, max_size)); + }); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/pareto.h b/src/contrib/ethosu/cascader/pareto.h new file mode 100644 index 000000000000..6b0cd4e294ac --- /dev/null +++ b/src/contrib/ethosu/cascader/pareto.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/pareto.h + * \brief Pareto optimisation functions for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ + +#include +#include + +#include +#include +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class Plan; + +/*! + * \brief Determine the Pareto optimal points. + * \param costs The points as a vector of N-dimensional costs. + * \return A vector that is true where a point is Pareto optimal and false otherwise. + */ +template +std::vector GetParetoFrontier(const std::vector>& costs); + +/*! + * \brief Evenly sample items from a vector to reduce its size. + * \param vec The vector to thin. + * \param max_size The maximum size of the thinned vector. + * \return The thinned vector. + */ +template +std::vector ThinVector(const std::vector& vec, size_t max_size); + +/*! + * \brief Cull plans which are not Pareto optimal then thin them down. + * \param plans The plans to apply the Pareto culling to. + * \param max_plans The maximum number of plans after the culling. + * \return The culled plans. + * \note Plan Pareto-optimality is determined based upon a Plan's memory_usage + * and cycles. 
+ */ +std::vector ParetoCullPlans(std::vector plans, size_t max_plans); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ diff --git a/src/contrib/ethosu/cascader/plan.cc b/src/contrib/ethosu/cascader/plan.cc new file mode 100644 index 000000000000..173b3f9e8d20 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan.cc @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "plan.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "graph.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void PlanNode::VisitAttrs(AttrVisitor* v) { + Array tmp_arr(tensor_configs_); + v->Visit("_tensor_configs", &tmp_arr); + Array tmp_cfgs(open_configs_.begin(), open_configs_.end()); + v->Visit("_open_configs", &tmp_cfgs); + v->Visit("_output_config", &output_config_); + Array tmp_parts(part_group_.begin(), part_group_.end()); + v->Visit("_part_group", &tmp_parts); + v->Visit("_interior_region", &interior_region_); + v->Visit("_memory_usage", &memory_usage_); + v->Visit("_cycles", &cycles_); +} + +Plan::Plan(const std::vector& tensor_configs, + const std::vector& open_configs, const TensorConfig& output_config, + const std::vector& part_group, const MemoryRegion& interior_region, + int memory_usage, int cycles) { + auto n = make_object(); + n->tensor_configs_ = std::move(tensor_configs); + n->open_configs_ = std::move(open_configs); + n->output_config_ = std::move(output_config); + n->part_group_ = std::move(part_group); + n->interior_region_ = interior_region; + n->memory_usage_ = memory_usage; + n->cycles_ = cycles; + data_ = std::move(n); +} + +Plan Plan::Merge(const Plan& other) const { + auto n = make_object(*this->operator->()); + n->tensor_configs_.insert(n->tensor_configs_.end(), other->tensor_configs_.begin(), + other->tensor_configs_.end()); + n->open_configs_.erase( + std::remove(n->open_configs_.begin(), n->open_configs_.end(), (*this)->output_config_), + n->open_configs_.end()); + for (const auto& config : other->open_configs_) { + if (config->GetTensor() != (*this)->output_config_->GetTensor()) { + n->open_configs_.push_back(config); + } + } + n->output_config_ = other->output_config_; + n->part_group_.insert(n->part_group_.end(), other->part_group_.begin(), other->part_group_.end()); + std::sort(n->part_group_.begin(), n->part_group_.end()); + n->memory_usage_ += other->memory_usage_; + n->cycles_ += other->cycles_; + return Plan(n); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.Plan") + .set_body_typed([](Array tensor_configs, Array open_configs, + TensorConfig output_config, Array part_group, + MemoryRegion 
interior_region, int memory_usage, int cycles) { + std::vector vtensor_configs(tensor_configs.begin(), tensor_configs.end()); + std::vector sopen_configs(open_configs.begin(), open_configs.end()); + std::vector spart_group(part_group.begin(), part_group.end()); + return Plan(vtensor_configs, sopen_configs, output_config, spart_group, interior_region, + memory_usage, cycles); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.PlanMerge").set_body_method(&Plan::Merge); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.PlanMergeBenchmark") + .set_body_typed([](Plan plan, Plan other, int repeats) { + for (int i = 0; i < repeats; i++) { + plan.Merge(other); + } + return plan.Merge(other); + }); + +TVM_REGISTER_NODE_TYPE(PlanNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/plan.h b/src/contrib/ethosu/cascader/plan.h new file mode 100644 index 000000000000..65efe98e4ff5 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan.h @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/plan.h + * \brief Plan object for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ + +#include +#include + +#include +#include +#include +#include + +#include "graph.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +/*! \brief Node to represent a Plan */ +class PlanNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \return The TensorConfigs specified by the Plan */ + const std::vector& GetTensorConfigs() const { return tensor_configs_; } + /*! \return The TensorConfigs which are 'open' meaning they are a Plan input/output but have + * INTERIOR state */ + const std::vector& GetOpenConfigs() const { return open_configs_; } + /*! \return The TensorConfig of the Plan's output tensor */ + const TensorConfig GetOutputConfig() const { return output_config_; } + /*! \return The Parts which are covered by the Plan */ + const std::vector& GetPartGroup() const { return part_group_; } + /*! \return The memory region in which to store interior Plan buffers */ + MemoryRegion const GetInteriorRegion() const { return interior_region_; } + /*! + * \return The interior memory used by the Plan in bytes. + * \note The interior memory usage is defined as being the memory required in the interior region + * to execute the Plan excluding input and output buffers. + */ + int GetMemoryUsage() const { return memory_usage_; } + /*! \return The cycles taken to execute the Plan */ + int GetCycles() const { return cycles_; } + /*! 
\return Whether the Plan is 'closed' meaning it has no 'open' TensorConfigs */ + bool IsClosed() const { return open_configs_.size() == 0; } + + static constexpr const char* _type_key = "contrib.ethosu.cascader.Plan"; + TVM_DECLARE_FINAL_OBJECT_INFO(PlanNode, Object); + + protected: + friend class Plan; + + /*! \brief The TensorConfigs specified by the Plan */ + std::vector tensor_configs_; + /*! \brief The TensorConfigs which are 'open' meaning they are a Plan input/output but have + * INTERIOR state */ + std::vector open_configs_; + /*! \brief The TensorConfig of the Plan's output tensor */ + TensorConfig output_config_; + /*! \brief The Parts which are covered by the Plan */ + std::vector part_group_; + /*! \brief The memory region in which to store interior Plan buffers */ + MemoryRegion interior_region_; + /*! \brief The interior memory used by the Plan in bytes */ + int memory_usage_; + /*! \brief The cycles taken to execute the Plan */ + int cycles_; +}; + +/*! + * \brief A class which describes how to schedule a subgraph of Parts together. + * \note A Plan takes the form of a subgraph of connected Parts (recorded in part_group) with + * TensorConfigs for all of the required Tensors (recorded in tensor_configs). This information can + * be used to produce a Tensor Expression schedule with inter-operator scheduling. A Plan is + * necessarily single-output such that all non-output Parts are 'computed_at'ed the scope of the + * output Part. This is what achieves the technique referred to as 'cascading'. A Plan also has an + * interior memory region which specifies the region of memory into which all the Plans intermediate + * buffers should be allocated. + * + * Additionally, a Plan contains some other information used during the Plan generation and + * selection algorithms. Both the memory and cycles required to run the Plan are accounted for so + * that Plans can be ranked and Pareto-culled on these metrics. Furthermore, the TensorConfigs which + * are 'open' is recorded indicating that these are valid points to merge with another Plan. A Plan + * can only be turned into a schedule if it has no 'open' TensorConfigs - at which point the Plan is + * said to be 'closed'. + */ +class Plan : public ObjectRef { + public: + Plan(const std::vector& tensor_configs, + const std::vector& open_configs, const TensorConfig& output_config, + const std::vector& part_group, const MemoryRegion& interior_region, int memory_usage, + int cycles); + /*! + * \brief Merge two Plans which share an 'open' TensorConfig. + * \param other The Plan to merge with. + * \return The merged Plan. + * \note The current Plan is referred to as the 'upper Plan' and the other Plan as the 'lower + * Plan'. The 'open' output config of the upper Plan must be an 'open' input config of the lower + * Plan. The Tensor referenced by these configs is the Tensor on which the two Plans will be + * merged. The merge process does the following: + * + * The tensor config maps will be merged with TensorConfigs from the upper Plan taking priority. + * The open configs will be merged with the TensorConfigs that are being merged having been + * removed. The output config will be that of the lower Plan. The part groups will be merged. The + * interior region is necessarily the same for both the upper and lower Plan. The cycles and + * memory usage will be summed. 
+ */ + Plan Merge(const Plan& other) const; + + TVM_DEFINE_OBJECT_REF_METHODS(Plan, ObjectRef, PlanNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +// Hash functions TensorConfig and Part sets +namespace std { + +using TensorConfigSet = std::vector<::tvm::contrib::ethosu::cascader::TensorConfig>; +using PartSet = std::vector<::tvm::contrib::ethosu::cascader::Part>; + +template <> +struct hash { + std::size_t operator()(const TensorConfigSet& tensor_config_set) const { + size_t seed = 0; + for (const auto& tensor_config : tensor_config_set) { + seed ^= hash<::tvm::contrib::ethosu::cascader::TensorConfig>()(tensor_config); + } + return seed; + } +}; + +template <> +struct equal_to { + bool operator()(const TensorConfigSet& lhs, const TensorConfigSet& rhs) const { + std::unordered_set<::tvm::contrib::ethosu::cascader::TensorConfig> lh_set(lhs.begin(), + lhs.end()); + std::unordered_set<::tvm::contrib::ethosu::cascader::TensorConfig> rh_set(rhs.begin(), + rhs.end()); + return lh_set == rh_set; + } +}; + +template <> +struct hash { + std::size_t operator()(const PartSet& part_set) const { + size_t seed = 0; + for (const auto& part : part_set) { + seed ^= tvm::runtime::ObjectHash()(part); + } + return seed; + } +}; + +template <> +struct equal_to { + bool operator()(const PartSet& lhs, const PartSet& rhs) const { return lhs == rhs; } +}; + +} // namespace std + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc new file mode 100644 index 000000000000..7cb6baf39159 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -0,0 +1,530 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "plan_generator.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cascader_options.h" +#include "common.h" +#include "graph.h" +#include "pareto.h" +#include "plan.h" +#include "stripe_config.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +template +std::vector> EnumerateCombinations(std::vector> values) { + if (values.size() == 0) { + return values; + } + if (values.size() == 1) { + std::vector> combs; + for (const auto& value : values[0]) { + combs.push_back(std::vector(1, value)); + } + return combs; + } + auto combs = EnumerateCombinations(std::vector>(values.begin(), values.end() - 1)); + std::vector> new_combs; + for (const auto& value : values.back()) { + for (const auto& comb : combs) { + auto new_comb = std::vector(comb); + new_comb.push_back(value); + new_combs.push_back(new_comb); + } + } + return new_combs; +} + +std::vector GetCascadableAxes(const Part& part) { + std::vector cascadable_axes(part->GetOutputTensor()->GetShape().size()); + // Check all the propagators to see if an output axis is projected into any + // of the inputs. If it is, then that axis is cascadable. + for (const auto& propagator : part->GetPropagators()) { + auto transform = propagator->GetTransform(); + for (size_t i = 0; i < transform.size(); i++) { + for (size_t j = 0; j < transform[0].size() - 1; j++) { + // An axis is projected if there's a non-zero element + // in the transform matrix + if (transform[i][j] != 0) { + cascadable_axes[j] = true; + } + } + } + } + return cascadable_axes; +} + +std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors) { + // If stripe_factors is <= 0, then we won't produce any StripeConfigs + if (stripe_factors <= 0) { + return std::vector(); + } + // Work out the factors to divide by as inverse powers of 2. + // The last factor is always reserved to be '0' which will correspond to + // choosing a stripe size of 1 in the dimension. We always include this + // as it represents the most extreme striping choice that uses the least + // memory, so it is our choice of last resort. + // For example, if stripe_factors = 4 then the factors are 1, 1/2, 1/4, 0. + std::vector factors; + for (size_t i = 0; i < static_cast(stripe_factors) - 1; i++) { + factors.push_back(1.0f / (std::pow(2.0f, i))); + } + factors.push_back(0); + // Then use the factors to derive the possible ways to split each dimension + // into stripes. 
As an example, if an had extent 128 then by applying + // the factors derived above we get the following possible splits for that axis: + // 128, 64, 32, 1 + std::vector> splits; + std::vector output_shape = part->GetOutputTensor()->GetShape(); + size_t output_dims = output_shape.size(); + // Only bother striping along the axes which are cascadable + auto cascadable_axes = GetCascadableAxes(part); + for (size_t i = 0; i < output_dims; i++) { + auto axis = output_shape[i]; + auto axis_align = part->GetStripeAlignHint()[i]; + std::set axis_splits; // Note this is a set to remove duplicate splits + if (!cascadable_axes[i]) { + axis_splits.insert(axis); + } else { + for (float factor : factors) { + int split = + std::max(static_cast(std::ceil(axis * factor / axis_align)), 1) * axis_align; + split = std::min(axis, split); + axis_splits.insert(split); + } + } + splits.push_back(std::vector(axis_splits.begin(), axis_splits.end())); + } + // Now calculate all the possible combinations of splits for each dimension + // to give us all the possible stripe shapes. For example, if we had two axes + // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be: + // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1) + auto stripe_shapes = EnumerateCombinations(splits); + auto offset = std::vector(output_dims); + std::vector stripe_configs; + // Calculate the possible axis orderings such that each axis has the opportunity + // to be the 'outermost' axis (which is axis that is chosen for rolling). + std::vector> orders; + for (size_t i = 0; i < output_dims; i++) { + std::vector order(output_dims); + for (size_t j = 0; j < output_dims; j++) { + order[j] = 1 + (j + i) % output_dims; + } + orders.push_back(order); + } + // Finally, create the StripeConfigs from the possible stripe shapes and orders + for (const auto& stripe_shape : stripe_shapes) { + std::vector stripes; + std::vector strides; + for (size_t i = 0; i < output_dims; i++) { + stripes.push_back(std::ceil(static_cast(output_shape[i]) / stripe_shape[i])); + strides.push_back(static_cast(stripe_shape[i])); // strides = stripe_shape + } + // If the stripe shape equals the output shape (i.e. there's no striping), then + // the order doesn't matter, so just pick the first order and continue. + if (stripe_shape == output_shape) { + stripe_configs.push_back( + StripeConfig(stripe_shape, output_shape, strides, orders[0], stripes, offset)); + continue; + } + for (const auto& order : orders) { + // Some logic to avoid having an axis be the 'outermost' if the stripe is full + // size in that axis. This would otherwise be a waste because we can't roll + // over an axis that hasn't been split. 
+ bool skip = false; + for (size_t i = 0; i < output_dims; i++) { + if (order[i] == 1 && stripe_shape[i] == output_shape[i]) { + skip = true; + break; + } + } + if (skip) continue; + stripe_configs.push_back( + StripeConfig(stripe_shape, output_shape, strides, order, stripes, offset)); + } + } + return stripe_configs; +} + +std::vector GetPossibleInputConfigs(const StripeConfig& stripe_config, + const Tensor& tensor, + const std::vector& home_regions, + const CascaderOptions& options) { + std::vector configs; + for (const auto& home_region : home_regions) { + // Boundary configs + if (home_region == options->cascade_region || + tensor->GetSize() > options->always_copy_size) { + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); + } + if (home_region != options->cascade_region) { + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::ROLLING, {stripe_config}, true, + options->cascade_region)); + } + } + if (!tensor->IsConstant()) { + // Interior configs + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::RECOMPUTE, {stripe_config}, false, + options->cascade_region)); + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::ROLLING, {stripe_config}, false, + options->cascade_region)); + } + return configs; +} + +// Check whether a StripeConfig can be an output boundary config +bool CanBound(const StripeConfig& stripe_config) { + // Determine whether the StripeConfig results in non-overlapping stripes + // which is the case when the stripe shape equals the strides + for (size_t i = 0; i < stripe_config->GetShape().size(); i++) { + // Check that the stripe shape and strides are equal + if (stripe_config->GetShape()[i] - stripe_config->GetStrides()[i] != 0) { + return false; + } + } + return true; +} + +std::vector GetPossibleOutputConfigs(const StripeConfig& stripe_config, + const Tensor& tensor, + const std::vector& home_regions, + const CascaderOptions& options) { + std::vector configs; + // Only StripeConfigs with non-overlapping stripes can be output boundary configs + if (CanBound(stripe_config)) { + for (const auto& home_region : home_regions) { + // Boundary configs + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); + } + } + // Interior configs + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::RECOMPUTE, {stripe_config}, false, + options->cascade_region)); + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::ROLLING, {stripe_config}, false, + options->cascade_region)); + return configs; +} + +int GetInteriorMemoryUsage(const std::vector& input_configs, + const TensorConfig& output_config, const MemoryRegion& interior_region) { + int memory_usage = 0; + if (output_config->GetHomeRegion() == interior_region && + output_config->GetState() == TensorConfigState::BOUNDARY) { + memory_usage += output_config->GetTensor()->GetSize(); + } + for (const auto& input_config : input_configs) { + if (input_config->GetHomeRegion() == interior_region && + input_config->GetState() == TensorConfigState::BOUNDARY) { + memory_usage += input_config->GetTensor()->GetSize(); + } else if (input_config->GetHomeRegion() == interior_region || + 
input_config->GetCopyRegion() == interior_region) { + memory_usage += input_config->GetBufferSize(); + } + } + return memory_usage; +} + +std::vector GenerateSinglePlans( + const Part& part, const std::vector& output_stripe_configs, + const std::unordered_map, ObjectPtrHash, ObjectPtrEqual>& + home_map, + const CascaderOptions& options) { + std::vector plans; + std::vector part_group{part}; + // Create a selection of Plans per output_stripe_config + for (const auto& output_stripe_config : output_stripe_configs) { + // Calculate the input_stripe_configs + auto input_stripe_configs = part->CalculateInputStripeConfigs(output_stripe_config); + // From the input_stripe_configs, now derive all the possible input TensorConfigs + std::vector> all_possible_input_configs; + size_t i = 0; + for (const auto& stripe_config : input_stripe_configs) { + Tensor tensor = part->GetInputTensors()[i]; + all_possible_input_configs.push_back( + GetPossibleInputConfigs(stripe_config, tensor, home_map.at(tensor), options)); + i++; + } + // Now work out all the possible combinations of input TensorConfigs + auto input_config_combinations = + EnumerateCombinations(all_possible_input_configs); + Tensor output_tensor = part->GetOutputTensor(); + // Then determine the possible output TensorConfigs (no combinations here because there's only + // one output) + auto output_configs = GetPossibleOutputConfigs(output_stripe_config, output_tensor, + home_map.at(output_tensor), options); + // Calculate the performance information for the output_stripe_config for both the recompute and + // rolling cases + PerformanceInfo rolling_perf = + part->GetPerformanceInfo(output_stripe_config, BufferMode::ROLLING); + PerformanceInfo recompute_perf = + part->GetPerformanceInfo(output_stripe_config, BufferMode::RECOMPUTE); + // For all the possible input TensorConfig combinations + for (const auto& input_configs : input_config_combinations) { + std::vector tensor_configs; + std::vector open_input_configs; + // Add the input TensorConfigs to the 'tensor_configs' and + // record which input TensorConfigs are 'open' (i.e. 
'INTERIOR') + for (const auto& input_config : input_configs) { + tensor_configs.push_back(input_config); + if (input_config->GetState() == TensorConfigState::INTERIOR) { + open_input_configs.push_back(input_config); + } + } + for (const auto& output_config : output_configs) { + // Add the output TensorConfig to the tensor_configs and to + // the open configs (if it's 'INTERIOR') + tensor_configs.push_back(output_config); + std::vector open_configs = open_input_configs; + if (output_config->GetState() == TensorConfigState::INTERIOR) { + open_configs.push_back(output_config); + } + int bandwidth_cycles = 0; + int compute_cycles = 0; + int mem2mem_cycles = 0; + + // Pick the correct performance info based on the BufferMode + PerformanceInfo perf_info; + if (output_config->GetBufferMode() == BufferMode::RECOMPUTE) { + perf_info = recompute_perf; + } else { + perf_info = rolling_perf; + } + // Calculate the bandwidth cycles by multiplying the bytes read/written by the + // bandwidth of the memories + for (size_t i = 0; i < input_configs.size(); i++) { + bandwidth_cycles += + perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; + if (input_configs[i]->DoCopy()) { + // This Tensor needs to be copied - Count stripes for this config + Tensor tensor = input_configs[i]->GetTensor(); + for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { + std::map, int> input_blocks = CountStripes(stripe_config, true); + for (const auto& block : input_blocks) { + int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * + tensor->GetCompressionRatio() * block.second; + int read_cycles = + bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; + int write_cycles = + bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth; + mem2mem_cycles += std::max(read_cycles, write_cycles); + } + } + } + } + bandwidth_cycles += + perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; + compute_cycles = perf_info->compute_cycles; + // Take the max of compute and bandwidth cycles as we assume compute cycles + // can hide memory latency + int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); + int memory_usage = + GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); + plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, + options->cascade_region, memory_usage, cycles)); + } + } + } + return plans; +} + +std::unordered_map, std::vector> GenerateGraphPlans( + const CascaderGraph& graph, + const std::unordered_map, ObjectPtrHash, ObjectPtrEqual>& + home_map, + const CascaderOptions& options) { + ICHECK_GT(options->stripe_factors, 0) + << "stripe_factors = " << options->stripe_factors << ", but must be > 0"; + ICHECK_GT(options->max_plan_size, 0) + << "max_plan_size = " << options->max_plan_size << ", but must be > 0"; + // Define a map between the graph Tensors and possible StripeConfigs that the Tensor may be + // executed with + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + stripe_configs_by_tensor; + // Define a map between a given open TensorConfig and all the Plans which provide it + std::unordered_map> plans_by_config; + // Define a map between a group of connected Parts and all the closed plans covering them + std::unordered_map, std::vector> closed_plans; + // Define a nested map which indexes open plans by both Part group and the open TensorConfigs they + // provide. 
Note that we index in this way because Part group + open TensorConfigs combined + // defines a group of Plans which can be mutually Pareto culled. If we culled of Part group alone, + // we'd lose potentially valuable open Plans which could have gone on to be grown into Pareto + // optimal closed plans. + std::unordered_map, + std::unordered_map, std::vector>> + open_plans; + // Traverse the graph in a reverse topological order (should be enforced by GetPartOrder) + for (const auto& part : graph->GetPartOrder()) { + // First generate all the possible StripeConfigs for the Part assuming that it will become the + // output of a Plan. The number generated is a function of stripe_factors and the number of + // cascadable dimensions in the Part. + std::vector stripe_configs = + GenerateOutputStripeConfigs(part, options->stripe_factors); + // Check to see if the output Tensor is part of any existing open Plans + if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) { + // If there are other open Plans which have this Part's output Tensor as an input, then + // additionally consider the StripeConfigs of those open TensorConfigs so that we have the + // option to merge into those open Plans. + const std::set& connecting_configs = + stripe_configs_by_tensor.at(part->GetOutputTensor()); + std::copy(connecting_configs.begin(), connecting_configs.end(), + std::back_inserter(stripe_configs)); + } + // Generate all the single Part Plans for the previously determined StripeConfigs + auto single_part_plans = GenerateSinglePlans(part, stripe_configs, home_map, options); + std::vector plans; + for (const auto& partial_plan : single_part_plans) { + // If the output TensorConfig of the Plan is 'INTERIOR', then it must be merged with + // another open Plan + if (partial_plan->GetOutputConfig()->GetState() == TensorConfigState::INTERIOR) { + if (plans_by_config.find(partial_plan->GetOutputConfig()) != plans_by_config.end() && + partial_plan->GetOutputConfig()->GetTensor()->GetConsumers().size() == 1) { + // Search for all the open Plans which require the same TensorConfig + const auto& join_plans = plans_by_config.at(partial_plan->GetOutputConfig()); + for (const auto& join_plan : join_plans) { + // Only merge to form a new Plan if the resulting Plan size won't exceed the + // max_plan_size + if (join_plan->GetPartGroup().size() < static_cast(options->max_plan_size)) { + if (partial_plan->GetMemoryUsage() + join_plan->GetMemoryUsage() < + options->cascade_region->size) { + plans.push_back(partial_plan.Merge(join_plan)); + } + } + } + } + } else { + // If the single Part Plan had a 'BOUNDARY' output TensorConfig, then it doesn't need + // merging and can stand on its own. + plans.push_back(partial_plan); + } + } + // For all the newly created Plans, update the various maps + std::unordered_set> new_part_groups; + for (const auto& plan : plans) { + new_part_groups.insert(plan->GetPartGroup()); + if (plan->IsClosed()) { + closed_plans[plan->GetPartGroup()].push_back(plan); + } else { + open_plans[plan->GetPartGroup()][plan->GetOpenConfigs()].push_back(plan); + } + } + // Now Pareto cull both the open and closed Plans to remove non-optimal Plans + // Additionally, once culled we update another two maps, the stripe_configs_by_tensor + // and plans_by_config maps. 
+ for (const auto& part_group : new_part_groups) { + if (closed_plans.find(part_group) != closed_plans.end()) { + closed_plans[part_group] = ParetoCullPlans(closed_plans.at(part_group), 32); + } + for (const auto& it : open_plans[part_group]) { + auto pareto_plans = ParetoCullPlans(it.second, 8); + for (const auto& plan : pareto_plans) { + for (const auto& open_config : plan->GetOpenConfigs()) { + if (open_config != plan->GetOutputConfig()) { + for (const auto& stripe_config : open_config->GetStripeConfigs()) { + // Only add a StripeConfig if it contains for than one stripe + if (mul_reduce(stripe_config->GetStripes()) > 1) { + stripe_configs_by_tensor[open_config->GetTensor()].insert(stripe_config); + } + } + plans_by_config[open_config].push_back(plan); + } + } + } + } + } + } + return closed_plans; +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs") + .set_body_typed([](Part part, int stripe_factors) { + if (stripe_factors < 0) { + return Array(); + } + return Array(GenerateOutputStripeConfigs(part, stripe_factors)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans") + .set_body_typed([](Part part, Array output_stripe_configs, + Map> home_map, CascaderOptions options) { + std::vector voutput_stripe_configs(output_stripe_configs.begin(), + output_stripe_configs.end()); + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + mhome_map; + for (const auto& it : home_map) { + std::vector home_regions; + for (const auto& i : it.second) { + home_regions.push_back(i); + } + mhome_map[it.first] = home_regions; + } + return Array(GenerateSinglePlans(part, voutput_stripe_configs, mhome_map, options)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans") + .set_body_typed([](CascaderGraph graph, Map> home_map, + CascaderOptions options) { + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + mhome_map; + for (const auto& it : home_map) { + std::vector home_regions; + for (const auto& i : it.second) { + home_regions.push_back(i); + } + mhome_map[it.first] = home_regions; + } + auto closed_plans = GenerateGraphPlans(graph, mhome_map, options); + Map, Array> tclosed_plans; + for (auto& it : closed_plans) { + Array part_arr(it.first.begin(), it.first.end()); + Array plan_arr(it.second); + tclosed_plans.Set(part_arr, plan_arr); + } + return tclosed_plans; + }); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/plan_generator.h b/src/contrib/ethosu/cascader/plan_generator.h new file mode 100644 index 000000000000..947728addfd1 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan_generator.h @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/contrib/ethosu/cascader/plan_generator.h + * \brief Algorithm to generate possible Plans in the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ + +#include +#include + +#include +#include +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class CascaderGraph; +class MemoryRegion; +class Part; +class Tensor; +class StripeConfig; +class Plan; +class CascaderOptions; + +using HomeMap = + std::unordered_map, ObjectPtrHash, ObjectPtrEqual>; + +/*! + * \brief Generate possible output StripeConfigs that could be applied to a Part's output. + * \param part The Part to generate StripeConfigs for. + * \param stripe_factors How many striping factors to try per axis. + * \return The generated StripeConfigs for the Part's output. + */ +std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors); + +/*! + * \brief Generate single-Part Plans for a Part for a given list of output StripeConfigs. + * \param part The Part to generate Plans for. + * \param output_stripe_configs The output StripeConfigs to generate Plans with. + * \param home_map The Tensor homing map defining valid memory homes for Tensors. + * \param options The configuration options with which to run the generator. + * \return The generated Plans covering the Part. + * \note For each of the output StripeConfigs provided, this algorithm will produce a number + * of Plans corresponding to different choices of Tensor homing/copying, buffer modes + * and INTERIOR/BOUNDARY states. For each of these variants, the Part's performance will + * be queried and the memory usage will be calculated. + */ +std::vector GenerateSinglePlans(const Part& part, + const std::vector& output_stripe_configs, + const HomeMap& home_map, const CascaderOptions& options); + +/*! + * \brief Generate pareto optimal Plans for a Graph. + * \param graph The Graph to generate Plans for. + * \param home_map The Tensor homing map defining valid memory homes for Tensors. + * \param options The configuration options with which to run the generator. + * \return A map between Part groups and a list of pareto optimal Plans which cover that group. + * \note This algorithm does the following: + * + * Iterate Part-by-Part in a reversed topological ordering (starting at the output Parts and + * working towards the input Parts). + * + * For each Part: + * 1. Determine the possible StripeConfigs we might want to use to stripe the Part using + * GenerateOutputStripeConfigs. + * 2. Additionally, collect all the StripeConfigs of open Plans that could connect to this + * Part (i.e. the Plan has an open TensorConfig for the Part's output Tensor). + * 3. Use these two lists of StripeConfigs to produce single Part Plans with GenerateSinglePlans. + * 4. For the generated Plans that have an open output TensorConfig, try and merge these into + * existing Plans which share an open input TensorConfig. + * 5. All Plans are then indexed by both the Part group they cover and their open TensorConfigs. + * 6. Plans which cover the same Part group and share the same open TensorConfigs are culled + * using ParetoCullPlans. + * + * Once every Part has been visited, return the Plans with no open TensorConfigs indexed by Part + * group. 
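From Python the same search is driven through generate_graph_plans (given a leading underscore later in this series). The following is a condensed sketch in the style of the unit tests added below; the one-Part graph, shapes and option values are illustrative only:

import tvm.contrib.ethosu.cascader as cs
from tvm.contrib.ethosu.cascader.plan_generator import generate_graph_plans

sram = cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16)

# A one-Part graph: an inline Part reading tensor_1 and writing tensor_2
subgraph = cs.TESubgraph([], None)
part = cs.InlinePart(subgraph, [cs.Propagator([[1, 0, 0], [0, 1, 0], [0, 0, 1]], [0, 0])])
tensor_1 = cs.Tensor([10, 10], "int8")
tensor_2 = cs.Tensor([10, 10], "int8")
part.set_input(0, tensor_1)
part.set_output(tensor_2)
tensor_1.add_consumer(part)
tensor_2.add_producer(part)
graph = cs.CascaderGraph([tensor_1], [tensor_2])

home_map = {tensor_1: [sram], tensor_2: [sram]}
options = cs.CascaderOptions(
    cascade_region=sram,
    max_proposals=1,
    stripe_factors=4,
    max_plan_size=10,
    always_copy_size=1024,
)

# The result maps each covered Part group to its Pareto-optimal closed Plans
closed_plans = generate_graph_plans(graph, home_map, options)
for part_group, plans in closed_plans.items():
    print(len(part_group), [(p.memory_usage, p.cycles) for p in plans])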
+ */ +std::unordered_map, std::vector> GenerateGraphPlans( + const CascaderGraph& graph, const HomeMap& home_map, const CascaderOptions& options); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ diff --git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc new file mode 100644 index 000000000000..5fec8b31e9ea --- /dev/null +++ b/src/contrib/ethosu/cascader/tensor_config.cc @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "tensor_config.h" + +#include +#include +#include + +#include +#include +#include + +#include "common.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void MemoryRegionNode::VisitAttrs(AttrVisitor* v) { + v->Visit("name", &name); + v->Visit("size", &size); + v->Visit("read_bandwidth", &read_bandwidth); + v->Visit("write_bandwidth", &write_bandwidth); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion") + .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) { + return MemoryRegion(name, size, read_bandwidth, write_bandwidth); + }); + +TVM_REGISTER_NODE_TYPE(MemoryRegionNode); + +void TensorConfigNode::VisitAttrs(AttrVisitor* v) { + v->Visit("_tensor", &tensor_); + v->Visit("_home_region", &home_region_); + int state = static_cast(state_); + v->Visit("_state", &state); + int buffer_mode = static_cast(buffer_mode_); + v->Visit("_buffer_mode", &buffer_mode); + Array tmp_arr(stripe_configs_); + v->Visit("_stripe_configs", &tmp_arr); + v->Visit("_copy_tensor", ©_tensor_); + v->Visit("_copy_region", ©_region_); + int64_t tmp_hash = static_cast(hash_); + v->Visit("_hash", &tmp_hash); +} + +int TensorConfigNode::GetBufferSize() const { + if (buffer_mode_ == BufferMode::RECOMPUTE) { + return GetRecomputeBufferSize_(); + } else { + return GetRollingBufferSize_(); + } +} + +void TensorConfigNode::ComputeHash_() { + hash_ = ObjectHash()(tensor_); + hash_combine(&hash_, std::hash()(home_region_->name)); + hash_combine(&hash_, std::hash()(state_)); + hash_combine(&hash_, std::hash()(buffer_mode_)); + hash_combine(&hash_, hash_vector(stripe_configs_)); + hash_combine(&hash_, std::hash()(copy_tensor_)); + hash_combine(&hash_, std::hash()(copy_region_->name)); +} + +int TensorConfigNode::GetRecomputeBufferSize_() const { + size_t buffer_size = 0; + for (const auto& stripe_config : stripe_configs_) { + buffer_size += mul_reduce(stripe_config->GetShape()); + } + return buffer_size * tensor_->GetDataType().bytes() * tensor_->GetCompressionRatio(); +} + +int TensorConfigNode::GetRollingBufferSize_() const { + int buffer_size = 0; + for (const auto& stripe_config : stripe_configs_) { + 
int rolling_axis = -1; + for (size_t i = 0; i < stripe_config->GetOrder().size(); i++) { + // The axis must be striped (> 1 stripes) and ordered (order != 0) + if (stripe_config->GetStripes()[i] > 1 && stripe_config->GetOrder()[i] != 0) { + // If we've yet to find a possible rolling axis, use this one + if (rolling_axis == -1) { + rolling_axis = i; + continue; + } + // Otherwise, replace the rolling axis if the current axis has an earlier order + if (stripe_config->GetOrder()[i] < stripe_config->GetOrder()[rolling_axis]) { + rolling_axis = i; + } + } + } + // If we didn't find a rolling axis, just use axis 0 + if (rolling_axis == -1) { + rolling_axis = 0; + } + int rolling_size = 1; + for (size_t i = 0; i < tensor_->GetShape().size(); i++) { + if (static_cast(i) == rolling_axis) { + rolling_size *= stripe_config->GetShape()[i]; + } else { + rolling_size *= tensor_->GetShape()[i]; + } + } + buffer_size += rolling_size; + } + return buffer_size * tensor_->GetDataType().bytes() * tensor_->GetCompressionRatio(); +} + +TensorConfig::TensorConfig(const Tensor& tensor, const MemoryRegion& home_region, + TensorConfigState state, BufferMode buffer_mode, + const std::vector& stripe_configs, bool copy_tensor, + const MemoryRegion& copy_region) { + auto n = make_object(); + n->tensor_ = std::move(tensor); + n->home_region_ = std::move(home_region); + n->state_ = state; + n->buffer_mode_ = buffer_mode; + n->stripe_configs_ = std::move(stripe_configs); + n->copy_tensor_ = copy_tensor; + n->copy_region_ = std::move(copy_region); + n->ComputeHash_(); + data_ = std::move(n); +} + +inline bool TensorConfig::operator==(const TensorConfig& other) const { + if (get() == other.get()) return true; + if (get() == nullptr || other.get() == nullptr) return false; + if ((*this)->tensor_ == other->tensor_ && (*this)->home_region_ == other->home_region_ && + (*this)->state_ == other->state_ && (*this)->buffer_mode_ == other->buffer_mode_ && + (*this)->stripe_configs_ == other->stripe_configs_ && + (*this)->copy_tensor_ == other->copy_tensor_ && + (*this)->copy_region_ == other->copy_region_) { + return true; + } + return false; +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfig") + .set_body_typed([](Tensor tensor, MemoryRegion home_region, int state, int buffer_mode, + Array stripe_configs, bool copy_tensor, + MemoryRegion copy_region) { + TensorConfigState estate = static_cast(state); + BufferMode ebuffer_mode = static_cast(buffer_mode); + std::vector vstripe_configs(stripe_configs.begin(), stripe_configs.end()); + return TensorConfig(tensor, home_region, estate, ebuffer_mode, vstripe_configs, copy_tensor, + copy_region); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfigEqual") + .set_body_method(&TensorConfig::operator==); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfigGetBufferSize") + .set_body_method(&TensorConfigNode::GetBufferSize); + +TVM_REGISTER_NODE_TYPE(TensorConfigNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/tensor_config.h b/src/contrib/ethosu/cascader/tensor_config.h new file mode 100644 index 000000000000..6a37f76ce085 --- /dev/null +++ b/src/contrib/ethosu/cascader/tensor_config.h @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/tensor_config.h + * \brief TensorConfig object for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ + +#include +#include + +#include +#include +#include +#include + +#include "graph.h" +#include "stripe_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class MemoryRegionNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \brief The name of the region */ + std::string name; + /*! \brief The size of the region */ + int size; + /*! \brief The read bandwidth of the region in bytes per cycle */ + int read_bandwidth; + /*! \brief The write bandwidth of the region in bytes per cycle */ + int write_bandwidth; + + static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion"; + TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object) +}; + +class MemoryRegion : public ObjectRef { + public: + MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) { + auto n = make_object(); + n->name = name; + n->size = size; + n->read_bandwidth = read_bandwidth; + n->write_bandwidth = write_bandwidth; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(MemoryRegion, ObjectRef, MemoryRegionNode); +}; + +/*! \brief The 'state' of a TensorConfig as used in the Plan generation algorithm. + * BOUNDARY - Should describe a Plan input/output Tensor. + * INTERIOR - Should describe an intermediate Tensor in a 'closed' Plan. + */ +enum TensorConfigState { BOUNDARY, INTERIOR }; + +/*! \brief Node to represent a TensorConfig */ +class TensorConfigNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \return The Tensor the config applies to */ + const Tensor GetTensor() const { return tensor_; } + /*! \return The region where the tensor is allocated */ + MemoryRegion GetHomeRegion() const { return home_region_; } + /*! + * \return The state of the TensorConfig. + * \note The TensorConfigState is only used as part of the Plan generation algorithm. For a Plan + * to be 'closed' (and therefore not subject to any further merging), all the TensorConfigs that + * describe Plan input or output Tensors must be in the 'BOUNDARY' state with the rest being + * 'INTERIOR'. If any of the input or output tensors are described by an 'INTERIOR' TensorConfig, + * then the Plan is 'open' and should be merged with other 'open' Plans until the result becomes + * 'closed'. + */ + TensorConfigState GetState() const { return state_; } + /*! + * \return The mode in which the buffer should be realized + * \note There are multiple buffering strategies by which a tensor may be realized (computed). + * These affect the amount of recomputation necessary as well as the size of buffer required to + * store the tensor. See 'BufferMode' for a description of the allowable buffering modes. 
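To make the two buffering modes referred to above concrete, here is a small sketch of the buffer-size arithmetic, reusing the shapes from the buffer-size unit tests added at the end of this patch; the DRAM region is only a stand-in:

import tvm.contrib.ethosu.cascader as cs

dram = cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8)
tensor = cs.Tensor(shape=[9, 16, 21], dtype="int32", compression_ratio=0.5)
stripe_config = cs.StripeConfig(
    shape=[9, 4, 7], extent=[9, 16, 21], strides=[3, 5, 7],
    order=[1, 3, 2], stripes=[1, 3, 3], offset=[0, 0, 0],
)
config = cs.TensorConfig(
    tensor=tensor,
    home_region=dram,
    state=cs.TensorConfigState.BOUNDARY,
    buffer_mode=cs.BufferMode.ROLLING,
    stripe_configs=[stripe_config],
)
# Axis 2 is the rolling axis (striped, and with the lowest order of the striped axes), so the
# buffer holds the full tensor in the other axes but only one stripe along axis 2:
# 9 * 16 * 7 elements * 4 bytes (int32) * 0.5 compression = 2016 bytes.
assert config.get_buffer_size() == 2016
# With BufferMode.RECOMPUTE the same config would only need to hold a single stripe:
# 9 * 4 * 7 * 4 * 0.5 = 504 bytes.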
+ */ + BufferMode GetBufferMode() const { return buffer_mode_; } + /*! + * \return Whether to copy the tensor. + * \note While a tensor will originally reside in its home region, the TensorConfig may optionally + * specify that the tensor should be copied (according to the StripeConfigs) into another + * MemoryRegion. As an example for where this may be used, if a weights tensor initially resides + * in slow Flash memory then necessarily the home region will be Flash. However, if the weights + * values are used multiple times by a Part, it may be more performant to choose to copy the + * weights into a faster memory like SRAM. + */ + bool DoCopy() const { return copy_tensor_; } + /*! \return The region to copy the tensor to */ + MemoryRegion GetCopyRegion() const { + if (!copy_tensor_) { + return home_region_; + } + return copy_region_; + } + /*! + * \return The StripeConfigs with which to compute the tensor. + * \note The StripeConfigs determine the order in which the elements of the tensor should be + * computed, including potentially computing them multiple times (recompute). Multiple + * StripeConfigs are used over just a single StripeConfig for the case where the tensor is + * consumed by two different Parts executing themselves with different StripeConfigs. In this + * case, there is a StripeConfig per consumer of the tensor. + */ + const std::vector GetStripeConfigs() const { return stripe_configs_; } + /*! + * \return The size of the buffer needed for the TensorConfig. + * \note The size of buffer necessary to store a tensor being produced using the TensorConfig is + * not necessarily just the size of the tensor. In Plans, a tensor may be being produced and + * consumed in 'stripes' which are smaller than the full tensor. Therefore, the buffer necessary + * to store the tensor may only need to be as large as the stripe. The precise size of the buffer + * will depend both on the BufferMode and StripeConfigs (as well as, of course, the Tensor). + */ + int GetBufferSize() const; + /*! \return The hash of the TensorConfigNode */ + size_t GetHash() const { return hash_; } + + static constexpr const char* _type_key = "contrib.ethosu.cascader.TensorConfig"; + TVM_DECLARE_FINAL_OBJECT_INFO(TensorConfigNode, Object); + + protected: + friend class TensorConfig; + + /*! \brief Compute the hash of the TensorConfigNode */ + void ComputeHash_(); + + /*! \return The size of the recompute buffer needed*/ + int GetRecomputeBufferSize_() const; + /*! \return The size of the rolling buffer needed*/ + int GetRollingBufferSize_() const; + + /*! \brief The Tensor the config applies to */ + Tensor tensor_; + /*! \brief The region where the tensor is allocated */ + MemoryRegion home_region_; + /*! \return The state of the TensorConfig */ + TensorConfigState state_; + /*! \brief The mode in which the buffer should be realized */ + BufferMode buffer_mode_; + /*! \return The StripeConfigs with which to compute the tensor */ + std::vector stripe_configs_; + /*! \brief Whether to copy the tensor */ + bool copy_tensor_; + /*! \brief The region to copy the tensor to */ + MemoryRegion copy_region_; + /*! \brief The hash of the TensorConfigNode */ + size_t hash_{0}; +}; + +/*! + * \brief A class which describes how to realize a Tensor. + * \note The TensorConfig describes both how a Tensor is scheduled (the order in which it's + * produced/consumed) and how its allocated in memory (which region it should reside in and whether + * it should be copied). 
For further detail on how TensorConfig stores this information, consult the + * documentation of TensorConfigNode. + */ +class TensorConfig : public ObjectRef { + public: + TensorConfig(const Tensor& tensor, const MemoryRegion& home_region, TensorConfigState state, + BufferMode buffer_mode, const std::vector& stripe_configs, + bool copy_tensor, const MemoryRegion& copy_region); + /*! + * \brief Check if two TensorConfigs are equal to each other. + * \param other TensorConfig to be checked. + * \return Whether the two TensorConfigs equal each other. + */ + bool operator==(const TensorConfig& other) const; + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(TensorConfig, ObjectRef, TensorConfigNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +// Hash and equal function for TensorConfig +namespace std { + +/*! \brief The equal_to function for tvm::contrib::ethosu::cascader::TensorConfig */ +template <> +struct equal_to<::tvm::contrib::ethosu::cascader::TensorConfig> { + bool operator()(const ::tvm::contrib::ethosu::cascader::TensorConfig& lhs, + const ::tvm::contrib::ethosu::cascader::TensorConfig& rhs) const { + return lhs == rhs; + } +}; + +/*! \brief The hash function for tvm::contrib::ethosu::cascader::TensorConfig */ +template <> +struct hash<::tvm::contrib::ethosu::cascader::TensorConfig> { + std::size_t operator()( + const ::tvm::contrib::ethosu::cascader::TensorConfig& tensor_config) const { + return tensor_config->GetHash(); + } +}; + +} // namespace std + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py b/tests/python/contrib/test_ethosu/cascader/conftest.py index eacf57c251a8..9fdee01bd4e7 100644 --- a/tests/python/contrib/test_ethosu/cascader/conftest.py +++ b/tests/python/contrib/test_ethosu/cascader/conftest.py @@ -28,6 +28,7 @@ from tvm import relay from tvm.relay.testing import run_opt_pass + import tvm.contrib.ethosu.cascader as cs from .infra import create_te_graph from ..infra import ( make_ethosu_conv2d, @@ -35,6 +36,64 @@ make_ethosu_binary_elementwise, ) + @pytest.fixture + def FLASH(): + return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) + + @pytest.fixture + def DRAM(): + return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8) + + @pytest.fixture + def SRAM(): + return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) + + def make_TwoConv2DTE(): + def _get_func(): + ifm = relay.var("ifm", shape=(1, 12, 12, 8), dtype="int8") + conv1 = make_ethosu_conv2d( + ifm=ifm, + ifm_channels=8, + ofm_channels=32, + kernel_shape=(1, 1), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + activation="NONE", + ifm_layout="NHWC", + ofm_layout="NHCWB16", + ) + conv2 = make_ethosu_conv2d( + ifm=conv1, + ifm_channels=32, + ofm_channels=16, + kernel_shape=(3, 3), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + activation="NONE", + ifm_layout="NHCWB16", + ofm_layout="NHWC", + ) + func = relay.Function(relay.analysis.free_vars(conv2), conv2) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + te_graph, const_dict = create_te_graph(func) + sch = tvm.te.create_schedule([t.op for t in te_graph.outputs]) + return sch, te_graph, const_dict + + @pytest.fixture + def TwoConv2DTE(): + return make_TwoConv2DTE() + + @pytest.fixture + def TwoConv2DGraph(): + _, te_graph, const_dict = make_TwoConv2DTE() + device_config = 
cs.EthosuDeviceConfig("ethos-u55-256") + return cs.create_cascader_graph(te_graph, const_dict, device_config) + def make_TwoConv2DWithSliceTE(): def _get_func(): ifm = relay.var("ifm", shape=(1, 12, 12, 8), dtype="int8") @@ -75,6 +134,12 @@ def _get_func(): @pytest.fixture def TwoConv2DWithSliceTE(): return make_TwoConv2DWithSliceTE() + + @pytest.fixture + def TwoConv2DWithSliceGraph(): + _, te_graph, const_dict = make_TwoConv2DWithSliceTE() + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + return cs.create_cascader_graph(te_graph, const_dict, device_config) def make_MobileNetv2DiamondTE(): def _get_func(): diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py index 5f41dce30147..23d0640e2de6 100644 --- a/tests/python/contrib/test_ethosu/cascader/infra.py +++ b/tests/python/contrib/test_ethosu/cascader/infra.py @@ -17,6 +17,24 @@ import tvm from tvm import relay from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te +import tvm.contrib.ethosu.cascader as cs + + +def make_options( + cascade_region: cs.MemoryRegion, + max_proposals: int = 1, + stripe_factors: int = 1, + max_plan_size: int = 1, + always_copy_size: int = 1024, +): + return cs.CascaderOptions( + cascade_region=cascade_region, + max_proposals=max_proposals, + stripe_factors=stripe_factors, + max_plan_size=max_plan_size, + always_copy_size=always_copy_size, + ) + import numpy as np @@ -124,3 +142,14 @@ def make_matrices( scale_bias_matrix, scale_bias_offset, ) + + +def make_simple_home_map(graph, var_region, const_region): + home_map = {} + for tensor in graph.tensor_order: + if tensor.is_constant: + home_map[tensor] = [const_region] + else: + home_map[tensor] = [var_region] + + return home_map diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py new file mode 100644 index 000000000000..0e9393d69061 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
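The *Graph fixtures above all follow the same recipe: lower the Relay function to a TE graph, then let the cascader build its own graph for a named accelerator configuration. A minimal sketch of that recipe, condensing create_te_graph and the fixtures above (the Relay function is a placeholder; make_TwoConv2DTE shows a concrete one):

import tvm
import tvm.contrib.ethosu.cascader as cs
from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te


def make_cascader_graph(relay_func):
    # Split out the constants and lower the Relay function to TE, as create_te_graph does
    relay_func, const_dict = extract_constants(relay_func)
    mod = tvm.IRModule.from_expr(relay_func)
    relay_func = tvm.relay.transform.InferType()(mod)["main"]
    te_graph = lower_to_te(relay_func)
    # Build the cascader graph for a particular accelerator variant
    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
    return cs.create_cascader_graph(te_graph, const_dict, device_config)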
+from tvm.tir import IntImm +from tvm.contrib.ethosu.cascader.pareto import get_pareto_frontier, thin_vector, pareto_cull_plans +from tvm.contrib.ethosu.cascader import ( + Plan, + StripeConfig, + TensorConfig, + TensorConfigState, + BufferMode, + Tensor, +) + +import pytest +import numpy as np + + +def _ref_get_pareto_frontier(costs): + is_efficient = np.ones(costs.shape[0], dtype=bool) + for i, c in enumerate(costs): + if is_efficient[i]: + is_efficient[is_efficient] = np.any( + costs[is_efficient] < c, axis=1 + ) # Keep any point with a lower cost + is_efficient[i] = True # And keep self + return is_efficient + + +def _ref_thin_vector(vec, max_size): + if max_size < 1: + return [] + if len(vec) <= max_size or len(vec) == 0: + return vec + if max_size == 1: + return [vec[0]] + samples = np.linspace(0, len(vec), max_size - 1, endpoint=False).astype(int) + samples = np.append(samples, len(vec) - 1) + return vec[samples] + + +def _ref_pareto_cull_plans(plans, points): + if len(plans) <= points: + return plans + plans = np.array(sorted(plans, key=lambda x: x.memory_usage)) + costs = [] + for plan in plans: + costs.append(np.array([plan.memory_usage, plan.cycles])) + is_efficient = _ref_get_pareto_frontier(np.array(costs)) + culled_plans = plans[is_efficient] + thinned_plans = ( + culled_plans + if len(culled_plans) <= points + else _ref_thin_vector(np.array(culled_plans), points) + ) + return thinned_plans + + +@pytest.mark.parametrize("num_costs", [1, 10, 30, 100, 300, 1000]) +def test_get_pareto_frontier(num_costs): + cost_low = 1 + cost_high = 100 + dims = 2 + costs = [] + for i in range(num_costs): + costs.append(list(np.random.randint(cost_low, cost_high, size=(dims,)))) + reference = list(_ref_get_pareto_frontier(np.array(costs))) + result = get_pareto_frontier(costs) + assert result == reference + + +@pytest.mark.parametrize("vec_length", [0, 1, 10, 25, 100]) +@pytest.mark.parametrize("max_size", [0, 1, 2, 5, 11, 51]) +def test_thin_vector(vec_length, max_size): + def _make_vector(length): + vector = [] + for i in range(length): + obj = IntImm("int32", i) + vector.append(obj) + + return vector + + vector = _make_vector(vec_length) + reference = list(_ref_thin_vector(np.array(vector), max_size)) + result = thin_vector(vector, max_size) + assert result == reference + + +@pytest.mark.parametrize("num_plans", [0, 1, 10, 25, 100]) +@pytest.mark.parametrize("max_plans", [0, 1, 2, 5, 11, 51]) +def test_pareto_cull_plans(num_plans, max_plans, SRAM): + memory_usage_low = 1 + memory_usage_high = 1000 + cycles_low = 100 + cycles_high = 10000 + + def _make_plan(memory_usage, cycles): + output_config = TensorConfig( + tensor=Tensor([1], "int8"), + home_region=SRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.RECOMPUTE, + stripe_configs=[StripeConfig([1], [1], [1], [1], [1], [0])], + ) + return Plan( + tensor_configs={}, + open_configs=[], + output_config=output_config, + part_group=[], + interior_region=SRAM, + memory_usage=memory_usage, + cycles=cycles, + ) + + def _make_plans(num): + plans = [] + for _ in range(num): + memory_usage = np.random.randint(memory_usage_low, memory_usage_high) + cycles = np.random.randint(cycles_low, cycles_high) + plan = _make_plan(memory_usage, cycles) + plans.append(plan) + + return plans + + plans = _make_plans(num_plans) + reference = list(_ref_pareto_cull_plans(plans, max_plans)) + result = pareto_cull_plans(plans, max_plans) + assert result == reference + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git 
a/tests/python/contrib/test_ethosu/cascader/test_plan.py b/tests/python/contrib/test_ethosu/cascader/test_plan.py new file mode 100644 index 000000000000..ddc40b49ac8a --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_plan.py @@ -0,0 +1,244 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm.contrib.ethosu.cascader as cs + +import pytest + + +def test_plan(DRAM, SRAM): + subgraph = cs.TESubgraph([], None) + part = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([10, 10], "uint8") + tensor_2 = cs.Tensor([10, 10], "uint8") + + part.set_input(0, tensor_1) + part.set_output(tensor_2) + tensor_1.add_consumer(part) + tensor_2.add_producer(part) + + output_stripe_config = cs.StripeConfig( + shape=[5, 5], + extent=[10, 10], + strides=[5, 5], + order=[1, 2], + stripes=[2, 2], + offset=[0, 0], + ) + tensor_config_out = cs.TensorConfig( + tensor=tensor_2, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[output_stripe_config], + copy_tensor=False, + ) + input_stripe_config = part.calculate_input_stripe_configs(output_stripe_config)[0] + tensor_config_in = cs.TensorConfig( + tensor=tensor_1, + home_region=DRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_config], + copy_tensor=False, + ) + tensor_configs = {tensor_1: tensor_config_in, tensor_2: tensor_config_out} + open_configs = frozenset([tensor_config_in]) + part_group = frozenset([part]) + interior_region = SRAM + memory_usage = 100 + cycles = 20 + plan = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_out, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + assert plan.tensor_configs == tensor_configs + assert plan.open_configs == open_configs + assert plan.output_config == tensor_config_out + assert plan.part_group == part_group + assert plan.interior_region == interior_region + assert plan.memory_usage == memory_usage + assert plan.cycles == cycles + + +def test_plan_merge(DRAM, SRAM): + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + part_2 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[0, 0, 6], [0, 0, 6], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[1, 0], [0, 1]], + [0], + ), + ], + ) + tensor_1 = cs.Tensor([20, 20], "uint8") + tensor_2 = cs.Tensor([10, 10], "uint8") + tensor_3 = cs.Tensor([6, 6], "uint8") + tensor_4 = cs.Tensor([10], "uint8") + tensor_5 = cs.Tensor([10, 
10], "uint8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + part_2.set_input(0, tensor_2) + part_2.set_input(1, tensor_3) + part_2.set_input(2, tensor_4) + part_2.set_output(tensor_5) + tensor_2.add_consumer(part_2) + tensor_3.add_consumer(part_2) + tensor_4.add_consumer(part_2) + tensor_5.add_producer(part_2) + + output_stripe_config = cs.StripeConfig( + shape=[5, 5], + extent=[10, 10], + strides=[5, 5], + order=[1, 2], + stripes=[2, 2], + offset=[0, 0], + ) + tensor_config_5 = cs.TensorConfig( + tensor=tensor_5, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[output_stripe_config], + copy_tensor=False, + ) + input_stripe_configs = part_2.calculate_input_stripe_configs(output_stripe_config) + tensor_config_4 = cs.TensorConfig( + tensor=tensor_4, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[input_stripe_configs[2]], + copy_tensor=False, + ) + tensor_config_3 = cs.TensorConfig( + tensor=tensor_3, + home_region=SRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[input_stripe_configs[1]], + copy_tensor=False, + ) + tensor_config_2 = cs.TensorConfig( + tensor=tensor_2, + home_region=SRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_configs[0]], + copy_tensor=False, + ) + input_stripe_config = part_1.calculate_input_stripe_configs(input_stripe_configs[0])[0] + tensor_config_1 = cs.TensorConfig( + tensor=tensor_1, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_config], + copy_tensor=False, + ) + tensor_configs = {tensor_1: tensor_config_1, tensor_2: tensor_config_2} + open_configs = frozenset([tensor_config_2]) + part_group = frozenset([part_1]) + interior_region = SRAM + memory_usage = 100 + cycles = 20 + plan_1 = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_2, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + tensor_configs = { + tensor_2: tensor_config_2, + tensor_3: tensor_config_3, + tensor_4: tensor_config_4, + tensor_5: tensor_config_5, + } + open_configs = frozenset([tensor_config_2, tensor_config_3]) + part_group = frozenset([part_2]) + interior_region = SRAM + memory_usage = 200 + cycles = 30 + plan_2 = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_5, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + merged_plan = plan_1.merge(plan_2) + + assert merged_plan.tensor_configs == { + tensor_1: tensor_config_1, + tensor_2: tensor_config_2, + tensor_3: tensor_config_3, + tensor_4: tensor_config_4, + tensor_5: tensor_config_5, + } + assert merged_plan.open_configs == frozenset([tensor_config_3]) + assert merged_plan.output_config == tensor_config_5 + assert merged_plan.part_group == frozenset([part_1, part_2]) + assert merged_plan.interior_region == interior_region + assert merged_plan.memory_usage == plan_1.memory_usage + plan_2.memory_usage + assert merged_plan.cycles == plan_1.cycles + plan_2.cycles + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py 
b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py new file mode 100644 index 000000000000..753a2710774e --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +import tvm.contrib.ethosu.cascader as cs +from .infra import make_simple_home_map, make_options + +from tvm.contrib.ethosu.cascader.plan_generator import ( + generate_output_stripe_configs, + generate_single_plans, + generate_graph_plans, +) + + +def test_generate_output_stripe_configs(): + stripe_factors = 3 + expected_configs = 13 + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([800, 800], "uint8") + tensor_2 = cs.Tensor([400, 400], "uint8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + assert len(generate_output_stripe_configs(part_1, stripe_factors)) == expected_configs + + +def test_generate_single_plans(SRAM, DRAM): + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([800, 800], "int8") + tensor_2 = cs.Tensor([400, 400], "int8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + home_map = { + tensor_1: [SRAM, DRAM], + tensor_2: [SRAM], + } + options = make_options(cascade_region=SRAM, stripe_factors=1) + output_stripe_configs = generate_output_stripe_configs(part_1, options.stripe_factors) + plans = generate_single_plans(part_1, output_stripe_configs, home_map, options) + for plan in plans: + assert plan.interior_region == SRAM + assert plan.part_group == frozenset([part_1]) + assert set(plan.tensor_configs.keys()) == set([tensor_1, tensor_2]) + for open_config in plan.open_configs: + assert open_config.state == cs.TensorConfigState.INTERIOR + + +def test_generate_graph_plans(SRAM, DRAM): + num_part_groups = 3 + stripe_factors = 4 + max_plan_size = 10 + subgraph = cs.TESubgraph([], None) + part_a = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[0, 1, 0], [1, 0, 0], [0, 0, 1]], + [-1, -1], + ), + ], + ) + part_b = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([10, 10], "int8") + tensor_2 = cs.Tensor([9, 9], "int8") + tensor_3 = cs.Tensor([10, 10], "int8") + tensor_4 = cs.Tensor([10, 10], "int8") + + part_a.set_input(0, tensor_1) + part_a.set_input(1, tensor_2) + part_a.set_output(tensor_3) + 
tensor_1.add_consumer(part_a) + tensor_2.add_consumer(part_a) + tensor_3.add_producer(part_a) + part_b.set_input(0, tensor_3) + part_b.set_output(tensor_4) + tensor_3.add_consumer(part_b) + tensor_4.add_producer(part_b) + + graph = cs.CascaderGraph([tensor_1, tensor_2], [tensor_4]) + home_map = { + tensor_1: [SRAM, DRAM], + tensor_2: [SRAM], + tensor_3: [SRAM], + tensor_4: [SRAM, DRAM], + } + + options = make_options( + cascade_region=SRAM, + stripe_factors=stripe_factors, + max_plan_size=max_plan_size, + ) + closed_plans = generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + +def test_plan_generator_two_conv2d(FLASH, SRAM, TwoConv2DGraph): + num_part_groups = 3 + graph = TwoConv2DGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) + + closed_plans = generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + +def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGraph): + num_part_groups = 4 # Note this is not 6 because 'slice' has an opaque Propagator + graph = TwoConv2DWithSliceGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) + + closed_plans = generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py new file mode 100644 index 000000000000..68290e667eb0 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
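The TensorConfig tests that follow exercise the copy mechanism described in tensor_config.h: a tensor keeps its home region but can be marked for copying into a faster region for the duration of a Plan. A small sketch of such a config; the regions, shape and stripe values are illustrative:

import tvm.contrib.ethosu.cascader as cs

flash = cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4)
sram = cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16)

weights = cs.Tensor([3, 3, 32, 16], "int8")
stripe_config = cs.StripeConfig(
    shape=[3, 3, 32, 16], extent=[3, 3, 32, 16], strides=[3, 3, 32, 16],
    order=[1, 2, 3, 4], stripes=[1, 1, 1, 1], offset=[0, 0, 0, 0],
)
config = cs.TensorConfig(
    tensor=weights,
    home_region=flash,                    # where the weights normally live
    state=cs.TensorConfigState.BOUNDARY,
    buffer_mode=cs.BufferMode.RECOMPUTE,
    stripe_configs=[stripe_config],
    copy_tensor=True,                     # ...but copy them...
    copy_region=sram,                     # ...into the faster region for the Plan
)
assert config.copy_tensor and config.copy_region == sram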
+from tvm.contrib.ethosu.cascader import ( + StripeConfig, + Tensor, + MemoryRegion, + TensorConfig, + TensorConfigState, + BufferMode, +) + +import pytest + + +def test_tensor_config(DRAM, SRAM): + stripe_config = StripeConfig( + shape=[1, 2, 3], + extent=[2, 3, 4], + strides=[3, 4, 5], + order=[4, 5, 6], + stripes=[5, 6, 7], + offset=[6, 7, 8], + ) + tensor = Tensor( + shape=[10, 10, 10], + dtype="int8", + ) + home_region = DRAM + state = TensorConfigState.BOUNDARY + buffer_mode = BufferMode.ROLLING + copy_tensor = True + copy_region = SRAM + tensor_config = TensorConfig( + tensor=tensor, + home_region=home_region, + state=state, + buffer_mode=buffer_mode, + stripe_configs=[stripe_config], + copy_tensor=copy_tensor, + copy_region=copy_region, + ) + + assert tensor_config.tensor == tensor + assert tensor_config.home_region == home_region + assert tensor_config.state == state + assert tensor_config.buffer_mode == buffer_mode + assert tensor_config.stripe_configs == [stripe_config] + assert tensor_config.copy_tensor == copy_tensor + assert tensor_config.copy_region == copy_region + assert hash(tensor_config) != 0 + + +def test_get_rolling_buffer(DRAM): + stripe_config = StripeConfig( + shape=[9, 4, 7], + extent=[9, 16, 21], + strides=[3, 5, 7], + order=[1, 3, 2], + stripes=[1, 3, 3], + offset=[0, 0, 0], + ) + tensor = Tensor(shape=[9, 16, 21], dtype="int32", compression_ratio=0.5) + tensor_config = TensorConfig( + tensor=tensor, + home_region=DRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.ROLLING, + stripe_configs=[stripe_config], + ) + + assert tensor_config.get_buffer_size() == 2016 + + +def test_get_recompute_buffer(DRAM): + stripe_config = StripeConfig( + shape=[4, 5, 7], + extent=[6, 7, 14], + strides=[2, 3, 7], + order=[1, 3, 2], + stripes=[2, 2, 2], + offset=[0, 0, 0], + ) + tensor = Tensor(shape=[6, 7, 14], dtype="int32", compression_ratio=0.5) + tensor_config = TensorConfig( + tensor=tensor, + home_region=DRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.RECOMPUTE, + stripe_configs=[stripe_config], + ) + + assert tensor_config.get_buffer_size() == 280 + + +if __name__ == "__main__": + pytest.main([__file__]) From 714ad55bd2d6bd67c66826f87b6e464378053583 Mon Sep 17 00:00:00 2001 From: Matthew Barrett Date: Tue, 1 Feb 2022 11:33:03 +0000 Subject: [PATCH 2/4] Fixes to lint/test Change-Id: If4e083a3c96af75a8ffa72510704818d21a477d9 --- src/contrib/ethosu/cascader/plan_generator.cc | 3 +- .../contrib/test_ethosu/cascader/conftest.py | 32 +-- .../contrib/test_ethosu/cascader/infra.py | 217 +++++++++--------- .../cascader/test_plan_generator.py | 47 ++-- 4 files changed, 154 insertions(+), 145 deletions(-) diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc index 7cb6baf39159..9acffb7e9479 100644 --- a/src/contrib/ethosu/cascader/plan_generator.cc +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -188,8 +188,7 @@ std::vector GetPossibleInputConfigs(const StripeConfig& stripe_con std::vector configs; for (const auto& home_region : home_regions) { // Boundary configs - if (home_region == options->cascade_region || - tensor->GetSize() > options->always_copy_size) { + if (home_region == options->cascade_region || tensor->GetSize() > options->always_copy_size) { configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); } diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py 
b/tests/python/contrib/test_ethosu/cascader/conftest.py index 9fdee01bd4e7..21ed401994c1 100644 --- a/tests/python/contrib/test_ethosu/cascader/conftest.py +++ b/tests/python/contrib/test_ethosu/cascader/conftest.py @@ -22,13 +22,29 @@ except ImportError: ethosu_enabled = False +import tvm.contrib.ethosu.cascader as cs + + +@pytest.fixture +def FLASH(): + return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) + + +@pytest.fixture +def DRAM(): + return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8) + + +@pytest.fixture +def SRAM(): + return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) + if ethosu_enabled: import tvm from tvm import relay from tvm.relay.testing import run_opt_pass - import tvm.contrib.ethosu.cascader as cs from .infra import create_te_graph from ..infra import ( make_ethosu_conv2d, @@ -36,18 +52,6 @@ make_ethosu_binary_elementwise, ) - @pytest.fixture - def FLASH(): - return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) - - @pytest.fixture - def DRAM(): - return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8) - - @pytest.fixture - def SRAM(): - return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) - def make_TwoConv2DTE(): def _get_func(): ifm = relay.var("ifm", shape=(1, 12, 12, 8), dtype="int8") @@ -134,7 +138,7 @@ def _get_func(): @pytest.fixture def TwoConv2DWithSliceTE(): return make_TwoConv2DWithSliceTE() - + @pytest.fixture def TwoConv2DWithSliceGraph(): _, te_graph, const_dict = make_TwoConv2DWithSliceTE() diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py index 23d0640e2de6..aa681c41f210 100644 --- a/tests/python/contrib/test_ethosu/cascader/infra.py +++ b/tests/python/contrib/test_ethosu/cascader/infra.py @@ -14,10 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+ethosu_enabled = True +try: + import ethosu.vela +except ImportError: + ethosu_enabled = False + import tvm from tvm import relay -from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te import tvm.contrib.ethosu.cascader as cs +import numpy as np def make_options( @@ -36,120 +42,119 @@ def make_options( ) -import numpy as np +def make_simple_home_map(graph, var_region, const_region): + home_map = {} + for tensor in graph.tensor_order: + if tensor.is_constant: + home_map[tensor] = [const_region] + else: + home_map[tensor] = [var_region] + return home_map -def create_te_graph(func): - func, consts = extract_constants(func) - mod = tvm.IRModule.from_expr(func) - func = relay.transform.InferType()(mod)["main"] - te_graph = lower_to_te(func) - return te_graph, consts +if ethosu_enabled: + from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te -def make_matrices( - op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1 -): - kernel_h, kernel_w = kernel - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] - if op_type == "ethosu_conv2d": - ifm_matrix = [ - [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], - [0, 0, 0, 0, ifm_channels], - [0, 0, 0, 0, 1], - ] - weight_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, kernel_h], - [0, 0, 0, 0, kernel_w], - [0, 0, 0, 0, ifm_channels], - [0, 0, 0, 0, 1], - ] - elif op_type == "ethosu_depthwise_conv2d": - ifm_matrix = [ + def create_te_graph(func): + func, consts = extract_constants(func) + mod = tvm.IRModule.from_expr(func) + func = relay.transform.InferType()(mod)["main"] + te_graph = lower_to_te(func) + return te_graph, consts + + def make_matrices( + op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1 + ): + kernel_h, kernel_w = kernel + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], - [0, 0, 0, 1, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1 / 16, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] - weight_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, kernel_h], - [0, 0, 0, 0, kernel_w], - [0, 0, 0, 0, 1], - [0, 0, 0, 0, 1], + nhcwb16_to_nhwc = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 16, 0, 1, -16], + [0, 0, 0, 0, 0, 1], ] - elif op_type == "ethosu_pooling": - ifm_matrix = [ - [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + if op_type == "ethosu_conv2d": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 0, ifm_channels], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 1, 0], + [0, 0, 0, 0, kernel_h], + [0, 0, 0, 0, kernel_w], + [0, 0, 0, 0, 
ifm_channels], + [0, 0, 0, 0, 1], + ] + elif op_type == "ethosu_depthwise_conv2d": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 1, 0], + [0, 0, 0, 0, kernel_h], + [0, 0, 0, 0, kernel_w], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + ] + elif op_type == "ethosu_pooling": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ] + scale_bias_matrix = [ [0, 0, 0, 1, 0], + [0, 0, 0, 0, 10], [0, 0, 0, 0, 1], ] - weight_matrix = [ - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - scale_bias_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 10], - [0, 0, 0, 0, 1], - ] - if ofm_layout == "NHCWB16": - ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() - weight_matrix = np.matmul(weight_matrix, nhcwb16_to_nhwc).tolist() - scale_bias_matrix = np.matmul(scale_bias_matrix, nhcwb16_to_nhwc).tolist() - if ifm_layout == "NHCWB16": - ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() - - ifm_offset = ( - [0, -padding[0], -padding[1], 0] - if ifm_layout == "NHWC" - else [0, -padding[0], 0, -padding[1], 0] - ) - weight_offset = [0, 0, 0, 0] - scale_bias_offset = [0, 0] - return ( - ifm_matrix, - ifm_offset, - weight_matrix, - weight_offset, - scale_bias_matrix, - scale_bias_offset, - ) - + if ofm_layout == "NHCWB16": + ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() + weight_matrix = np.matmul(weight_matrix, nhcwb16_to_nhwc).tolist() + scale_bias_matrix = np.matmul(scale_bias_matrix, nhcwb16_to_nhwc).tolist() + if ifm_layout == "NHCWB16": + ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() -def make_simple_home_map(graph, var_region, const_region): - home_map = {} - for tensor in graph.tensor_order: - if tensor.is_constant: - home_map[tensor] = [const_region] - else: - home_map[tensor] = [var_region] - - return home_map + ifm_offset = ( + [0, -padding[0], -padding[1], 0] + if ifm_layout == "NHWC" + else [0, -padding[0], 0, -padding[1], 0] + ) + weight_offset = [0, 0, 0, 0] + scale_bias_offset = [0, 0] + return ( + ifm_matrix, + ifm_offset, + weight_matrix, + weight_offset, + scale_bias_matrix, + scale_bias_offset, + ) diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py index 753a2710774e..34cd93c54e4f 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py +++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py @@ -17,7 +17,7 @@ import pytest import tvm.contrib.ethosu.cascader as cs -from .infra import make_simple_home_map, make_options +from .infra import make_simple_home_map, make_options, ethosu_enabled from tvm.contrib.ethosu.cascader.plan_generator import ( generate_output_stripe_configs, @@ -145,34 +145,35 @@ def test_generate_graph_plans(SRAM, DRAM): assert len(closed_plans) == num_part_groups -def test_plan_generator_two_conv2d(FLASH, SRAM, TwoConv2DGraph): - num_part_groups = 3 - graph = TwoConv2DGraph - home_map = make_simple_home_map(graph, SRAM, FLASH) - options = make_options( - cascade_region=SRAM, - stripe_factors=4, - max_plan_size=10, - ) +if ethosu_enabled: - 
closed_plans = generate_graph_plans(graph, home_map, options) + def test_plan_generator_two_conv2d(FLASH, SRAM, TwoConv2DGraph): + num_part_groups = 3 + graph = TwoConv2DGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) - assert len(closed_plans) == num_part_groups + closed_plans = generate_graph_plans(graph, home_map, options) + assert len(closed_plans) == num_part_groups -def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGraph): - num_part_groups = 4 # Note this is not 6 because 'slice' has an opaque Propagator - graph = TwoConv2DWithSliceGraph - home_map = make_simple_home_map(graph, SRAM, FLASH) - options = make_options( - cascade_region=SRAM, - stripe_factors=4, - max_plan_size=10, - ) + def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGraph): + num_part_groups = 4 # Note this is not 6 because 'slice' has an opaque Propagator + graph = TwoConv2DWithSliceGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) - closed_plans = generate_graph_plans(graph, home_map, options) + closed_plans = generate_graph_plans(graph, home_map, options) - assert len(closed_plans) == num_part_groups + assert len(closed_plans) == num_part_groups if __name__ == "__main__": From 854b1e0cd51f6e863a583224128abc7caa531ec2 Mon Sep 17 00:00:00 2001 From: Matthew Barrett Date: Wed, 2 Feb 2022 15:16:08 +0000 Subject: [PATCH 3/4] Improve python docs Change-Id: I831137f8235665bc20ab4c060cc7049ffd48088a --- .../ethosu/cascader/cascader_options.py | 18 +++- python/tvm/contrib/ethosu/cascader/pareto.py | 6 +- python/tvm/contrib/ethosu/cascader/plan.py | 76 ++++++++++++++- .../contrib/ethosu/cascader/plan_generator.py | 6 +- .../contrib/ethosu/cascader/tensor_config.py | 92 ++++++++++++++++++- .../test_ethosu/cascader/test_pareto.py | 12 ++- .../cascader/test_plan_generator.py | 18 ++-- 7 files changed, 202 insertions(+), 26 deletions(-) diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py index 83a222589deb..ff831ff37990 100644 --- a/python/tvm/contrib/ethosu/cascader/cascader_options.py +++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py @@ -25,7 +25,23 @@ @tvm._ffi.register_object("contrib.ethosu.cascader.CascaderOptions") class CascaderOptions(Object): - """A class to hold configuration options for the cascader.""" + """ + A class to hold configuration options for the cascader. + + Attributes + ---------- + cascade_region : MemoryRegion + The MemoryRegion to place cascading buffers into. + max_proposals : int + The maximum number of Proposals to generate. + stripe_factors : int + How many striping factors to try per axis. + max_plan_size : int + The maximum number of Parts in a Plan. + always_copy_size : int + The maximum size of a Tensor that will always be copied into the cascade region. 
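As a usage note, the options object is typically built once and threaded through the generator calls. A minimal sketch with values in line with those used by the tests in this patch; the SRAM region is illustrative:

import tvm.contrib.ethosu.cascader as cs

sram = cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16)
options = cs.CascaderOptions(
    cascade_region=sram,    # cascading buffers are allocated in this region
    max_proposals=1,
    stripe_factors=4,
    max_plan_size=10,
    always_copy_size=1024,  # tensors up to 1 KiB are always copied into the cascade region
)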
+
+    """
 
     def __init__(
         self,
diff --git a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py
index 985937696b2e..3c4dcbc88a45 100644
--- a/python/tvm/contrib/ethosu/cascader/pareto.py
+++ b/python/tvm/contrib/ethosu/cascader/pareto.py
@@ -23,7 +23,7 @@
 from .plan import Plan
 
 
-def get_pareto_frontier(costs: List[List[float]]) -> List[bool]:
+def _get_pareto_frontier(costs: List[List[float]]) -> List[bool]:
     for i, cost in enumerate(costs):
         for j, value in enumerate(cost):
             costs[i][j] = float(value)
@@ -31,9 +31,9 @@ def get_pareto_frontier(costs: List[List[float]]) -> List[bool]:
     return [bool(v) for v in _ffi_api.GetParetoFrontier(costs)]
 
 
-def thin_vector(vec: List[Object], max_size: int) -> List[Object]:
+def _thin_vector(vec: List[Object], max_size: int) -> List[Object]:
     return list(_ffi_api.ThinVector(vec, max_size))
 
 
-def pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]:
+def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]:
     return list(_ffi_api.ParetoCullPlans(plans, max_plans))
diff --git a/python/tvm/contrib/ethosu/cascader/plan.py b/python/tvm/contrib/ethosu/cascader/plan.py
index 1d2396a8026e..f960911ca133 100644
--- a/python/tvm/contrib/ethosu/cascader/plan.py
+++ b/python/tvm/contrib/ethosu/cascader/plan.py
@@ -27,7 +27,43 @@
 
 @tvm._ffi.register_object("contrib.ethosu.cascader.Plan")
 class Plan(Object):
-    """Plan class"""
+    """
+    A class which describes how to schedule a subgraph of Parts together.
+
+    A Plan takes the form of a subgraph of connected Parts (recorded in part_group) with
+    TensorConfigs for all of the required Tensors (recorded in tensor_configs). This information
+    can be used to produce a Tensor Expression schedule with inter-operator scheduling. A Plan is
+    necessarily single-output such that all non-output Parts are 'computed_at'ed into the scope
+    of the output Part. This is what achieves the technique referred to as 'cascading'. A Plan
+    also has an interior memory region which specifies the region of memory into which all the
+    Plan's intermediate buffers should be allocated.
+
+    Additionally, a Plan contains some other information used during the Plan generation and
+    selection algorithms. Both the memory and cycles required to run the Plan are accounted for so
+    that Plans can be ranked and Pareto-culled on these metrics. Furthermore, the TensorConfigs
+    which are 'open' are recorded, indicating that these are valid points to merge with another
+    Plan. A Plan can only be turned into a schedule if it has no 'open' TensorConfigs - at which
+    point the Plan is said to be 'closed'.
+
+    Attributes
+    ----------
+    tensor_configs : Dict[Tensor, TensorConfig]
+        The TensorConfigs specified by the Plan.
+    open_configs : FrozenSet[TensorConfig]
+        The TensorConfigs which are 'open' meaning they are a Plan input/output but have
+        'interior' state.
+    output_config : TensorConfig
+        The TensorConfig of the Plan's output tensor.
+    part_group : FrozenSet[Part]
+        The Parts which are covered by the Plan.
+    interior_region : MemoryRegion
+        The MemoryRegion in which to store 'interior' Plan buffers.
+    memory_usage : int
+        The interior memory used by the Plan in bytes.
+    cycles : int
+        The cycles taken to execute the Plan.
+
+    """
 
     def __init__(
         self,
@@ -51,13 +87,36 @@ def __init__(
         )
 
     def merge(self, other):
-        return _ffi_api.PlanMerge(self, other)
+        """
+        Merge two Plans which share an 'open' TensorConfig.
- def benchmark_merge(self, other, repeats): - return _ffi_api.PlanMergeBenchmark(self, other, repeats) + The current Plan is referred to as the 'upper Plan' and the other Plan as the 'lower + Plan'. The 'open' output config of the upper Plan must be an 'open' input config of the + lower Plan. The Tensor referenced by these configs is the Tensor on which the two Plans + will be merged. The merge process does the following: + + The tensor config maps will be merged with TensorConfigs from the upper Plan taking + priority. The open configs will be merged with the TensorConfigs that are being merged + having been removed. The output config will be that of the lower Plan. The part groups + will be merged. The interior region is necessarily the same for both the upper and lower + Plan. The cycles and memory usage will be summed. + + Parameters + ---------- + other : Plan + The Plan to merge with. + + Return + ------ + Plan + The merged Plan. + + """ + return _ffi_api.PlanMerge(self, other) @property def tensor_configs(self): + """The TensorConfigs specified by the Plan.""" tensor_configs = {} for config in self._tensor_configs: tensor_configs[config.tensor] = config @@ -65,26 +124,35 @@ def tensor_configs(self): @property def open_configs(self): + """ + The TensorConfigs which are 'open' meaning they are a Plan input/output but have + 'interior' state. + """ return frozenset(self._open_configs) @property def output_config(self): + """The TensorConfig of the Plan's output tensor.""" return self._output_config @property def part_group(self): + """The Parts which are covered by the Plan.""" return frozenset(self._part_group) @property def interior_region(self): + """The MemoryRegion in which to store 'interior' Plan buffers.""" return self._interior_region @property def memory_usage(self): + """The interior memory used by the Plan in bytes.""" return self._memory_usage @property def cycles(self): + """The cycles taken to execute the Plan.""" return self._cycles def __repr__(self): diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py b/python/tvm/contrib/ethosu/cascader/plan_generator.py index b36799d903cf..36e0cf4420ea 100644 --- a/python/tvm/contrib/ethosu/cascader/plan_generator.py +++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py @@ -26,11 +26,11 @@ from .graph import CascaderGraph, Part, Tensor -def generate_output_stripe_configs(part: Part, stripe_factors: int) -> List[StripeConfig]: +def _generate_output_stripe_configs(part: Part, stripe_factors: int) -> List[StripeConfig]: return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors)) -def generate_single_plans( +def _generate_single_plans( part: Part, output_stripe_configs: List[StripeConfig], home_map: Dict[Tensor, List[MemoryRegion]], @@ -39,7 +39,7 @@ def generate_single_plans( return list(_ffi_api.GenerateSinglePlans(part, output_stripe_configs, home_map, cascade_region)) -def generate_graph_plans( +def _generate_graph_plans( graph: CascaderGraph, home_map: Dict[Tensor, List[MemoryRegion]], options: CascaderOptions, diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py index 493c45ea0f40..6787ea4f052e 100644 --- a/python/tvm/contrib/ethosu/cascader/tensor_config.py +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -28,13 +28,35 @@ class TensorConfigState(IntEnum): + """ + The 'state' of a TensorConfig as used in the Plan generation algorithm. + + BOUNDARY - Should describe a Plan input/output Tensor. 
+    INTERIOR - Should describe an intermediate Tensor in a 'closed' Plan.
+
+    """
+
     BOUNDARY = 0
     INTERIOR = 1
 
 
 @tvm._ffi.register_object("contrib.ethosu.cascader.MemoryRegion")
 class MemoryRegion(Object):
-    """MemoryRegion class"""
+    """
+    MemoryRegion class to store information about device memories.
+
+    Attributes
+    ----------
+    name : str
+        The name of the region.
+    size : int
+        The size of the region.
+    read_bandwidth : int
+        The read bandwidth of the region in bytes per cycle.
+    write_bandwidth : int
+        The write bandwidth of the region in bytes per cycle.
+
+    """
 
     def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int):
         self.__init_handle_by_constructor__(
@@ -44,7 +66,55 @@ def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: i
 
 @tvm._ffi.register_object("contrib.ethosu.cascader.TensorConfig")
 class TensorConfig(Object):
-    """TensorConfig class"""
+    """
+    A class which describes how to realize a Tensor.
+
+    The TensorConfig describes both how a Tensor is scheduled (the order in which it's
+    produced/consumed) and how it's allocated in memory (which region it should reside in
+    and whether it should be copied).
+
+    Attributes
+    ----------
+    tensor : Tensor
+        The Tensor the config applies to.
+    home_region : MemoryRegion
+        The region where the tensor is allocated.
+    state : TensorConfigState
+        The state of the TensorConfig.
+
+        The TensorConfigState is only used as part of the Plan generation algorithm. For a Plan
+        to be 'closed' (and therefore not subject to any further merging), all the TensorConfigs
+        that describe Plan input or output Tensors must be in the 'BOUNDARY' state with the rest
+        being 'INTERIOR'. If any of the input or output tensors are described by an 'INTERIOR'
+        TensorConfig, then the Plan is 'open' and should be merged with other 'open' Plans until
+        the result becomes 'closed'.
+    buffer_mode : BufferMode
+        The mode in which the buffer should be realized.
+
+        There are multiple buffering strategies by which a tensor may be realized (computed).
+        These affect the amount of recomputation necessary as well as the size of buffer required
+        to store the tensor. See 'BufferMode' for a description of the allowable buffering modes.
+    stripe_configs : List[StripeConfig]
+        The StripeConfigs with which to compute the tensor.
+
+        The StripeConfigs determine the order in which the elements of the tensor should be
+        computed, including potentially computing them multiple times (recompute). Multiple
+        StripeConfigs are used over just a single StripeConfig for the case where the tensor is
+        consumed by two different Parts executing themselves with different StripeConfigs. In this
+        case, there is a StripeConfig per consumer of the tensor.
+    copy_tensor : bool, optional
+        Whether to copy the tensor.
+
+        While a tensor will originally reside in its home region, the TensorConfig may optionally
+        specify that the tensor should be copied (according to the StripeConfigs) into another
+        MemoryRegion. As an example for where this may be used, if a weights tensor initially
+        resides in slow Flash memory then necessarily the home region will be Flash. However, if
+        the weight values are used multiple times by a Part, it may be more performant to choose
+        to copy the weights into a faster memory like SRAM.
+    copy_region : Union[MemoryRegion, None], optional
+        The region to copy the tensor to.
+
+    """
 
     def __init__(
         self,
@@ -70,34 +140,52 @@ def __init__(
         )
 
     def get_buffer_size(self):
+        """
+        The size of the buffer needed for the TensorConfig.
+ + The size of buffer necessary to store a tensor being produced using the TensorConfig is + not necessarily just the size of the tensor. In Plans, a tensor may be being produced and + consumed in 'stripes' which are smaller than the full tensor. Therefore, the buffer + necessary to store the tensor may only need to be as large as the stripe. The precise size + of the buffer will depend both on the BufferMode and StripeConfigs (as well as, of course, + the Tensor). + + """ return _ffi_api.TensorConfigGetBufferSize(self) @property def tensor(self): + """The Tensor the config applies to.""" return self._tensor @property def home_region(self): + """The region where the tensor is allocated.""" return self._home_region @property def state(self): + """The state of the TensorConfig.""" return TensorConfigState(self._state) @property def buffer_mode(self): + """The mode in which the buffer should be realized.""" return BufferMode(self._buffer_mode) @property def stripe_configs(self): + """The StripeConfigs with which to compute the tensor.""" return list(self._stripe_configs) @property def copy_tensor(self): + """Whether to copy the tensor.""" return bool(self._copy_tensor) @property def copy_region(self): + """The region to copy the tensor to.""" return self._copy_region def __hash__(self): diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py index 0e9393d69061..2d897a79310f 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_pareto.py +++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py @@ -15,7 +15,11 @@ # specific language governing permissions and limitations # under the License. from tvm.tir import IntImm -from tvm.contrib.ethosu.cascader.pareto import get_pareto_frontier, thin_vector, pareto_cull_plans +from tvm.contrib.ethosu.cascader.pareto import ( + _get_pareto_frontier, + _thin_vector, + _pareto_cull_plans, +) from tvm.contrib.ethosu.cascader import ( Plan, StripeConfig, @@ -78,7 +82,7 @@ def test_get_pareto_frontier(num_costs): for i in range(num_costs): costs.append(list(np.random.randint(cost_low, cost_high, size=(dims,)))) reference = list(_ref_get_pareto_frontier(np.array(costs))) - result = get_pareto_frontier(costs) + result = _get_pareto_frontier(costs) assert result == reference @@ -95,7 +99,7 @@ def _make_vector(length): vector = _make_vector(vec_length) reference = list(_ref_thin_vector(np.array(vector), max_size)) - result = thin_vector(vector, max_size) + result = _thin_vector(vector, max_size) assert result == reference @@ -137,7 +141,7 @@ def _make_plans(num): plans = _make_plans(num_plans) reference = list(_ref_pareto_cull_plans(plans, max_plans)) - result = pareto_cull_plans(plans, max_plans) + result = _pareto_cull_plans(plans, max_plans) assert result == reference diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py index 34cd93c54e4f..ffee071f0e95 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py +++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py @@ -20,9 +20,9 @@ from .infra import make_simple_home_map, make_options, ethosu_enabled from tvm.contrib.ethosu.cascader.plan_generator import ( - generate_output_stripe_configs, - generate_single_plans, - generate_graph_plans, + _generate_output_stripe_configs, + _generate_single_plans, + _generate_graph_plans, ) @@ -47,7 +47,7 @@ def test_generate_output_stripe_configs(): 
     tensor_1.add_consumer(part_1)
     tensor_2.add_producer(part_1)
 
-    assert len(generate_output_stripe_configs(part_1, stripe_factors)) == expected_configs
+    assert len(_generate_output_stripe_configs(part_1, stripe_factors)) == expected_configs
 
 
 def test_generate_single_plans(SRAM, DRAM):
@@ -74,8 +74,8 @@ def test_generate_single_plans(SRAM, DRAM):
         tensor_2: [SRAM],
     }
     options = make_options(cascade_region=SRAM, stripe_factors=1)
-    output_stripe_configs = generate_output_stripe_configs(part_1, options.stripe_factors)
-    plans = generate_single_plans(part_1, output_stripe_configs, home_map, options)
+    output_stripe_configs = _generate_output_stripe_configs(part_1, options.stripe_factors)
+    plans = _generate_single_plans(part_1, output_stripe_configs, home_map, options)
     for plan in plans:
         assert plan.interior_region == SRAM
         assert plan.part_group == frozenset([part_1])
@@ -140,7 +140,7 @@ def test_generate_graph_plans(SRAM, DRAM):
         stripe_factors=stripe_factors,
         max_plan_size=max_plan_size,
     )
-    closed_plans = generate_graph_plans(graph, home_map, options)
+    closed_plans = _generate_graph_plans(graph, home_map, options)
 
     assert len(closed_plans) == num_part_groups
 
@@ -157,7 +157,7 @@ def test_plan_generator_two_conv2d(FLASH, SRAM, TwoConv2DGraph):
             max_plan_size=10,
         )
 
-        closed_plans = generate_graph_plans(graph, home_map, options)
+        closed_plans = _generate_graph_plans(graph, home_map, options)
 
         assert len(closed_plans) == num_part_groups
 
@@ -171,7 +171,7 @@ def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGra
             max_plan_size=10,
        )
 
-        closed_plans = generate_graph_plans(graph, home_map, options)
+        closed_plans = _generate_graph_plans(graph, home_map, options)
 
         assert len(closed_plans) == num_part_groups

From 1a0df5cf708b88f533d1a08aa03cdabf07892642 Mon Sep 17 00:00:00 2001
From: Matthew Barrett
Date: Thu, 3 Feb 2022 13:32:46 +0000
Subject: [PATCH 4/4] Fix enum hashing issue with old gcc

Change-Id: Ifbe97eb33b1ef313710f24c687a8155421a3c195
---
 src/contrib/ethosu/cascader/tensor_config.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc
index 5fec8b31e9ea..5e60f522fe4e 100644
--- a/src/contrib/ethosu/cascader/tensor_config.cc
+++ b/src/contrib/ethosu/cascader/tensor_config.cc
@@ -73,8 +73,8 @@ int TensorConfigNode::GetBufferSize() const {
 void TensorConfigNode::ComputeHash_() {
   hash_ = ObjectHash()(tensor_);
   hash_combine(&hash_, std::hash<std::string>()(home_region_->name));
-  hash_combine(&hash_, std::hash<TensorConfigState>()(state_));
-  hash_combine(&hash_, std::hash<BufferMode>()(buffer_mode_));
+  hash_combine(&hash_, std::hash<int>()(static_cast<int>(state_)));
+  hash_combine(&hash_, std::hash<int>()(static_cast<int>(buffer_mode_)));
   hash_combine(&hash_, hash_vector(stripe_configs_));
   hash_combine(&hash_, std::hash<bool>()(copy_tensor_));
   hash_combine(&hash_, std::hash<std::string>()(copy_region_->name));
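
For reference, the merge rules documented in the Plan.merge docstring above can be restated as a small, self-contained Python sketch. The FakePlan class and merge_plans function below are invented purely for illustration (they are not part of the patch series or the codebase); plain dictionaries, sets and strings stand in for the real Tensor, TensorConfig and Part objects.

from dataclasses import dataclass
from typing import Dict, FrozenSet


@dataclass(frozen=True)
class FakePlan:
    """Stand-in for cascader.Plan, holding only the fields relevant to merging."""

    tensor_configs: Dict[str, str]  # tensor name -> config description
    open_configs: FrozenSet[str]  # names of tensors with 'open' configs
    output_config: str  # name of the tensor the output config refers to
    part_group: FrozenSet[str]
    interior_region: str
    memory_usage: int
    cycles: int


def merge_plans(upper: FakePlan, lower: FakePlan) -> FakePlan:
    """Apply the merge rules described in the Plan.merge docstring."""
    # The upper Plan's 'open' output config must also be an 'open' input config
    # of the lower Plan: that shared tensor is the point the Plans merge on.
    merge_point = upper.output_config
    assert merge_point in upper.open_configs and merge_point in lower.open_configs
    # The interior region is necessarily the same for both Plans.
    assert upper.interior_region == lower.interior_region
    return FakePlan(
        # Tensor config maps are merged, with the upper Plan taking priority.
        tensor_configs={**lower.tensor_configs, **upper.tensor_configs},
        # Open configs are merged, minus the config being merged on.
        open_configs=frozenset((upper.open_configs | lower.open_configs) - {merge_point}),
        # The output config is that of the lower Plan.
        output_config=lower.output_config,
        # Part groups are merged; cycles and memory usage are summed.
        part_group=frozenset(upper.part_group | lower.part_group),
        interior_region=upper.interior_region,
        memory_usage=upper.memory_usage + lower.memory_usage,
        cycles=upper.cycles + lower.cycles,
    )


upper = FakePlan({"A": "boundary", "B": "interior"}, frozenset({"B"}), "B",
                 frozenset({"conv1"}), "SRAM", 1024, 500)
lower = FakePlan({"B": "interior", "C": "boundary"}, frozenset({"B"}), "C",
                 frozenset({"conv2"}), "SRAM", 2048, 700)
merged = merge_plans(upper, lower)
assert merged.part_group == frozenset({"conv1", "conv2"}) and merged.cycles == 1200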
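
A minimal usage sketch of the new Python objects follows. It assumes a TVM build with the Ethos(TM)-U cascader compiled in so that the FFI-backed objects are registered; the region size, bandwidths, option values and cost vectors are illustrative only, and it assumes lower values are better on every cost axis when computing the Pareto frontier.

import tvm.contrib.ethosu.cascader as cs
from tvm.contrib.ethosu.cascader.pareto import _get_pareto_frontier

# A MemoryRegion takes (name, size, read_bandwidth, write_bandwidth).
sram = cs.MemoryRegion("SRAM", 10 ** 6, 16, 16)

# CascaderOptions takes (cascade_region, max_proposals, stripe_factors,
# max_plan_size, always_copy_size).
options = cs.CascaderOptions(sram, 64, 4, 10, 1024)

# Candidate Plans are compared on cost vectors such as [memory_usage, cycles].
# _get_pareto_frontier returns a boolean mask of the candidates that are not
# dominated by any other candidate.
costs = [
    [100.0, 400.0],
    [200.0, 300.0],
    [250.0, 350.0],  # dominated by [200.0, 300.0] on both axes
    [400.0, 100.0],
]
print(_get_pareto_frontier(costs))  # the dominated entry should be marked False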