diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py index 03753b4049bb..f151fa9bc418 100644 --- a/python/tvm/contrib/ethosu/cascader/__init__.py +++ b/python/tvm/contrib/ethosu/cascader/__init__.py @@ -34,3 +34,6 @@ ) from .parts import InlinePart, EthosuPart from .device_config import EthosuDeviceConfig +from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig +from .plan import Plan +from .cascader_options import CascaderOptions diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py new file mode 100644 index 000000000000..ff831ff37990 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Object to hold options for the NPU cascader""" +import tvm._ffi + +from tvm.runtime import Object + +from . import _ffi_api +from .tensor_config import MemoryRegion + + +@tvm._ffi.register_object("contrib.ethosu.cascader.CascaderOptions") +class CascaderOptions(Object): + """ + A class to hold configuration options for the cascader. + + Attributes + ---------- + cascade_region : MemoryRegion + The MemoryRegion to place cascading buffers into. + max_proposals : int + The maximum number of Proposals to generate. + stripe_factors : int + How many striping factors to try per axis. + max_plan_size : int + The maximum number of Parts in a Plan. + always_copy_size : int + The maximum size of a Tensor that will always be copied into the cascade region. + + """ + + def __init__( + self, + cascade_region: MemoryRegion, + max_proposals: int, + stripe_factors: int, + max_plan_size: int, + always_copy_size: int, + ): + self.__init_handle_by_constructor__( + _ffi_api.CascaderOptions, + cascade_region, + max_proposals, + stripe_factors, + max_plan_size, + always_copy_size, + ) diff --git a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py new file mode 100644 index 000000000000..3c4dcbc88a45 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/pareto.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Pareto optimisation functions for the NPU cascader.""" +from typing import List + +from tvm import Object + +from . import _ffi_api +from .plan import Plan + + +def _get_pareto_frontier(costs: List[List[float]]) -> List[bool]: + for i, cost in enumerate(costs): + for j, value in enumerate(cost): + costs[i][j] = float(value) + + return [bool(v) for v in _ffi_api.GetParetoFrontier(costs)] + + +def _thin_vector(vec: List[Object], max_size: int) -> List[Object]: + return list(_ffi_api.ThinVector(vec, max_size)) + + +def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]: + return list(_ffi_api.ParetoCullPlans(plans, max_plans)) diff --git a/python/tvm/contrib/ethosu/cascader/plan.py b/python/tvm/contrib/ethosu/cascader/plan.py new file mode 100644 index 000000000000..f960911ca133 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/plan.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Plan class to hold subgraph scheduling information.""" +from typing import Dict, FrozenSet +import tvm._ffi + +from tvm.runtime import Object + +from . import _ffi_api +from .graph import Tensor, Part +from .tensor_config import TensorConfig, MemoryRegion + + +@tvm._ffi.register_object("contrib.ethosu.cascader.Plan") +class Plan(Object): + """ + A class which describes how to schedule a subgraph of Parts together. + + A Plan takes the form of a subgraph of connected Parts (recorded in part_group) with + TensorConfigs for all of the required Tensors (recorded in tensor_configs). This information + can be used to produce a Tensor Expression schedule with inter-operator scheduling. A Plan is + necessarily single-output such that all non-output Parts are 'computed_at'ed the scope of the + output Part. This is what achieves the technique referred to as 'cascading'. A Plan also has + an interior memory region which specifies the region of memory into which all the Plans + intermediate buffers should be allocated. + + Additionally, a Plan contains some other information used during the Plan generation and + selection algorithms. Both the memory and cycles required to run the Plan are accounted for so + that Plans can be ranked and Pareto-culled on these metrics. Furthermore, the TensorConfigs + which are 'open' is recorded indicating that these are valid points to merge with another Plan. + A Plan can only be turned into a schedule if it has no 'open' TensorConfigs - at which point + the Plan is said to be 'closed'. + + Attributes + ---------- + tensor_configs : Dict[Tensor, TensorConfig] + The TensorConfigs specified by the Plan. + open_configs : FrozenSet[TensorConfig] + The TensorConfigs which are 'open' meaning they are a Plan input/output but have + 'interior' state. + output_config : TensorConfig + The TensorConfig of the Plan's output tensor. + part_group : FrozenSet[Part] + The Parts which are covered by the Plan. + interior_region : MemoryRegion + The MemoryRegion in which to store 'interior' Plan buffers. + memory_usage : int + The interior memory used by the Plan in bytes. + cycles : int + The cycles taken to execute the Plan. + + """ + + def __init__( + self, + tensor_configs: Dict[Tensor, TensorConfig], + open_configs: FrozenSet[TensorConfig], + output_config: TensorConfig, + part_group: FrozenSet[Part], + interior_region: MemoryRegion, + memory_usage: int, + cycles: int, + ): + self.__init_handle_by_constructor__( + _ffi_api.Plan, + list(tensor_configs.values()), + list(open_configs), + output_config, + list(part_group), + interior_region, + memory_usage, + cycles, + ) + + def merge(self, other): + """ + Merge two Plans with share an 'open' TensorConfig. + + The current Plan is referred to as the 'upper Plan' and the other Plan as the 'lower + Plan'. The 'open' output config of the upper Plan must be an 'open' input config of the + lower Plan. The Tensor referenced by these configs is the Tensor on which the two Plans + will be merged. The merge process does the following: + + The tensor config maps will be merged with TensorConfigs from the upper Plan taking + priority. The open configs will be merged with the TensorConfigs that are being merged + having been removed. The output config will be that of the lower Plan. The part groups + will be merged. The interior region is necessarily the same for both the upper and lower + Plan. The cycles and memory usage will be summed. + + Parameters + ---------- + other : Plan + The Plan to merge with. + + Return + ------ + Plan + The merged Plan. + + """ + return _ffi_api.PlanMerge(self, other) + + @property + def tensor_configs(self): + """The TensorConfigs specified by the Plan.""" + tensor_configs = {} + for config in self._tensor_configs: + tensor_configs[config.tensor] = config + return tensor_configs + + @property + def open_configs(self): + """ + The TensorConfigs which are 'open' meaning they are a Plan input/output but have + 'interior' state. + """ + return frozenset(self._open_configs) + + @property + def output_config(self): + """The TensorConfig of the Plan's output tensor.""" + return self._output_config + + @property + def part_group(self): + """The Parts which are covered by the Plan.""" + return frozenset(self._part_group) + + @property + def interior_region(self): + """The MemoryRegion in which to store 'interior' Plan buffers.""" + return self._interior_region + + @property + def memory_usage(self): + """The interior memory used by the Plan in bytes.""" + return self._memory_usage + + @property + def cycles(self): + """The cycles taken to execute the Plan.""" + return self._cycles + + def __repr__(self): + return ( + f"Plan(tensor_configs={self.tensor_configs}, " + f"open_configs={self.open_configs}, " + f"output_config={self.output_config}, " + f"part_group={self.part_group}, " + f"interior_region={self.interior_region.name}, " + f"memory_usage={self.memory_usage}, " + f"cycles={self.cycles}, " + ) diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py b/python/tvm/contrib/ethosu/cascader/plan_generator.py new file mode 100644 index 000000000000..36e0cf4420ea --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Algorithms to generate Plans for a CascaderGraph.""" +from typing import List, Dict + +from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion + +from . import _ffi_api +from .cascader_options import CascaderOptions +from .plan import Plan +from .stripe_config import StripeConfig +from .graph import CascaderGraph, Part, Tensor + + +def _generate_output_stripe_configs(part: Part, stripe_factors: int) -> List[StripeConfig]: + return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors)) + + +def _generate_single_plans( + part: Part, + output_stripe_configs: List[StripeConfig], + home_map: Dict[Tensor, List[MemoryRegion]], + cascade_region: MemoryRegion, +) -> List[Plan]: + return list(_ffi_api.GenerateSinglePlans(part, output_stripe_configs, home_map, cascade_region)) + + +def _generate_graph_plans( + graph: CascaderGraph, + home_map: Dict[Tensor, List[MemoryRegion]], + options: CascaderOptions, +): + return _ffi_api.GenerateGraphPlans( + graph, + home_map, + options, + ) diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py new file mode 100644 index 000000000000..6787ea4f052e --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tensor config class to hold tensor scheduling information.""" +from typing import List, Union +from enum import IntEnum +import tvm._ffi +from tvm.contrib.ethosu.cascader.stripe_config import StripeConfig + +from tvm.runtime import Object + +from . import _ffi_api +from .stripe_config import StripeConfig +from .graph import Tensor, BufferMode + + +class TensorConfigState(IntEnum): + """ + The 'state' of a TensorConfig as used in the Plan generation algorithm. + + BOUNDARY - Should describe a Plan input/output Tensor. + INTERIOR - Should describe an intermediate Tensor in a 'closed' Plan. + + """ + + BOUNDARY = 0 + INTERIOR = 1 + + +@tvm._ffi.register_object("contrib.ethosu.cascader.MemoryRegion") +class MemoryRegion(Object): + """ + MemoryRegion class to store information about device memories. + + Attributes + ---------- + name : str + The name of the region. + size : int + The size of the region. + read_bandwidth : int + The read bandwidth of the region in bytes per cycle. + write_bandwidth : int + The write bandwidth of the region in bytes per cycle. + + """ + + def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): + self.__init_handle_by_constructor__( + _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth + ) + + +@tvm._ffi.register_object("contrib.ethosu.cascader.TensorConfig") +class TensorConfig(Object): + """ + A class which describes how to realize a Tensor. + + The TensorConfig describes both how a Tensor is scheduled (the order in which it's + produced/consumed) and how its allocated in memory (which region it should reside in + and whether it should be copied). + + Attributes + ---------- + tensor : Tensor + The Tensor the config applies to. + home_region : MemoryRegion + The region where the tensor is allocated. + state : TensorConfigState + The state of the TensorConfig. + + The TensorConfigState is only used as part of the Plan generation algorithm. For a Plan + to be 'closed' (and therefore not subject to any further merging), all the TensorConfigs + that describe Plan input or output Tensors must be in the 'BOUNDARY' state with the rest + being 'INTERIOR'. If any of the input or output tensors are described by an 'INTERIOR' + TensorConfig, then the Plan is 'open' and should be merged with other 'open' Plans until + the result becomes 'closed'. + buffer_mode : BufferMode + The mode in which the buffer should be realized. + + There are multiple buffering strategies by which a tensor may be realized (computed). + These affect the amount of recomputation necessary as well as the size of buffer required + to store the tensor. See 'BufferMode' for a description of the allowable buffering modes. + stripe_configs : List[StringConfig] + The StripeConfigs with which to compute the tensor. + + The StripeConfigs determine the order in which the elements of the tensor should be + computed, including potentially computing them multiple times (recompute). Multiple + StripeConfigs are used over just a single StripeConfig for the case where the tensor is + consumed by two different Parts executing themselves with different StripeConfigs. In this + case, there is a StripeConfig per consumer of the tensor. + copy_tensor : bool, optional + Whether to copy the tensor. + + While a tensor will originally reside in its home region, the TensorConfig may optionally + specify that the tensor should be copied (according to the StripeConfigs) into another + MemoryRegion. As an example for where this may be used, if a weights tensor initially + resides in slow Flash memory then necessarily the home region will be Flash. However, if + the weights values are used multiple times by a Part, it may be more performant to choose + to copy the weights into a faster memory like SRAM. + copy_region : Union[MemoryRegion, None], optional + The region to copy the tensor to. + + """ + + def __init__( + self, + tensor: Tensor, + home_region: MemoryRegion, + state: TensorConfigState, + buffer_mode: BufferMode, + stripe_configs: List[StripeConfig], + copy_tensor: bool = False, + copy_region: Union[MemoryRegion, None] = None, + ): + if copy_region is None: + copy_region = home_region + self.__init_handle_by_constructor__( + _ffi_api.TensorConfig, + tensor, + home_region, + state, + buffer_mode, + stripe_configs, + copy_tensor, + copy_region, + ) + + def get_buffer_size(self): + """ + The size of the buffer needed for the TensorConfig. + + The size of buffer necessary to store a tensor being produced using the TensorConfig is + not necessarily just the size of the tensor. In Plans, a tensor may be being produced and + consumed in 'stripes' which are smaller than the full tensor. Therefore, the buffer + necessary to store the tensor may only need to be as large as the stripe. The precise size + of the buffer will depend both on the BufferMode and StripeConfigs (as well as, of course, + the Tensor). + + """ + return _ffi_api.TensorConfigGetBufferSize(self) + + @property + def tensor(self): + """The Tensor the config applies to.""" + return self._tensor + + @property + def home_region(self): + """The region where the tensor is allocated.""" + return self._home_region + + @property + def state(self): + """The state of the TensorConfig.""" + return TensorConfigState(self._state) + + @property + def buffer_mode(self): + """The mode in which the buffer should be realized.""" + return BufferMode(self._buffer_mode) + + @property + def stripe_configs(self): + """The StripeConfigs with which to compute the tensor.""" + return list(self._stripe_configs) + + @property + def copy_tensor(self): + """Whether to copy the tensor.""" + return bool(self._copy_tensor) + + @property + def copy_region(self): + """The region to copy the tensor to.""" + return self._copy_region + + def __hash__(self): + return self._hash + + def __eq__(self, other): + return _ffi_api.TensorConfigEqual(self, other) + + def __repr__(self): + return ( + f"TensorConfig(tensor={self.tensor}, " + f"home_region={self.home_region.name}, " + f"state={self.state.name}, " + f"buffer_mode={self.buffer_mode.name}, " + f"stripe_configs={self.stripe_configs}, " + f"copy_tensor={self.copy_tensor}, " + f"copy_region={self.copy_region.name}" + ) diff --git a/src/contrib/ethosu/cascader/cascader_options.cc b/src/contrib/ethosu/cascader/cascader_options.cc new file mode 100644 index 000000000000..fb4b07940e2c --- /dev/null +++ b/src/contrib/ethosu/cascader/cascader_options.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "cascader_options.h" + +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void CascaderOptionsNode::VisitAttrs(AttrVisitor* v) { + v->Visit("cascade_region", &cascade_region); + v->Visit("max_proposals", &max_proposals); + v->Visit("stripe_factors", &stripe_factors); + v->Visit("max_plan_size", &max_plan_size); + v->Visit("always_copy_size", &always_copy_size); +} + +CascaderOptions::CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, + int stripe_factors, int max_plan_size, int always_copy_size) { + auto n = make_object(); + n->cascade_region = std::move(cascade_region); + n->max_proposals = max_proposals; + n->stripe_factors = stripe_factors; + n->max_plan_size = max_plan_size; + n->always_copy_size = always_copy_size; + data_ = std::move(n); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.CascaderOptions") + .set_body_typed([](MemoryRegion cascade_region, int max_proposals, int stripe_factors, + int max_plan_size, int always_copy_size) { + return CascaderOptions(cascade_region, max_proposals, stripe_factors, max_plan_size, + always_copy_size); + }); + +TVM_REGISTER_NODE_TYPE(CascaderOptionsNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/cascader_options.h b/src/contrib/ethosu/cascader/cascader_options.h new file mode 100644 index 000000000000..135de784ad3c --- /dev/null +++ b/src/contrib/ethosu/cascader/cascader_options.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/cascader_options.h + * \brief Class to store configuration options for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ + +#include +#include + +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +/*! \brief Node to represent CascaderOptions */ +class CascaderOptionsNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \brief The MemoryRegion to place cascading buffer into. */ + MemoryRegion cascade_region; + /*! \brief The maximum number of Proposals to generate. */ + int max_proposals; + /*! \brief How many striping factors to try per axis. */ + int stripe_factors; + /*! \brief The maximum number of Parts in a Plan. */ + int max_plan_size; + /*! \brief The maximum size of Tensor that will always be copied into the cascade region. */ + int always_copy_size; + + static constexpr const char* _type_key = "contrib.ethosu.cascader.CascaderOptions"; + TVM_DECLARE_FINAL_OBJECT_INFO(CascaderOptionsNode, Object) +}; + +/*! \brief A class to hold configuration options for the cascader. */ +class CascaderOptions : public ObjectRef { + public: + CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, int stripe_factors, + int max_plan_size, int always_copy_size); + + TVM_DEFINE_OBJECT_REF_METHODS(CascaderOptions, ObjectRef, CascaderOptionsNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_CASCADER_OPTIONS_H_ diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc new file mode 100644 index 000000000000..255719088cb0 --- /dev/null +++ b/src/contrib/ethosu/cascader/pareto.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "pareto.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "common.h" +#include "plan.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +template +std::vector GetParetoFrontier(const std::vector>& costs) { + std::vector is_optimal(costs.size(), true); + for (size_t i = 0; i < costs.size(); i++) { + if (is_optimal[i]) { + for (size_t j = 0; j < costs.size(); j++) { + if (is_optimal[j]) { + bool optimal = false; + for (size_t k = 0; k < N; k++) { + if (costs[i][k] > costs[j][k]) { + optimal = true; + break; + } + } + is_optimal[j] = optimal; + } + } + is_optimal[i] = true; + } + } + return is_optimal; +} + +template +std::vector ThinVector(const std::vector& vec, size_t max_size) { + if (max_size < 1) { + return std::vector(); + } + if (vec.size() <= max_size || vec.size() == 0) { + return vec; + } + if (max_size == 1) { + return std::vector{vec[0]}; + } + std::vector thin_vec; + float step = static_cast(vec.size()) / static_cast(max_size - 1); + for (float i = 0; i < vec.size() - 1; i += step) { + thin_vec.push_back(vec[static_cast(i)]); + } + thin_vec.push_back(vec.back()); + return thin_vec; +} + +std::vector ParetoCullPlans(std::vector plans, size_t max_plans) { + if (plans.size() <= max_plans) { + return plans; + } + std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool { + return a->GetMemoryUsage() < b->GetMemoryUsage(); + }); + std::vector> costs; + for (const auto& plan : plans) { + std::array cost = {static_cast(plan->GetMemoryUsage()), + static_cast(plan->GetCycles())}; + costs.emplace_back(cost); + } + std::vector is_optimal = GetParetoFrontier<2>(costs); + std::vector optimal_plans; + size_t i = 0; + for (bool optimal : is_optimal) { + if (optimal) { + optimal_plans.push_back(plans[i]); + } + i++; + } + if (optimal_plans.size() <= max_plans) { + return optimal_plans; + } + return ThinVector(optimal_plans, max_plans); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetParetoFrontier") + .set_body_typed([](Array> tcosts) { + std::vector> costs; + for (const auto& tcost : tcosts) { + ICHECK_EQ(tcost.size(), 2); + std::array point = {static_cast(tcost[0]->value), + static_cast(tcost[1]->value)}; + costs.push_back(point); + } + Array is_optimal; + for (bool opt : GetParetoFrontier<2>(costs)) { + is_optimal.push_back(Bool(opt)); + } + return is_optimal; + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ThinVector") + .set_body_typed([](Array vec, int max_size) { + std::vector vvec(vec.begin(), vec.end()); + return Array(ThinVector(vvec, max_size)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ParetoCullPlans") + .set_body_typed([](Array plans, int max_size) { + std::vector vplans(plans.begin(), plans.end()); + return Array(ParetoCullPlans(vplans, max_size)); + }); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/pareto.h b/src/contrib/ethosu/cascader/pareto.h new file mode 100644 index 000000000000..6b0cd4e294ac --- /dev/null +++ b/src/contrib/ethosu/cascader/pareto.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/pareto.h + * \brief Pareto optimisation functions for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ + +#include +#include + +#include +#include +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class Plan; + +/*! + * \brief Determine the Pareto optimal points. + * \param costs The points as a vector of N-dimensional costs. + * \return A vector that is true where a point is Pareto optimal and false otherwise. + */ +template +std::vector GetParetoFrontier(const std::vector>& costs); + +/*! + * \brief Evenly sample items from a vector to reduce its size. + * \param vec The vector to thin. + * \param max_size The maximum size of the thinned vector. + * \return The thinned vector. + */ +template +std::vector ThinVector(const std::vector& vec, size_t max_size); + +/*! + * \brief Cull plans which are not Pareto optimal then thin them down. + * \param plans The plans to apply the Pareto culling to. + * \param max_plans The maximum number of plans after the culling. + * \return The culled plans. + * \note Plan Pareto-optimality is determined based upon a Plan's memory_usage + * and cycles. + */ +std::vector ParetoCullPlans(std::vector plans, size_t max_plans); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PARETO_H_ diff --git a/src/contrib/ethosu/cascader/plan.cc b/src/contrib/ethosu/cascader/plan.cc new file mode 100644 index 000000000000..173b3f9e8d20 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan.cc @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "plan.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "graph.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void PlanNode::VisitAttrs(AttrVisitor* v) { + Array tmp_arr(tensor_configs_); + v->Visit("_tensor_configs", &tmp_arr); + Array tmp_cfgs(open_configs_.begin(), open_configs_.end()); + v->Visit("_open_configs", &tmp_cfgs); + v->Visit("_output_config", &output_config_); + Array tmp_parts(part_group_.begin(), part_group_.end()); + v->Visit("_part_group", &tmp_parts); + v->Visit("_interior_region", &interior_region_); + v->Visit("_memory_usage", &memory_usage_); + v->Visit("_cycles", &cycles_); +} + +Plan::Plan(const std::vector& tensor_configs, + const std::vector& open_configs, const TensorConfig& output_config, + const std::vector& part_group, const MemoryRegion& interior_region, + int memory_usage, int cycles) { + auto n = make_object(); + n->tensor_configs_ = std::move(tensor_configs); + n->open_configs_ = std::move(open_configs); + n->output_config_ = std::move(output_config); + n->part_group_ = std::move(part_group); + n->interior_region_ = interior_region; + n->memory_usage_ = memory_usage; + n->cycles_ = cycles; + data_ = std::move(n); +} + +Plan Plan::Merge(const Plan& other) const { + auto n = make_object(*this->operator->()); + n->tensor_configs_.insert(n->tensor_configs_.end(), other->tensor_configs_.begin(), + other->tensor_configs_.end()); + n->open_configs_.erase( + std::remove(n->open_configs_.begin(), n->open_configs_.end(), (*this)->output_config_), + n->open_configs_.end()); + for (const auto& config : other->open_configs_) { + if (config->GetTensor() != (*this)->output_config_->GetTensor()) { + n->open_configs_.push_back(config); + } + } + n->output_config_ = other->output_config_; + n->part_group_.insert(n->part_group_.end(), other->part_group_.begin(), other->part_group_.end()); + std::sort(n->part_group_.begin(), n->part_group_.end()); + n->memory_usage_ += other->memory_usage_; + n->cycles_ += other->cycles_; + return Plan(n); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.Plan") + .set_body_typed([](Array tensor_configs, Array open_configs, + TensorConfig output_config, Array part_group, + MemoryRegion interior_region, int memory_usage, int cycles) { + std::vector vtensor_configs(tensor_configs.begin(), tensor_configs.end()); + std::vector sopen_configs(open_configs.begin(), open_configs.end()); + std::vector spart_group(part_group.begin(), part_group.end()); + return Plan(vtensor_configs, sopen_configs, output_config, spart_group, interior_region, + memory_usage, cycles); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.PlanMerge").set_body_method(&Plan::Merge); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.PlanMergeBenchmark") + .set_body_typed([](Plan plan, Plan other, int repeats) { + for (int i = 0; i < repeats; i++) { + plan.Merge(other); + } + return plan.Merge(other); + }); + +TVM_REGISTER_NODE_TYPE(PlanNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/plan.h b/src/contrib/ethosu/cascader/plan.h new file mode 100644 index 000000000000..65efe98e4ff5 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan.h @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/plan.h + * \brief Plan object for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ + +#include +#include + +#include +#include +#include +#include + +#include "graph.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +/*! \brief Node to represent a Plan */ +class PlanNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \return The TensorConfigs specified by the Plan */ + const std::vector& GetTensorConfigs() const { return tensor_configs_; } + /*! \return The TensorConfigs which are 'open' meaning they are a Plan input/output but have + * INTERIOR state */ + const std::vector& GetOpenConfigs() const { return open_configs_; } + /*! \return The TensorConfig of the Plan's output tensor */ + const TensorConfig GetOutputConfig() const { return output_config_; } + /*! \return The Parts which are covered by the Plan */ + const std::vector& GetPartGroup() const { return part_group_; } + /*! \return The memory region in which to store interior Plan buffers */ + MemoryRegion const GetInteriorRegion() const { return interior_region_; } + /*! + * \return The interior memory used by the Plan in bytes. + * \note The interior memory usage is defined as being the memory required in the interior region + * to execute the Plan excluding input and output buffers. + */ + int GetMemoryUsage() const { return memory_usage_; } + /*! \return The cycles taken to execute the Plan */ + int GetCycles() const { return cycles_; } + /*! \return Whether the Plan is 'closed' meaning it has no 'open' TensorConfigs */ + bool IsClosed() const { return open_configs_.size() == 0; } + + static constexpr const char* _type_key = "contrib.ethosu.cascader.Plan"; + TVM_DECLARE_FINAL_OBJECT_INFO(PlanNode, Object); + + protected: + friend class Plan; + + /*! \brief The TensorConfigs specified by the Plan */ + std::vector tensor_configs_; + /*! \brief The TensorConfigs which are 'open' meaning they are a Plan input/output but have + * INTERIOR state */ + std::vector open_configs_; + /*! \brief The TensorConfig of the Plan's output tensor */ + TensorConfig output_config_; + /*! \brief The Parts which are covered by the Plan */ + std::vector part_group_; + /*! \brief The memory region in which to store interior Plan buffers */ + MemoryRegion interior_region_; + /*! \brief The interior memory used by the Plan in bytes */ + int memory_usage_; + /*! \brief The cycles taken to execute the Plan */ + int cycles_; +}; + +/*! + * \brief A class which describes how to schedule a subgraph of Parts together. + * \note A Plan takes the form of a subgraph of connected Parts (recorded in part_group) with + * TensorConfigs for all of the required Tensors (recorded in tensor_configs). This information can + * be used to produce a Tensor Expression schedule with inter-operator scheduling. A Plan is + * necessarily single-output such that all non-output Parts are 'computed_at'ed the scope of the + * output Part. This is what achieves the technique referred to as 'cascading'. A Plan also has an + * interior memory region which specifies the region of memory into which all the Plans intermediate + * buffers should be allocated. + * + * Additionally, a Plan contains some other information used during the Plan generation and + * selection algorithms. Both the memory and cycles required to run the Plan are accounted for so + * that Plans can be ranked and Pareto-culled on these metrics. Furthermore, the TensorConfigs which + * are 'open' is recorded indicating that these are valid points to merge with another Plan. A Plan + * can only be turned into a schedule if it has no 'open' TensorConfigs - at which point the Plan is + * said to be 'closed'. + */ +class Plan : public ObjectRef { + public: + Plan(const std::vector& tensor_configs, + const std::vector& open_configs, const TensorConfig& output_config, + const std::vector& part_group, const MemoryRegion& interior_region, int memory_usage, + int cycles); + /*! + * \brief Merge two Plans which share an 'open' TensorConfig. + * \param other The Plan to merge with. + * \return The merged Plan. + * \note The current Plan is referred to as the 'upper Plan' and the other Plan as the 'lower + * Plan'. The 'open' output config of the upper Plan must be an 'open' input config of the lower + * Plan. The Tensor referenced by these configs is the Tensor on which the two Plans will be + * merged. The merge process does the following: + * + * The tensor config maps will be merged with TensorConfigs from the upper Plan taking priority. + * The open configs will be merged with the TensorConfigs that are being merged having been + * removed. The output config will be that of the lower Plan. The part groups will be merged. The + * interior region is necessarily the same for both the upper and lower Plan. The cycles and + * memory usage will be summed. + */ + Plan Merge(const Plan& other) const; + + TVM_DEFINE_OBJECT_REF_METHODS(Plan, ObjectRef, PlanNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +// Hash functions TensorConfig and Part sets +namespace std { + +using TensorConfigSet = std::vector<::tvm::contrib::ethosu::cascader::TensorConfig>; +using PartSet = std::vector<::tvm::contrib::ethosu::cascader::Part>; + +template <> +struct hash { + std::size_t operator()(const TensorConfigSet& tensor_config_set) const { + size_t seed = 0; + for (const auto& tensor_config : tensor_config_set) { + seed ^= hash<::tvm::contrib::ethosu::cascader::TensorConfig>()(tensor_config); + } + return seed; + } +}; + +template <> +struct equal_to { + bool operator()(const TensorConfigSet& lhs, const TensorConfigSet& rhs) const { + std::unordered_set<::tvm::contrib::ethosu::cascader::TensorConfig> lh_set(lhs.begin(), + lhs.end()); + std::unordered_set<::tvm::contrib::ethosu::cascader::TensorConfig> rh_set(rhs.begin(), + rhs.end()); + return lh_set == rh_set; + } +}; + +template <> +struct hash { + std::size_t operator()(const PartSet& part_set) const { + size_t seed = 0; + for (const auto& part : part_set) { + seed ^= tvm::runtime::ObjectHash()(part); + } + return seed; + } +}; + +template <> +struct equal_to { + bool operator()(const PartSet& lhs, const PartSet& rhs) const { return lhs == rhs; } +}; + +} // namespace std + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PLAN_H_ diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc new file mode 100644 index 000000000000..9acffb7e9479 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -0,0 +1,529 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "plan_generator.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cascader_options.h" +#include "common.h" +#include "graph.h" +#include "pareto.h" +#include "plan.h" +#include "stripe_config.h" +#include "tensor_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +template +std::vector> EnumerateCombinations(std::vector> values) { + if (values.size() == 0) { + return values; + } + if (values.size() == 1) { + std::vector> combs; + for (const auto& value : values[0]) { + combs.push_back(std::vector(1, value)); + } + return combs; + } + auto combs = EnumerateCombinations(std::vector>(values.begin(), values.end() - 1)); + std::vector> new_combs; + for (const auto& value : values.back()) { + for (const auto& comb : combs) { + auto new_comb = std::vector(comb); + new_comb.push_back(value); + new_combs.push_back(new_comb); + } + } + return new_combs; +} + +std::vector GetCascadableAxes(const Part& part) { + std::vector cascadable_axes(part->GetOutputTensor()->GetShape().size()); + // Check all the propagators to see if an output axis is projected into any + // of the inputs. If it is, then that axis is cascadable. + for (const auto& propagator : part->GetPropagators()) { + auto transform = propagator->GetTransform(); + for (size_t i = 0; i < transform.size(); i++) { + for (size_t j = 0; j < transform[0].size() - 1; j++) { + // An axis is projected if there's a non-zero element + // in the transform matrix + if (transform[i][j] != 0) { + cascadable_axes[j] = true; + } + } + } + } + return cascadable_axes; +} + +std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors) { + // If stripe_factors is <= 0, then we won't produce any StripeConfigs + if (stripe_factors <= 0) { + return std::vector(); + } + // Work out the factors to divide by as inverse powers of 2. + // The last factor is always reserved to be '0' which will correspond to + // choosing a stripe size of 1 in the dimension. We always include this + // as it represents the most extreme striping choice that uses the least + // memory, so it is our choice of last resort. + // For example, if stripe_factors = 4 then the factors are 1, 1/2, 1/4, 0. + std::vector factors; + for (size_t i = 0; i < static_cast(stripe_factors) - 1; i++) { + factors.push_back(1.0f / (std::pow(2.0f, i))); + } + factors.push_back(0); + // Then use the factors to derive the possible ways to split each dimension + // into stripes. As an example, if an had extent 128 then by applying + // the factors derived above we get the following possible splits for that axis: + // 128, 64, 32, 1 + std::vector> splits; + std::vector output_shape = part->GetOutputTensor()->GetShape(); + size_t output_dims = output_shape.size(); + // Only bother striping along the axes which are cascadable + auto cascadable_axes = GetCascadableAxes(part); + for (size_t i = 0; i < output_dims; i++) { + auto axis = output_shape[i]; + auto axis_align = part->GetStripeAlignHint()[i]; + std::set axis_splits; // Note this is a set to remove duplicate splits + if (!cascadable_axes[i]) { + axis_splits.insert(axis); + } else { + for (float factor : factors) { + int split = + std::max(static_cast(std::ceil(axis * factor / axis_align)), 1) * axis_align; + split = std::min(axis, split); + axis_splits.insert(split); + } + } + splits.push_back(std::vector(axis_splits.begin(), axis_splits.end())); + } + // Now calculate all the possible combinations of splits for each dimension + // to give us all the possible stripe shapes. For example, if we had two axes + // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be: + // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1) + auto stripe_shapes = EnumerateCombinations(splits); + auto offset = std::vector(output_dims); + std::vector stripe_configs; + // Calculate the possible axis orderings such that each axis has the opportunity + // to be the 'outermost' axis (which is axis that is chosen for rolling). + std::vector> orders; + for (size_t i = 0; i < output_dims; i++) { + std::vector order(output_dims); + for (size_t j = 0; j < output_dims; j++) { + order[j] = 1 + (j + i) % output_dims; + } + orders.push_back(order); + } + // Finally, create the StripeConfigs from the possible stripe shapes and orders + for (const auto& stripe_shape : stripe_shapes) { + std::vector stripes; + std::vector strides; + for (size_t i = 0; i < output_dims; i++) { + stripes.push_back(std::ceil(static_cast(output_shape[i]) / stripe_shape[i])); + strides.push_back(static_cast(stripe_shape[i])); // strides = stripe_shape + } + // If the stripe shape equals the output shape (i.e. there's no striping), then + // the order doesn't matter, so just pick the first order and continue. + if (stripe_shape == output_shape) { + stripe_configs.push_back( + StripeConfig(stripe_shape, output_shape, strides, orders[0], stripes, offset)); + continue; + } + for (const auto& order : orders) { + // Some logic to avoid having an axis be the 'outermost' if the stripe is full + // size in that axis. This would otherwise be a waste because we can't roll + // over an axis that hasn't been split. + bool skip = false; + for (size_t i = 0; i < output_dims; i++) { + if (order[i] == 1 && stripe_shape[i] == output_shape[i]) { + skip = true; + break; + } + } + if (skip) continue; + stripe_configs.push_back( + StripeConfig(stripe_shape, output_shape, strides, order, stripes, offset)); + } + } + return stripe_configs; +} + +std::vector GetPossibleInputConfigs(const StripeConfig& stripe_config, + const Tensor& tensor, + const std::vector& home_regions, + const CascaderOptions& options) { + std::vector configs; + for (const auto& home_region : home_regions) { + // Boundary configs + if (home_region == options->cascade_region || tensor->GetSize() > options->always_copy_size) { + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); + } + if (home_region != options->cascade_region) { + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::ROLLING, {stripe_config}, true, + options->cascade_region)); + } + } + if (!tensor->IsConstant()) { + // Interior configs + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::RECOMPUTE, {stripe_config}, false, + options->cascade_region)); + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::ROLLING, {stripe_config}, false, + options->cascade_region)); + } + return configs; +} + +// Check whether a StripeConfig can be an output boundary config +bool CanBound(const StripeConfig& stripe_config) { + // Determine whether the StripeConfig results in non-overlapping stripes + // which is the case when the stripe shape equals the strides + for (size_t i = 0; i < stripe_config->GetShape().size(); i++) { + // Check that the stripe shape and strides are equal + if (stripe_config->GetShape()[i] - stripe_config->GetStrides()[i] != 0) { + return false; + } + } + return true; +} + +std::vector GetPossibleOutputConfigs(const StripeConfig& stripe_config, + const Tensor& tensor, + const std::vector& home_regions, + const CascaderOptions& options) { + std::vector configs; + // Only StripeConfigs with non-overlapping stripes can be output boundary configs + if (CanBound(stripe_config)) { + for (const auto& home_region : home_regions) { + // Boundary configs + configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, + BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); + } + } + // Interior configs + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::RECOMPUTE, {stripe_config}, false, + options->cascade_region)); + configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, + BufferMode::ROLLING, {stripe_config}, false, + options->cascade_region)); + return configs; +} + +int GetInteriorMemoryUsage(const std::vector& input_configs, + const TensorConfig& output_config, const MemoryRegion& interior_region) { + int memory_usage = 0; + if (output_config->GetHomeRegion() == interior_region && + output_config->GetState() == TensorConfigState::BOUNDARY) { + memory_usage += output_config->GetTensor()->GetSize(); + } + for (const auto& input_config : input_configs) { + if (input_config->GetHomeRegion() == interior_region && + input_config->GetState() == TensorConfigState::BOUNDARY) { + memory_usage += input_config->GetTensor()->GetSize(); + } else if (input_config->GetHomeRegion() == interior_region || + input_config->GetCopyRegion() == interior_region) { + memory_usage += input_config->GetBufferSize(); + } + } + return memory_usage; +} + +std::vector GenerateSinglePlans( + const Part& part, const std::vector& output_stripe_configs, + const std::unordered_map, ObjectPtrHash, ObjectPtrEqual>& + home_map, + const CascaderOptions& options) { + std::vector plans; + std::vector part_group{part}; + // Create a selection of Plans per output_stripe_config + for (const auto& output_stripe_config : output_stripe_configs) { + // Calculate the input_stripe_configs + auto input_stripe_configs = part->CalculateInputStripeConfigs(output_stripe_config); + // From the input_stripe_configs, now derive all the possible input TensorConfigs + std::vector> all_possible_input_configs; + size_t i = 0; + for (const auto& stripe_config : input_stripe_configs) { + Tensor tensor = part->GetInputTensors()[i]; + all_possible_input_configs.push_back( + GetPossibleInputConfigs(stripe_config, tensor, home_map.at(tensor), options)); + i++; + } + // Now work out all the possible combinations of input TensorConfigs + auto input_config_combinations = + EnumerateCombinations(all_possible_input_configs); + Tensor output_tensor = part->GetOutputTensor(); + // Then determine the possible output TensorConfigs (no combinations here because there's only + // one output) + auto output_configs = GetPossibleOutputConfigs(output_stripe_config, output_tensor, + home_map.at(output_tensor), options); + // Calculate the performance information for the output_stripe_config for both the recompute and + // rolling cases + PerformanceInfo rolling_perf = + part->GetPerformanceInfo(output_stripe_config, BufferMode::ROLLING); + PerformanceInfo recompute_perf = + part->GetPerformanceInfo(output_stripe_config, BufferMode::RECOMPUTE); + // For all the possible input TensorConfig combinations + for (const auto& input_configs : input_config_combinations) { + std::vector tensor_configs; + std::vector open_input_configs; + // Add the input TensorConfigs to the 'tensor_configs' and + // record which input TensorConfigs are 'open' (i.e. 'INTERIOR') + for (const auto& input_config : input_configs) { + tensor_configs.push_back(input_config); + if (input_config->GetState() == TensorConfigState::INTERIOR) { + open_input_configs.push_back(input_config); + } + } + for (const auto& output_config : output_configs) { + // Add the output TensorConfig to the tensor_configs and to + // the open configs (if it's 'INTERIOR') + tensor_configs.push_back(output_config); + std::vector open_configs = open_input_configs; + if (output_config->GetState() == TensorConfigState::INTERIOR) { + open_configs.push_back(output_config); + } + int bandwidth_cycles = 0; + int compute_cycles = 0; + int mem2mem_cycles = 0; + + // Pick the correct performance info based on the BufferMode + PerformanceInfo perf_info; + if (output_config->GetBufferMode() == BufferMode::RECOMPUTE) { + perf_info = recompute_perf; + } else { + perf_info = rolling_perf; + } + // Calculate the bandwidth cycles by multiplying the bytes read/written by the + // bandwidth of the memories + for (size_t i = 0; i < input_configs.size(); i++) { + bandwidth_cycles += + perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; + if (input_configs[i]->DoCopy()) { + // This Tensor needs to be copied - Count stripes for this config + Tensor tensor = input_configs[i]->GetTensor(); + for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { + std::map, int> input_blocks = CountStripes(stripe_config, true); + for (const auto& block : input_blocks) { + int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * + tensor->GetCompressionRatio() * block.second; + int read_cycles = + bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; + int write_cycles = + bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth; + mem2mem_cycles += std::max(read_cycles, write_cycles); + } + } + } + } + bandwidth_cycles += + perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; + compute_cycles = perf_info->compute_cycles; + // Take the max of compute and bandwidth cycles as we assume compute cycles + // can hide memory latency + int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); + int memory_usage = + GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); + plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, + options->cascade_region, memory_usage, cycles)); + } + } + } + return plans; +} + +std::unordered_map, std::vector> GenerateGraphPlans( + const CascaderGraph& graph, + const std::unordered_map, ObjectPtrHash, ObjectPtrEqual>& + home_map, + const CascaderOptions& options) { + ICHECK_GT(options->stripe_factors, 0) + << "stripe_factors = " << options->stripe_factors << ", but must be > 0"; + ICHECK_GT(options->max_plan_size, 0) + << "max_plan_size = " << options->max_plan_size << ", but must be > 0"; + // Define a map between the graph Tensors and possible StripeConfigs that the Tensor may be + // executed with + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + stripe_configs_by_tensor; + // Define a map between a given open TensorConfig and all the Plans which provide it + std::unordered_map> plans_by_config; + // Define a map between a group of connected Parts and all the closed plans covering them + std::unordered_map, std::vector> closed_plans; + // Define a nested map which indexes open plans by both Part group and the open TensorConfigs they + // provide. Note that we index in this way because Part group + open TensorConfigs combined + // defines a group of Plans which can be mutually Pareto culled. If we culled of Part group alone, + // we'd lose potentially valuable open Plans which could have gone on to be grown into Pareto + // optimal closed plans. + std::unordered_map, + std::unordered_map, std::vector>> + open_plans; + // Traverse the graph in a reverse topological order (should be enforced by GetPartOrder) + for (const auto& part : graph->GetPartOrder()) { + // First generate all the possible StripeConfigs for the Part assuming that it will become the + // output of a Plan. The number generated is a function of stripe_factors and the number of + // cascadable dimensions in the Part. + std::vector stripe_configs = + GenerateOutputStripeConfigs(part, options->stripe_factors); + // Check to see if the output Tensor is part of any existing open Plans + if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) { + // If there are other open Plans which have this Part's output Tensor as an input, then + // additionally consider the StripeConfigs of those open TensorConfigs so that we have the + // option to merge into those open Plans. + const std::set& connecting_configs = + stripe_configs_by_tensor.at(part->GetOutputTensor()); + std::copy(connecting_configs.begin(), connecting_configs.end(), + std::back_inserter(stripe_configs)); + } + // Generate all the single Part Plans for the previously determined StripeConfigs + auto single_part_plans = GenerateSinglePlans(part, stripe_configs, home_map, options); + std::vector plans; + for (const auto& partial_plan : single_part_plans) { + // If the output TensorConfig of the Plan is 'INTERIOR', then it must be merged with + // another open Plan + if (partial_plan->GetOutputConfig()->GetState() == TensorConfigState::INTERIOR) { + if (plans_by_config.find(partial_plan->GetOutputConfig()) != plans_by_config.end() && + partial_plan->GetOutputConfig()->GetTensor()->GetConsumers().size() == 1) { + // Search for all the open Plans which require the same TensorConfig + const auto& join_plans = plans_by_config.at(partial_plan->GetOutputConfig()); + for (const auto& join_plan : join_plans) { + // Only merge to form a new Plan if the resulting Plan size won't exceed the + // max_plan_size + if (join_plan->GetPartGroup().size() < static_cast(options->max_plan_size)) { + if (partial_plan->GetMemoryUsage() + join_plan->GetMemoryUsage() < + options->cascade_region->size) { + plans.push_back(partial_plan.Merge(join_plan)); + } + } + } + } + } else { + // If the single Part Plan had a 'BOUNDARY' output TensorConfig, then it doesn't need + // merging and can stand on its own. + plans.push_back(partial_plan); + } + } + // For all the newly created Plans, update the various maps + std::unordered_set> new_part_groups; + for (const auto& plan : plans) { + new_part_groups.insert(plan->GetPartGroup()); + if (plan->IsClosed()) { + closed_plans[plan->GetPartGroup()].push_back(plan); + } else { + open_plans[plan->GetPartGroup()][plan->GetOpenConfigs()].push_back(plan); + } + } + // Now Pareto cull both the open and closed Plans to remove non-optimal Plans + // Additionally, once culled we update another two maps, the stripe_configs_by_tensor + // and plans_by_config maps. + for (const auto& part_group : new_part_groups) { + if (closed_plans.find(part_group) != closed_plans.end()) { + closed_plans[part_group] = ParetoCullPlans(closed_plans.at(part_group), 32); + } + for (const auto& it : open_plans[part_group]) { + auto pareto_plans = ParetoCullPlans(it.second, 8); + for (const auto& plan : pareto_plans) { + for (const auto& open_config : plan->GetOpenConfigs()) { + if (open_config != plan->GetOutputConfig()) { + for (const auto& stripe_config : open_config->GetStripeConfigs()) { + // Only add a StripeConfig if it contains for than one stripe + if (mul_reduce(stripe_config->GetStripes()) > 1) { + stripe_configs_by_tensor[open_config->GetTensor()].insert(stripe_config); + } + } + plans_by_config[open_config].push_back(plan); + } + } + } + } + } + } + return closed_plans; +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs") + .set_body_typed([](Part part, int stripe_factors) { + if (stripe_factors < 0) { + return Array(); + } + return Array(GenerateOutputStripeConfigs(part, stripe_factors)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans") + .set_body_typed([](Part part, Array output_stripe_configs, + Map> home_map, CascaderOptions options) { + std::vector voutput_stripe_configs(output_stripe_configs.begin(), + output_stripe_configs.end()); + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + mhome_map; + for (const auto& it : home_map) { + std::vector home_regions; + for (const auto& i : it.second) { + home_regions.push_back(i); + } + mhome_map[it.first] = home_regions; + } + return Array(GenerateSinglePlans(part, voutput_stripe_configs, mhome_map, options)); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans") + .set_body_typed([](CascaderGraph graph, Map> home_map, + CascaderOptions options) { + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> + mhome_map; + for (const auto& it : home_map) { + std::vector home_regions; + for (const auto& i : it.second) { + home_regions.push_back(i); + } + mhome_map[it.first] = home_regions; + } + auto closed_plans = GenerateGraphPlans(graph, mhome_map, options); + Map, Array> tclosed_plans; + for (auto& it : closed_plans) { + Array part_arr(it.first.begin(), it.first.end()); + Array plan_arr(it.second); + tclosed_plans.Set(part_arr, plan_arr); + } + return tclosed_plans; + }); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/plan_generator.h b/src/contrib/ethosu/cascader/plan_generator.h new file mode 100644 index 000000000000..947728addfd1 --- /dev/null +++ b/src/contrib/ethosu/cascader/plan_generator.h @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/plan_generator.h + * \brief Algorithm to generate possible Plans in the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ + +#include +#include + +#include +#include +#include + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class CascaderGraph; +class MemoryRegion; +class Part; +class Tensor; +class StripeConfig; +class Plan; +class CascaderOptions; + +using HomeMap = + std::unordered_map, ObjectPtrHash, ObjectPtrEqual>; + +/*! + * \brief Generate possible output StripeConfigs that could be applied to a Part's output. + * \param part The Part to generate StripeConfigs for. + * \param stripe_factors How many striping factors to try per axis. + * \return The generated StripeConfigs for the Part's output. + */ +std::vector GenerateOutputStripeConfigs(const Part& part, int stripe_factors); + +/*! + * \brief Generate single-Part Plans for a Part for a given list of output StripeConfigs. + * \param part The Part to generate Plans for. + * \param output_stripe_configs The output StripeConfigs to generate Plans with. + * \param home_map The Tensor homing map defining valid memory homes for Tensors. + * \param options The configuration options with which to run the generator. + * \return The generated Plans covering the Part. + * \note For each of the output StripeConfigs provided, this algorithm will produce a number + * of Plans corresponding to different choices of Tensor homing/copying, buffer modes + * and INTERIOR/BOUNDARY states. For each of these variants, the Part's performance will + * be queried and the memory usage will be calculated. + */ +std::vector GenerateSinglePlans(const Part& part, + const std::vector& output_stripe_configs, + const HomeMap& home_map, const CascaderOptions& options); + +/*! + * \brief Generate pareto optimal Plans for a Graph. + * \param graph The Graph to generate Plans for. + * \param home_map The Tensor homing map defining valid memory homes for Tensors. + * \param options The configuration options with which to run the generator. + * \return A map between Part groups and a list of pareto optimal Plans which cover that group. + * \note This algorithm does the following: + * + * Iterate Part-by-Part in a reversed topological ordering (starting at the output Parts and + * working towards the input Parts). + * + * For each Part: + * 1. Determine the possible StripeConfigs we might want to use to stripe the Part using + * GenerateOutputStripeConfigs. + * 2. Additionally, collect all the StripeConfigs of open Plans that could connect to this + * Part (i.e. the Plan has an open TensorConfig for the Part's output Tensor). + * 3. Use these two lists of StripeConfigs to produce single Part Plans with GenerateSinglePlans. + * 4. For the generated Plans that have an open output TensorConfig, try and merge these into + * existing Plans which share an open input TensorConfig. + * 5. All Plans are then indexed by both the Part group they cover and their open TensorConfigs. + * 6. Plans which cover the same Part group and share the same open TensorConfigs are culled + * using ParetoCullPlans. + * + * Once every Part has been visited, return the Plans with no open TensorConfigs indexed by Part + * group. + */ +std::unordered_map, std::vector> GenerateGraphPlans( + const CascaderGraph& graph, const HomeMap& home_map, const CascaderOptions& options); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_PLAN_GENERATOR_H_ diff --git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc new file mode 100644 index 000000000000..5e60f522fe4e --- /dev/null +++ b/src/contrib/ethosu/cascader/tensor_config.cc @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "tensor_config.h" + +#include +#include +#include + +#include +#include +#include + +#include "common.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +void MemoryRegionNode::VisitAttrs(AttrVisitor* v) { + v->Visit("name", &name); + v->Visit("size", &size); + v->Visit("read_bandwidth", &read_bandwidth); + v->Visit("write_bandwidth", &write_bandwidth); +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion") + .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) { + return MemoryRegion(name, size, read_bandwidth, write_bandwidth); + }); + +TVM_REGISTER_NODE_TYPE(MemoryRegionNode); + +void TensorConfigNode::VisitAttrs(AttrVisitor* v) { + v->Visit("_tensor", &tensor_); + v->Visit("_home_region", &home_region_); + int state = static_cast(state_); + v->Visit("_state", &state); + int buffer_mode = static_cast(buffer_mode_); + v->Visit("_buffer_mode", &buffer_mode); + Array tmp_arr(stripe_configs_); + v->Visit("_stripe_configs", &tmp_arr); + v->Visit("_copy_tensor", ©_tensor_); + v->Visit("_copy_region", ©_region_); + int64_t tmp_hash = static_cast(hash_); + v->Visit("_hash", &tmp_hash); +} + +int TensorConfigNode::GetBufferSize() const { + if (buffer_mode_ == BufferMode::RECOMPUTE) { + return GetRecomputeBufferSize_(); + } else { + return GetRollingBufferSize_(); + } +} + +void TensorConfigNode::ComputeHash_() { + hash_ = ObjectHash()(tensor_); + hash_combine(&hash_, std::hash()(home_region_->name)); + hash_combine(&hash_, std::hash()(static_cast(state_))); + hash_combine(&hash_, std::hash()(static_cast(buffer_mode_))); + hash_combine(&hash_, hash_vector(stripe_configs_)); + hash_combine(&hash_, std::hash()(copy_tensor_)); + hash_combine(&hash_, std::hash()(copy_region_->name)); +} + +int TensorConfigNode::GetRecomputeBufferSize_() const { + size_t buffer_size = 0; + for (const auto& stripe_config : stripe_configs_) { + buffer_size += mul_reduce(stripe_config->GetShape()); + } + return buffer_size * tensor_->GetDataType().bytes() * tensor_->GetCompressionRatio(); +} + +int TensorConfigNode::GetRollingBufferSize_() const { + int buffer_size = 0; + for (const auto& stripe_config : stripe_configs_) { + int rolling_axis = -1; + for (size_t i = 0; i < stripe_config->GetOrder().size(); i++) { + // The axis must be striped (> 1 stripes) and ordered (order != 0) + if (stripe_config->GetStripes()[i] > 1 && stripe_config->GetOrder()[i] != 0) { + // If we've yet to find a possible rolling axis, use this one + if (rolling_axis == -1) { + rolling_axis = i; + continue; + } + // Otherwise, replace the rolling axis if the current axis has an earlier order + if (stripe_config->GetOrder()[i] < stripe_config->GetOrder()[rolling_axis]) { + rolling_axis = i; + } + } + } + // If we didn't find a rolling axis, just use axis 0 + if (rolling_axis == -1) { + rolling_axis = 0; + } + int rolling_size = 1; + for (size_t i = 0; i < tensor_->GetShape().size(); i++) { + if (static_cast(i) == rolling_axis) { + rolling_size *= stripe_config->GetShape()[i]; + } else { + rolling_size *= tensor_->GetShape()[i]; + } + } + buffer_size += rolling_size; + } + return buffer_size * tensor_->GetDataType().bytes() * tensor_->GetCompressionRatio(); +} + +TensorConfig::TensorConfig(const Tensor& tensor, const MemoryRegion& home_region, + TensorConfigState state, BufferMode buffer_mode, + const std::vector& stripe_configs, bool copy_tensor, + const MemoryRegion& copy_region) { + auto n = make_object(); + n->tensor_ = std::move(tensor); + n->home_region_ = std::move(home_region); + n->state_ = state; + n->buffer_mode_ = buffer_mode; + n->stripe_configs_ = std::move(stripe_configs); + n->copy_tensor_ = copy_tensor; + n->copy_region_ = std::move(copy_region); + n->ComputeHash_(); + data_ = std::move(n); +} + +inline bool TensorConfig::operator==(const TensorConfig& other) const { + if (get() == other.get()) return true; + if (get() == nullptr || other.get() == nullptr) return false; + if ((*this)->tensor_ == other->tensor_ && (*this)->home_region_ == other->home_region_ && + (*this)->state_ == other->state_ && (*this)->buffer_mode_ == other->buffer_mode_ && + (*this)->stripe_configs_ == other->stripe_configs_ && + (*this)->copy_tensor_ == other->copy_tensor_ && + (*this)->copy_region_ == other->copy_region_) { + return true; + } + return false; +} + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfig") + .set_body_typed([](Tensor tensor, MemoryRegion home_region, int state, int buffer_mode, + Array stripe_configs, bool copy_tensor, + MemoryRegion copy_region) { + TensorConfigState estate = static_cast(state); + BufferMode ebuffer_mode = static_cast(buffer_mode); + std::vector vstripe_configs(stripe_configs.begin(), stripe_configs.end()); + return TensorConfig(tensor, home_region, estate, ebuffer_mode, vstripe_configs, copy_tensor, + copy_region); + }); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfigEqual") + .set_body_method(&TensorConfig::operator==); + +TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.TensorConfigGetBufferSize") + .set_body_method(&TensorConfigNode::GetBufferSize); + +TVM_REGISTER_NODE_TYPE(TensorConfigNode); + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm diff --git a/src/contrib/ethosu/cascader/tensor_config.h b/src/contrib/ethosu/cascader/tensor_config.h new file mode 100644 index 000000000000..6a37f76ce085 --- /dev/null +++ b/src/contrib/ethosu/cascader/tensor_config.h @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/contrib/ethosu/cascader/tensor_config.h + * \brief TensorConfig object for the NPU cascader + */ +#ifndef TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ +#define TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ + +#include +#include + +#include +#include +#include +#include + +#include "graph.h" +#include "stripe_config.h" + +namespace tvm { +namespace contrib { +namespace ethosu { +namespace cascader { + +class MemoryRegionNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \brief The name of the region */ + std::string name; + /*! \brief The size of the region */ + int size; + /*! \brief The read bandwidth of the region in bytes per cycle */ + int read_bandwidth; + /*! \brief The write bandwidth of the region in bytes per cycle */ + int write_bandwidth; + + static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion"; + TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object) +}; + +class MemoryRegion : public ObjectRef { + public: + MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) { + auto n = make_object(); + n->name = name; + n->size = size; + n->read_bandwidth = read_bandwidth; + n->write_bandwidth = write_bandwidth; + data_ = std::move(n); + } + + TVM_DEFINE_OBJECT_REF_METHODS(MemoryRegion, ObjectRef, MemoryRegionNode); +}; + +/*! \brief The 'state' of a TensorConfig as used in the Plan generation algorithm. + * BOUNDARY - Should describe a Plan input/output Tensor. + * INTERIOR - Should describe an intermediate Tensor in a 'closed' Plan. + */ +enum TensorConfigState { BOUNDARY, INTERIOR }; + +/*! \brief Node to represent a TensorConfig */ +class TensorConfigNode : public Object { + public: + void VisitAttrs(AttrVisitor* v); + + /*! \return The Tensor the config applies to */ + const Tensor GetTensor() const { return tensor_; } + /*! \return The region where the tensor is allocated */ + MemoryRegion GetHomeRegion() const { return home_region_; } + /*! + * \return The state of the TensorConfig. + * \note The TensorConfigState is only used as part of the Plan generation algorithm. For a Plan + * to be 'closed' (and therefore not subject to any further merging), all the TensorConfigs that + * describe Plan input or output Tensors must be in the 'BOUNDARY' state with the rest being + * 'INTERIOR'. If any of the input or output tensors are described by an 'INTERIOR' TensorConfig, + * then the Plan is 'open' and should be merged with other 'open' Plans until the result becomes + * 'closed'. + */ + TensorConfigState GetState() const { return state_; } + /*! + * \return The mode in which the buffer should be realized + * \note There are multiple buffering strategies by which a tensor may be realized (computed). + * These affect the amount of recomputation necessary as well as the size of buffer required to + * store the tensor. See 'BufferMode' for a description of the allowable buffering modes. + */ + BufferMode GetBufferMode() const { return buffer_mode_; } + /*! + * \return Whether to copy the tensor. + * \note While a tensor will originally reside in its home region, the TensorConfig may optionally + * specify that the tensor should be copied (according to the StripeConfigs) into another + * MemoryRegion. As an example for where this may be used, if a weights tensor initially resides + * in slow Flash memory then necessarily the home region will be Flash. However, if the weights + * values are used multiple times by a Part, it may be more performant to choose to copy the + * weights into a faster memory like SRAM. + */ + bool DoCopy() const { return copy_tensor_; } + /*! \return The region to copy the tensor to */ + MemoryRegion GetCopyRegion() const { + if (!copy_tensor_) { + return home_region_; + } + return copy_region_; + } + /*! + * \return The StripeConfigs with which to compute the tensor. + * \note The StripeConfigs determine the order in which the elements of the tensor should be + * computed, including potentially computing them multiple times (recompute). Multiple + * StripeConfigs are used over just a single StripeConfig for the case where the tensor is + * consumed by two different Parts executing themselves with different StripeConfigs. In this + * case, there is a StripeConfig per consumer of the tensor. + */ + const std::vector GetStripeConfigs() const { return stripe_configs_; } + /*! + * \return The size of the buffer needed for the TensorConfig. + * \note The size of buffer necessary to store a tensor being produced using the TensorConfig is + * not necessarily just the size of the tensor. In Plans, a tensor may be being produced and + * consumed in 'stripes' which are smaller than the full tensor. Therefore, the buffer necessary + * to store the tensor may only need to be as large as the stripe. The precise size of the buffer + * will depend both on the BufferMode and StripeConfigs (as well as, of course, the Tensor). + */ + int GetBufferSize() const; + /*! \return The hash of the TensorConfigNode */ + size_t GetHash() const { return hash_; } + + static constexpr const char* _type_key = "contrib.ethosu.cascader.TensorConfig"; + TVM_DECLARE_FINAL_OBJECT_INFO(TensorConfigNode, Object); + + protected: + friend class TensorConfig; + + /*! \brief Compute the hash of the TensorConfigNode */ + void ComputeHash_(); + + /*! \return The size of the recompute buffer needed*/ + int GetRecomputeBufferSize_() const; + /*! \return The size of the rolling buffer needed*/ + int GetRollingBufferSize_() const; + + /*! \brief The Tensor the config applies to */ + Tensor tensor_; + /*! \brief The region where the tensor is allocated */ + MemoryRegion home_region_; + /*! \return The state of the TensorConfig */ + TensorConfigState state_; + /*! \brief The mode in which the buffer should be realized */ + BufferMode buffer_mode_; + /*! \return The StripeConfigs with which to compute the tensor */ + std::vector stripe_configs_; + /*! \brief Whether to copy the tensor */ + bool copy_tensor_; + /*! \brief The region to copy the tensor to */ + MemoryRegion copy_region_; + /*! \brief The hash of the TensorConfigNode */ + size_t hash_{0}; +}; + +/*! + * \brief A class which describes how to realize a Tensor. + * \note The TensorConfig describes both how a Tensor is scheduled (the order in which it's + * produced/consumed) and how its allocated in memory (which region it should reside in and whether + * it should be copied). For further detail on how TensorConfig stores this information, consult the + * documentation of TensorConfigNode. + */ +class TensorConfig : public ObjectRef { + public: + TensorConfig(const Tensor& tensor, const MemoryRegion& home_region, TensorConfigState state, + BufferMode buffer_mode, const std::vector& stripe_configs, + bool copy_tensor, const MemoryRegion& copy_region); + /*! + * \brief Check if two TensorConfigs are equal to each other. + * \param other TensorConfig to be checked. + * \return Whether the two TensorConfigs equal each other. + */ + bool operator==(const TensorConfig& other) const; + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(TensorConfig, ObjectRef, TensorConfigNode); +}; + +} // namespace cascader +} // namespace ethosu +} // namespace contrib +} // namespace tvm + +// Hash and equal function for TensorConfig +namespace std { + +/*! \brief The equal_to function for tvm::contrib::ethosu::cascader::TensorConfig */ +template <> +struct equal_to<::tvm::contrib::ethosu::cascader::TensorConfig> { + bool operator()(const ::tvm::contrib::ethosu::cascader::TensorConfig& lhs, + const ::tvm::contrib::ethosu::cascader::TensorConfig& rhs) const { + return lhs == rhs; + } +}; + +/*! \brief The hash function for tvm::contrib::ethosu::cascader::TensorConfig */ +template <> +struct hash<::tvm::contrib::ethosu::cascader::TensorConfig> { + std::size_t operator()( + const ::tvm::contrib::ethosu::cascader::TensorConfig& tensor_config) const { + return tensor_config->GetHash(); + } +}; + +} // namespace std + +#endif // TVM_CONTRIB_ETHOSU_CASCADER_TENSOR_CONFIG_H_ diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py b/tests/python/contrib/test_ethosu/cascader/conftest.py index eacf57c251a8..21ed401994c1 100644 --- a/tests/python/contrib/test_ethosu/cascader/conftest.py +++ b/tests/python/contrib/test_ethosu/cascader/conftest.py @@ -22,6 +22,23 @@ except ImportError: ethosu_enabled = False +import tvm.contrib.ethosu.cascader as cs + + +@pytest.fixture +def FLASH(): + return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) + + +@pytest.fixture +def DRAM(): + return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8) + + +@pytest.fixture +def SRAM(): + return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) + if ethosu_enabled: import tvm @@ -35,6 +52,52 @@ make_ethosu_binary_elementwise, ) + def make_TwoConv2DTE(): + def _get_func(): + ifm = relay.var("ifm", shape=(1, 12, 12, 8), dtype="int8") + conv1 = make_ethosu_conv2d( + ifm=ifm, + ifm_channels=8, + ofm_channels=32, + kernel_shape=(1, 1), + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + activation="NONE", + ifm_layout="NHWC", + ofm_layout="NHCWB16", + ) + conv2 = make_ethosu_conv2d( + ifm=conv1, + ifm_channels=32, + ofm_channels=16, + kernel_shape=(3, 3), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + activation="NONE", + ifm_layout="NHCWB16", + ofm_layout="NHWC", + ) + func = relay.Function(relay.analysis.free_vars(conv2), conv2) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func() + te_graph, const_dict = create_te_graph(func) + sch = tvm.te.create_schedule([t.op for t in te_graph.outputs]) + return sch, te_graph, const_dict + + @pytest.fixture + def TwoConv2DTE(): + return make_TwoConv2DTE() + + @pytest.fixture + def TwoConv2DGraph(): + _, te_graph, const_dict = make_TwoConv2DTE() + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + return cs.create_cascader_graph(te_graph, const_dict, device_config) + def make_TwoConv2DWithSliceTE(): def _get_func(): ifm = relay.var("ifm", shape=(1, 12, 12, 8), dtype="int8") @@ -76,6 +139,12 @@ def _get_func(): def TwoConv2DWithSliceTE(): return make_TwoConv2DWithSliceTE() + @pytest.fixture + def TwoConv2DWithSliceGraph(): + _, te_graph, const_dict = make_TwoConv2DWithSliceTE() + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + return cs.create_cascader_graph(te_graph, const_dict, device_config) + def make_MobileNetv2DiamondTE(): def _get_func(): ifm = relay.var("ifm", shape=(1, 56, 56, 96), dtype="int8") diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py index 5f41dce30147..aa681c41f210 100644 --- a/tests/python/contrib/test_ethosu/cascader/infra.py +++ b/tests/python/contrib/test_ethosu/cascader/infra.py @@ -14,113 +14,147 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +ethosu_enabled = True +try: + import ethosu.vela +except ImportError: + ethosu_enabled = False + import tvm from tvm import relay -from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te - +import tvm.contrib.ethosu.cascader as cs import numpy as np -def create_te_graph(func): - func, consts = extract_constants(func) - mod = tvm.IRModule.from_expr(func) - func = relay.transform.InferType()(mod)["main"] - te_graph = lower_to_te(func) - return te_graph, consts +def make_options( + cascade_region: cs.MemoryRegion, + max_proposals: int = 1, + stripe_factors: int = 1, + max_plan_size: int = 1, + always_copy_size: int = 1024, +): + return cs.CascaderOptions( + cascade_region=cascade_region, + max_proposals=max_proposals, + stripe_factors=stripe_factors, + max_plan_size=max_plan_size, + always_copy_size=always_copy_size, + ) -def make_matrices( - op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1 -): - kernel_h, kernel_w = kernel - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - nhwc_to_nhcwb16 = [ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 0, 1 / 16, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 0, 16], - [0, 0, 0, 0, 1], - ] - nhcwb16_to_nhwc = [ - [1, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 0, 0], - [0, 0, 16, 0, 1, -16], - [0, 0, 0, 0, 0, 1], - ] - if op_type == "ethosu_conv2d": - ifm_matrix = [ - [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], - [0, 0, 0, 0, ifm_channels], - [0, 0, 0, 0, 1], - ] - weight_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, kernel_h], - [0, 0, 0, 0, kernel_w], - [0, 0, 0, 0, ifm_channels], - [0, 0, 0, 0, 1], - ] - elif op_type == "ethosu_depthwise_conv2d": - ifm_matrix = [ +def make_simple_home_map(graph, var_region, const_region): + home_map = {} + for tensor in graph.tensor_order: + if tensor.is_constant: + home_map[tensor] = [const_region] + else: + home_map[tensor] = [var_region] + + return home_map + + +if ethosu_enabled: + from tvm.relay.backend.contrib.ethosu.tir.compiler import extract_constants, lower_to_te + + def create_te_graph(func): + func, consts = extract_constants(func) + mod = tvm.IRModule.from_expr(func) + func = relay.transform.InferType()(mod)["main"] + te_graph = lower_to_te(func) + return te_graph, consts + + def make_matrices( + op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1 + ): + kernel_h, kernel_w = kernel + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + nhwc_to_nhcwb16 = [ [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], - [0, 0, 0, 1, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1 / 16, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 0, 16], [0, 0, 0, 0, 1], ] - weight_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, kernel_h], - [0, 0, 0, 0, kernel_w], - [0, 0, 0, 0, 1], - [0, 0, 0, 0, 1], + nhcwb16_to_nhwc = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 16, 0, 1, -16], + [0, 0, 0, 0, 0, 1], ] - elif op_type == "ethosu_pooling": - ifm_matrix = [ - [1, 0, 0, 0, 0], - [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], - [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + if op_type == "ethosu_conv2d": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 0, ifm_channels], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 1, 0], + [0, 0, 0, 0, kernel_h], + [0, 0, 0, 0, kernel_w], + [0, 0, 0, 0, ifm_channels], + [0, 0, 0, 0, 1], + ] + elif op_type == "ethosu_depthwise_conv2d": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 1, 0], + [0, 0, 0, 0, kernel_h], + [0, 0, 0, 0, kernel_w], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + ] + elif op_type == "ethosu_pooling": + ifm_matrix = [ + [1, 0, 0, 0, 0], + [0, stride_h, 0, 0, (dilated_kernel_h - stride_h)], + [0, 0, stride_w, 0, (dilated_kernel_w - stride_w)], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ] + weight_matrix = [ + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ] + scale_bias_matrix = [ [0, 0, 0, 1, 0], + [0, 0, 0, 0, 10], [0, 0, 0, 0, 1], ] - weight_matrix = [ - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - ] - scale_bias_matrix = [ - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 10], - [0, 0, 0, 0, 1], - ] - if ofm_layout == "NHCWB16": - ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() - weight_matrix = np.matmul(weight_matrix, nhcwb16_to_nhwc).tolist() - scale_bias_matrix = np.matmul(scale_bias_matrix, nhcwb16_to_nhwc).tolist() - if ifm_layout == "NHCWB16": - ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() + if ofm_layout == "NHCWB16": + ifm_matrix = np.matmul(ifm_matrix, nhcwb16_to_nhwc).tolist() + weight_matrix = np.matmul(weight_matrix, nhcwb16_to_nhwc).tolist() + scale_bias_matrix = np.matmul(scale_bias_matrix, nhcwb16_to_nhwc).tolist() + if ifm_layout == "NHCWB16": + ifm_matrix = np.matmul(nhwc_to_nhcwb16, ifm_matrix).tolist() - ifm_offset = ( - [0, -padding[0], -padding[1], 0] - if ifm_layout == "NHWC" - else [0, -padding[0], 0, -padding[1], 0] - ) - weight_offset = [0, 0, 0, 0] - scale_bias_offset = [0, 0] - return ( - ifm_matrix, - ifm_offset, - weight_matrix, - weight_offset, - scale_bias_matrix, - scale_bias_offset, - ) + ifm_offset = ( + [0, -padding[0], -padding[1], 0] + if ifm_layout == "NHWC" + else [0, -padding[0], 0, -padding[1], 0] + ) + weight_offset = [0, 0, 0, 0] + scale_bias_offset = [0, 0] + return ( + ifm_matrix, + ifm_offset, + weight_matrix, + weight_offset, + scale_bias_matrix, + scale_bias_offset, + ) diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py new file mode 100644 index 000000000000..2d897a79310f --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from tvm.tir import IntImm +from tvm.contrib.ethosu.cascader.pareto import ( + _get_pareto_frontier, + _thin_vector, + _pareto_cull_plans, +) +from tvm.contrib.ethosu.cascader import ( + Plan, + StripeConfig, + TensorConfig, + TensorConfigState, + BufferMode, + Tensor, +) + +import pytest +import numpy as np + + +def _ref_get_pareto_frontier(costs): + is_efficient = np.ones(costs.shape[0], dtype=bool) + for i, c in enumerate(costs): + if is_efficient[i]: + is_efficient[is_efficient] = np.any( + costs[is_efficient] < c, axis=1 + ) # Keep any point with a lower cost + is_efficient[i] = True # And keep self + return is_efficient + + +def _ref_thin_vector(vec, max_size): + if max_size < 1: + return [] + if len(vec) <= max_size or len(vec) == 0: + return vec + if max_size == 1: + return [vec[0]] + samples = np.linspace(0, len(vec), max_size - 1, endpoint=False).astype(int) + samples = np.append(samples, len(vec) - 1) + return vec[samples] + + +def _ref_pareto_cull_plans(plans, points): + if len(plans) <= points: + return plans + plans = np.array(sorted(plans, key=lambda x: x.memory_usage)) + costs = [] + for plan in plans: + costs.append(np.array([plan.memory_usage, plan.cycles])) + is_efficient = _ref_get_pareto_frontier(np.array(costs)) + culled_plans = plans[is_efficient] + thinned_plans = ( + culled_plans + if len(culled_plans) <= points + else _ref_thin_vector(np.array(culled_plans), points) + ) + return thinned_plans + + +@pytest.mark.parametrize("num_costs", [1, 10, 30, 100, 300, 1000]) +def test_get_pareto_frontier(num_costs): + cost_low = 1 + cost_high = 100 + dims = 2 + costs = [] + for i in range(num_costs): + costs.append(list(np.random.randint(cost_low, cost_high, size=(dims,)))) + reference = list(_ref_get_pareto_frontier(np.array(costs))) + result = _get_pareto_frontier(costs) + assert result == reference + + +@pytest.mark.parametrize("vec_length", [0, 1, 10, 25, 100]) +@pytest.mark.parametrize("max_size", [0, 1, 2, 5, 11, 51]) +def test_thin_vector(vec_length, max_size): + def _make_vector(length): + vector = [] + for i in range(length): + obj = IntImm("int32", i) + vector.append(obj) + + return vector + + vector = _make_vector(vec_length) + reference = list(_ref_thin_vector(np.array(vector), max_size)) + result = _thin_vector(vector, max_size) + assert result == reference + + +@pytest.mark.parametrize("num_plans", [0, 1, 10, 25, 100]) +@pytest.mark.parametrize("max_plans", [0, 1, 2, 5, 11, 51]) +def test_pareto_cull_plans(num_plans, max_plans, SRAM): + memory_usage_low = 1 + memory_usage_high = 1000 + cycles_low = 100 + cycles_high = 10000 + + def _make_plan(memory_usage, cycles): + output_config = TensorConfig( + tensor=Tensor([1], "int8"), + home_region=SRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.RECOMPUTE, + stripe_configs=[StripeConfig([1], [1], [1], [1], [1], [0])], + ) + return Plan( + tensor_configs={}, + open_configs=[], + output_config=output_config, + part_group=[], + interior_region=SRAM, + memory_usage=memory_usage, + cycles=cycles, + ) + + def _make_plans(num): + plans = [] + for _ in range(num): + memory_usage = np.random.randint(memory_usage_low, memory_usage_high) + cycles = np.random.randint(cycles_low, cycles_high) + plan = _make_plan(memory_usage, cycles) + plans.append(plan) + + return plans + + plans = _make_plans(num_plans) + reference = list(_ref_pareto_cull_plans(plans, max_plans)) + result = _pareto_cull_plans(plans, max_plans) + assert result == reference + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan.py b/tests/python/contrib/test_ethosu/cascader/test_plan.py new file mode 100644 index 000000000000..ddc40b49ac8a --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_plan.py @@ -0,0 +1,244 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm.contrib.ethosu.cascader as cs + +import pytest + + +def test_plan(DRAM, SRAM): + subgraph = cs.TESubgraph([], None) + part = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([10, 10], "uint8") + tensor_2 = cs.Tensor([10, 10], "uint8") + + part.set_input(0, tensor_1) + part.set_output(tensor_2) + tensor_1.add_consumer(part) + tensor_2.add_producer(part) + + output_stripe_config = cs.StripeConfig( + shape=[5, 5], + extent=[10, 10], + strides=[5, 5], + order=[1, 2], + stripes=[2, 2], + offset=[0, 0], + ) + tensor_config_out = cs.TensorConfig( + tensor=tensor_2, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[output_stripe_config], + copy_tensor=False, + ) + input_stripe_config = part.calculate_input_stripe_configs(output_stripe_config)[0] + tensor_config_in = cs.TensorConfig( + tensor=tensor_1, + home_region=DRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_config], + copy_tensor=False, + ) + tensor_configs = {tensor_1: tensor_config_in, tensor_2: tensor_config_out} + open_configs = frozenset([tensor_config_in]) + part_group = frozenset([part]) + interior_region = SRAM + memory_usage = 100 + cycles = 20 + plan = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_out, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + assert plan.tensor_configs == tensor_configs + assert plan.open_configs == open_configs + assert plan.output_config == tensor_config_out + assert plan.part_group == part_group + assert plan.interior_region == interior_region + assert plan.memory_usage == memory_usage + assert plan.cycles == cycles + + +def test_plan_merge(DRAM, SRAM): + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + part_2 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[0, 0, 6], [0, 0, 6], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[1, 0], [0, 1]], + [0], + ), + ], + ) + tensor_1 = cs.Tensor([20, 20], "uint8") + tensor_2 = cs.Tensor([10, 10], "uint8") + tensor_3 = cs.Tensor([6, 6], "uint8") + tensor_4 = cs.Tensor([10], "uint8") + tensor_5 = cs.Tensor([10, 10], "uint8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + part_2.set_input(0, tensor_2) + part_2.set_input(1, tensor_3) + part_2.set_input(2, tensor_4) + part_2.set_output(tensor_5) + tensor_2.add_consumer(part_2) + tensor_3.add_consumer(part_2) + tensor_4.add_consumer(part_2) + tensor_5.add_producer(part_2) + + output_stripe_config = cs.StripeConfig( + shape=[5, 5], + extent=[10, 10], + strides=[5, 5], + order=[1, 2], + stripes=[2, 2], + offset=[0, 0], + ) + tensor_config_5 = cs.TensorConfig( + tensor=tensor_5, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[output_stripe_config], + copy_tensor=False, + ) + input_stripe_configs = part_2.calculate_input_stripe_configs(output_stripe_config) + tensor_config_4 = cs.TensorConfig( + tensor=tensor_4, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[input_stripe_configs[2]], + copy_tensor=False, + ) + tensor_config_3 = cs.TensorConfig( + tensor=tensor_3, + home_region=SRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.RECOMPUTE, + stripe_configs=[input_stripe_configs[1]], + copy_tensor=False, + ) + tensor_config_2 = cs.TensorConfig( + tensor=tensor_2, + home_region=SRAM, + state=cs.TensorConfigState.INTERIOR, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_configs[0]], + copy_tensor=False, + ) + input_stripe_config = part_1.calculate_input_stripe_configs(input_stripe_configs[0])[0] + tensor_config_1 = cs.TensorConfig( + tensor=tensor_1, + home_region=DRAM, + state=cs.TensorConfigState.BOUNDARY, + buffer_mode=cs.BufferMode.ROLLING, + stripe_configs=[input_stripe_config], + copy_tensor=False, + ) + tensor_configs = {tensor_1: tensor_config_1, tensor_2: tensor_config_2} + open_configs = frozenset([tensor_config_2]) + part_group = frozenset([part_1]) + interior_region = SRAM + memory_usage = 100 + cycles = 20 + plan_1 = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_2, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + tensor_configs = { + tensor_2: tensor_config_2, + tensor_3: tensor_config_3, + tensor_4: tensor_config_4, + tensor_5: tensor_config_5, + } + open_configs = frozenset([tensor_config_2, tensor_config_3]) + part_group = frozenset([part_2]) + interior_region = SRAM + memory_usage = 200 + cycles = 30 + plan_2 = cs.Plan( + tensor_configs=tensor_configs, + open_configs=open_configs, + output_config=tensor_config_5, + part_group=part_group, + interior_region=interior_region, + memory_usage=memory_usage, + cycles=cycles, + ) + + merged_plan = plan_1.merge(plan_2) + + assert merged_plan.tensor_configs == { + tensor_1: tensor_config_1, + tensor_2: tensor_config_2, + tensor_3: tensor_config_3, + tensor_4: tensor_config_4, + tensor_5: tensor_config_5, + } + assert merged_plan.open_configs == frozenset([tensor_config_3]) + assert merged_plan.output_config == tensor_config_5 + assert merged_plan.part_group == frozenset([part_1, part_2]) + assert merged_plan.interior_region == interior_region + assert merged_plan.memory_usage == plan_1.memory_usage + plan_2.memory_usage + assert merged_plan.cycles == plan_1.cycles + plan_2.cycles + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py new file mode 100644 index 000000000000..ffee071f0e95 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +import tvm.contrib.ethosu.cascader as cs +from .infra import make_simple_home_map, make_options, ethosu_enabled + +from tvm.contrib.ethosu.cascader.plan_generator import ( + _generate_output_stripe_configs, + _generate_single_plans, + _generate_graph_plans, +) + + +def test_generate_output_stripe_configs(): + stripe_factors = 3 + expected_configs = 13 + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([800, 800], "uint8") + tensor_2 = cs.Tensor([400, 400], "uint8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + assert len(_generate_output_stripe_configs(part_1, stripe_factors)) == expected_configs + + +def test_generate_single_plans(SRAM, DRAM): + subgraph = cs.TESubgraph([], None) + part_1 = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[2, 0, 0], [0, 2, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([800, 800], "int8") + tensor_2 = cs.Tensor([400, 400], "int8") + + part_1.set_input(0, tensor_1) + part_1.set_output(tensor_2) + tensor_1.add_consumer(part_1) + tensor_2.add_producer(part_1) + + home_map = { + tensor_1: [SRAM, DRAM], + tensor_2: [SRAM], + } + options = make_options(cascade_region=SRAM, stripe_factors=1) + output_stripe_configs = _generate_output_stripe_configs(part_1, options.stripe_factors) + plans = _generate_single_plans(part_1, output_stripe_configs, home_map, options) + for plan in plans: + assert plan.interior_region == SRAM + assert plan.part_group == frozenset([part_1]) + assert set(plan.tensor_configs.keys()) == set([tensor_1, tensor_2]) + for open_config in plan.open_configs: + assert open_config.state == cs.TensorConfigState.INTERIOR + + +def test_generate_graph_plans(SRAM, DRAM): + num_part_groups = 3 + stripe_factors = 4 + max_plan_size = 10 + subgraph = cs.TESubgraph([], None) + part_a = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + cs.Propagator( + [[0, 1, 0], [1, 0, 0], [0, 0, 1]], + [-1, -1], + ), + ], + ) + part_b = cs.InlinePart( + subgraph, + [ + cs.Propagator( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [0, 0], + ), + ], + ) + tensor_1 = cs.Tensor([10, 10], "int8") + tensor_2 = cs.Tensor([9, 9], "int8") + tensor_3 = cs.Tensor([10, 10], "int8") + tensor_4 = cs.Tensor([10, 10], "int8") + + part_a.set_input(0, tensor_1) + part_a.set_input(1, tensor_2) + part_a.set_output(tensor_3) + tensor_1.add_consumer(part_a) + tensor_2.add_consumer(part_a) + tensor_3.add_producer(part_a) + part_b.set_input(0, tensor_3) + part_b.set_output(tensor_4) + tensor_3.add_consumer(part_b) + tensor_4.add_producer(part_b) + + graph = cs.CascaderGraph([tensor_1, tensor_2], [tensor_4]) + home_map = { + tensor_1: [SRAM, DRAM], + tensor_2: [SRAM], + tensor_3: [SRAM], + tensor_4: [SRAM, DRAM], + } + + options = make_options( + cascade_region=SRAM, + stripe_factors=stripe_factors, + max_plan_size=max_plan_size, + ) + closed_plans = _generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + +if ethosu_enabled: + + def test_plan_generator_two_conv2d(FLASH, SRAM, TwoConv2DGraph): + num_part_groups = 3 + graph = TwoConv2DGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) + + closed_plans = _generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGraph): + num_part_groups = 4 # Note this is not 6 because 'slice' has an opaque Propagator + graph = TwoConv2DWithSliceGraph + home_map = make_simple_home_map(graph, SRAM, FLASH) + options = make_options( + cascade_region=SRAM, + stripe_factors=4, + max_plan_size=10, + ) + + closed_plans = _generate_graph_plans(graph, home_map, options) + + assert len(closed_plans) == num_part_groups + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py new file mode 100644 index 000000000000..68290e667eb0 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from tvm.contrib.ethosu.cascader import ( + StripeConfig, + Tensor, + MemoryRegion, + TensorConfig, + TensorConfigState, + BufferMode, +) + +import pytest + + +def test_tensor_config(DRAM, SRAM): + stripe_config = StripeConfig( + shape=[1, 2, 3], + extent=[2, 3, 4], + strides=[3, 4, 5], + order=[4, 5, 6], + stripes=[5, 6, 7], + offset=[6, 7, 8], + ) + tensor = Tensor( + shape=[10, 10, 10], + dtype="int8", + ) + home_region = DRAM + state = TensorConfigState.BOUNDARY + buffer_mode = BufferMode.ROLLING + copy_tensor = True + copy_region = SRAM + tensor_config = TensorConfig( + tensor=tensor, + home_region=home_region, + state=state, + buffer_mode=buffer_mode, + stripe_configs=[stripe_config], + copy_tensor=copy_tensor, + copy_region=copy_region, + ) + + assert tensor_config.tensor == tensor + assert tensor_config.home_region == home_region + assert tensor_config.state == state + assert tensor_config.buffer_mode == buffer_mode + assert tensor_config.stripe_configs == [stripe_config] + assert tensor_config.copy_tensor == copy_tensor + assert tensor_config.copy_region == copy_region + assert hash(tensor_config) != 0 + + +def test_get_rolling_buffer(DRAM): + stripe_config = StripeConfig( + shape=[9, 4, 7], + extent=[9, 16, 21], + strides=[3, 5, 7], + order=[1, 3, 2], + stripes=[1, 3, 3], + offset=[0, 0, 0], + ) + tensor = Tensor(shape=[9, 16, 21], dtype="int32", compression_ratio=0.5) + tensor_config = TensorConfig( + tensor=tensor, + home_region=DRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.ROLLING, + stripe_configs=[stripe_config], + ) + + assert tensor_config.get_buffer_size() == 2016 + + +def test_get_recompute_buffer(DRAM): + stripe_config = StripeConfig( + shape=[4, 5, 7], + extent=[6, 7, 14], + strides=[2, 3, 7], + order=[1, 3, 2], + stripes=[2, 2, 2], + offset=[0, 0, 0], + ) + tensor = Tensor(shape=[6, 7, 14], dtype="int32", compression_ratio=0.5) + tensor_config = TensorConfig( + tensor=tensor, + home_region=DRAM, + state=TensorConfigState.BOUNDARY, + buffer_mode=BufferMode.RECOMPUTE, + stripe_configs=[stripe_config], + ) + + assert tensor_config.get_buffer_size() == 280 + + +if __name__ == "__main__": + pytest.main([__file__])