From 5fe193006ab21d5118c7d7e723d3cff56b302e0c Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 26 Nov 2021 21:50:48 +0000 Subject: [PATCH 1/3] [microNPU] Support different constant datatypes Currently only uint8 datatype is supported for constants, as this is all that was necessary until now. This PR allows different datatypes to be used for constants, including different datatypes within the same graph. A workaround was previously added for Mean legalization, this has also been removed and replaced with the expected datatype of the constant. Change-Id: I99e34fe17905b1bb7d916e346cebfc324e3a2a0c --- .../relay/backend/contrib/ethosu/legalize.py | 3 +- .../contrib/ethosu/tir_to_cs_translator.py | 45 +++++++------- .../op/contrib/ethosu/binary_elementwise.cc | 23 ++----- .../contrib/test_ethosu/test_codegen.py | 10 ++- .../test_ethosu/test_tir_to_cs_translator.py | 62 ++++++++++++++++--- 5 files changed, 90 insertions(+), 53 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py index e35fe1543fa2..04635efb0fd7 100644 --- a/python/tvm/relay/backend/contrib/ethosu/legalize.py +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -1040,7 +1040,7 @@ def callback( n = int(filter_height * filter_width) eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0 - scalar_tensor = relay.const(np.ones([1, 1, 1, 1], dtype="uint8"), dtype="uint8") + scalar_tensor = relay.const(np.ones([1, 1, 1, 1], dtype="int16"), dtype="int16") reduced_op = ethosu_ops.ethosu_binary_elementwise( ifm=reduced_op, @@ -1156,6 +1156,7 @@ def transform_module( mod = LegalizeReshape()(mod) mod = LegalizeStridedSlice()(mod) mod = LegalizeNoOps()(mod) + print(mod) return mod def __call__(self, *args, **kwargs): diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index c8e3d34d1c29..a22fb11e7093 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -110,12 +110,11 @@ def translate(tir_module, params): _npu_ops = list() for call_extern in call_extern_list: _npu_ops.append(translate_ethosu_tir_call_extern(call_extern)) - _npu_ops, constant_tensor, scratch_size = assign_addresses(buffer_info, _npu_ops) + _npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, _npu_ops) target_accel_config = vela_api.get_accelerator_config() cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config) payload = vapi.npu_create_driver_payload(cmds, target_accel_config) - hex_value = "" if constant_tensor is None else constant_tensor.tobytes().hex() - return payload.hex(), hex_value, scratch_size + return payload.hex(), constant_data, scratch_size def extract_call_extern_list(mod): @@ -277,27 +276,24 @@ def classify_io(buffer): raise ValueError(f"Unused IO : {buffer} in tir module.") scratch_size = 0 - constant_tensor = None + constant_hex_data = [] + total_constant_len = 0 buffer_addresses = dict() for _buffer, info in buffer_info.items(): + dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8 if info.values is not None: - assert np.dtype(info.dtype) == np.uint8 assert info.btype == BufferType.constant assert len(info.shape) == 1 - if constant_tensor is None: - buffer_addresses[_buffer] = (0, info.btype) - assert info.values.dtype == np.uint8 - size_in_bytes = info.values.size - # Every memory address the NPU access have to be 16 byte aligned - size_in_bytes 
= util.round_up(size_in_bytes, 16) - constant_tensor = np.resize(info.values, size_in_bytes) - else: - buffer_addresses[_buffer] = (constant_tensor.size, info.btype) - assert info.values.dtype == np.uint8 - size_in_bytes = info.values.size - # Every memory address the NPU access have to be 16 byte aligned - size_in_bytes = util.round_up(size_in_bytes, 16) - constant_tensor = np.append(constant_tensor, np.resize(info.values, size_in_bytes)) + buffer_addresses[_buffer] = ( + (total_constant_len, info.btype) if constant_hex_data else (0, info.btype) + ) + size_in_bytes = dtype_bytes * np.prod(list(info.shape)) + # Every memory address the NPU access have to be 16 byte aligned + size_in_bytes = util.round_up(size_in_bytes, 16) + constant_tensor = np.resize(info.values, size_in_bytes // dtype_bytes) + constant_tensor = constant_tensor.tobytes().hex() + constant_hex_data.append(constant_tensor) + total_constant_len += len(constant_tensor) // 2 else: if info.btype == BufferType.input_or_output: buffer_type = classify_io(_buffer) @@ -310,9 +306,7 @@ def classify_io(buffer): address = arch_config.lut_start_address buffer_addresses[_buffer] = (address, info.btype) else: - size_in_bytes = int( - (np.iinfo(np.dtype(info.dtype)).bits // 8) * np.prod(list(info.shape)) - ) + size_in_bytes = int(dtype_bytes * np.prod(list(info.shape))) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) assert info.btype == BufferType.scratch @@ -330,7 +324,12 @@ def classify_io(buffer): else: setattr(npu_op, attr_name, replace_tir_loads(attr)) - return npu_ops, constant_tensor, scratch_size + constant_data = "".join(constant_hex_data) + return ( + npu_ops, + constant_data, + scratch_size, + ) def translate_ethosu_tir_call_extern(tir_call_extern): diff --git a/src/relay/op/contrib/ethosu/binary_elementwise.cc b/src/relay/op/contrib/ethosu/binary_elementwise.cc index 4e0d086e66b8..e7622452166c 100644 --- a/src/relay/op/contrib/ethosu/binary_elementwise.cc +++ b/src/relay/op/contrib/ethosu/binary_elementwise.cc @@ -128,21 +128,6 @@ struct EthosuBinaryElementwiseAttrs : public tvm::AttrsNode& ifm_shape, const DataType& ifm_dtype) { - if (ifm_dtype != DataType::UInt(8)) { - return false; - } - - for (const auto& expr : ifm_shape) { - const auto& dim_int_node = expr.as(); - CHECK(dim_int_node) << "Expected IntImmNode for shape dimensions."; - int dim = dim_int_node->value; - if (dim != 1) return false; - } - - return true; -} - bool EthosuBinaryElementwiseRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { const int ifm_index = 0; @@ -167,11 +152,13 @@ bool EthosuBinaryElementwiseRel(const Array& types, int num_inputs, const ofm_dtype = DataType::Int(8); } else if (param->ofm_dtype == "uint8") { ofm_dtype = DataType::UInt(8); + } else if (param->ofm_dtype == "int16") { + ofm_dtype = DataType::Int(16); } else if (param->ofm_dtype == "int32") { ofm_dtype = DataType::Int(32); } - if (ifm_dtype != ifm2_dtype && !IsScalarTensor(ifm2->shape, ifm2_dtype)) { + if (ifm_dtype != ifm2_dtype) { reporter->GetDiagCtx().EmitFatal(Diagnostic::Error(reporter->GetSpan()) << "Invalid operator: expected ethosu_binary_elementwise " << "type for ifm2 be the same of ifm but was " << ifm2_dtype @@ -189,11 +176,11 @@ bool EthosuBinaryElementwiseRel(const Array& types, int num_inputs, const return false; } if (ofm_dtype != DataType::UInt(8) && ofm_dtype != DataType::Int(8) && - ofm_dtype != DataType::Int(32)) { + ofm_dtype != DataType::Int(16) && 
ofm_dtype != DataType::Int(32)) { reporter->GetDiagCtx().EmitFatal( Diagnostic::Error(reporter->GetSpan()) << "Invalid operator: expected ethosu_binary_elementwise " << operator_type - << " type(uint8) or type(int8) or type(int32) for ofm but was " << ofm_dtype); + << " type(uint8), type(int8), type(int16) or type(int32) for ofm but was " << ofm_dtype); return false; } } else if (operator_type == "MIN" || operator_type == "MAX") { diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 42695db08342..3eba30b654f7 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -515,8 +515,8 @@ def create_mod_from_relay(): @pytest.mark.parametrize("accel_type", ACCEL_TYPES) -def test_binary_add_from_constant_scalar(accel_type): - dtype = "uint8" +@pytest.mark.parametrize("dtype", ["int8", "uint8"]) +def test_elementwise_add_from_constant_scalar(accel_type, dtype): ifm_shape = (1, 4, 4, 8) def create_relay_graph(): @@ -538,7 +538,11 @@ def create_relay_graph(): ethosu_mod = partition_for_ethosu(cpu_mod) # Generate reference data - input_data = {"input": np.random.randint(low=0, high=255, size=ifm_shape, dtype=dtype)} + input_data = { + "input": np.random.randint( + low=np.iinfo(dtype).min, high=np.iinfo(dtype).max, size=ifm_shape, dtype=dtype + ), + } output_data = generate_ref_data(cpu_mod, input_data) _compare_ethosu_with_reference( diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index 59b7b2c21723..c14deb636c25 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -651,6 +651,31 @@ def populate_ethosu_copy_calls(stmt): assert npu_dma_op.dest.length == test_case["ref"][idx]["length"] +# fmt: off +@tvm.script.ir_module +class MixedConstantDatatypes: + @T.prim_func + def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_write: T.handle, placeholder_3: T.handle) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + placeholder_4 = T.match_buffer(placeholder, [1, 8, 16, 16], dtype="int8") + buffer = T.match_buffer(placeholder_1, [160], dtype="uint8") + placeholder_5 = T.match_buffer(placeholder_2, [1, 1, 1, 1], dtype="int16") + ethosu_write_1 = T.match_buffer(ethosu_write, [1, 1, 1, 16], dtype="int8") + buffer_1 = T.match_buffer(placeholder_3, [272], dtype="uint8") + # body + placeholder_global = T.allocate([272], "uint8", "global") + placeholder_d_global = T.allocate([160], "uint8", "global") + ethosu_write_2 = T.allocate([16], "int16", "global") + placeholder_d_global_1 = T.allocate([1], "int16", "global") + T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer_1.data, 0), 272, T.load("uint8", placeholder_global, 0), dtype="uint8")) + T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer.data, 0), 160, T.load("uint8", placeholder_d_global, 0), dtype="uint8")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 16, 16, 8, 0, 16, T.load("int8", placeholder_4.data, 0), 0, 0, 0, T.float32(0.0039215548895299435), -128, "NHWC", 256, 16, 1, "int16", 1, 1, 16, 1, 0, 1, T.load("int16", ethosu_write_2, 0), 0, 0, 0, T.float32(0.0023205536417663097), -128, "NHWC", 1, 1, 1, 16, 8, 1, 1, 1, 1, T.load("uint8", placeholder_global, 0), 272, 0, T.load("uint8", 
placeholder_d_global, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="int16")) + T.evaluate(T.call_extern("ethosu_copy", T.load("int16", placeholder_5.data, 0), 1, T.load("int16", placeholder_d_global_1, 0), dtype="int16")) + T.evaluate(T.call_extern("ethosu_binary_elementwise", "int16", 1, 1, 16, 1, 0, 1, T.load("int16", ethosu_write_2, 0), 0, 0, 0, T.float32(0.0023205536417663097), -128, "NHWC", 1, 1, 1, "int16", 1, 1, 1, 1, 0, 1, T.load("int16", placeholder_d_global_1, 0), 0, 0, 0, T.float32(0.0078125018482064768), 0, "NHWC", 1, 1, 1, "int8", 1, 1, 16, 1, 0, 1, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.0023205536417663097), -128, "NHWC", 1, 1, 1, "MUL", 0, "NONE", 0, 0, "NATURAL", dtype="int8")) +# fmt: on + + def test_assign_addresses(): test_cases = [ { @@ -683,6 +708,15 @@ def test_assign_addresses(): 11: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"), }, }, + { + # Stimulus + "tir_module": MixedConstantDatatypes, + "param_dict": { + 1: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [160], "uint8"), + 2: np.random.randint(np.iinfo("int16").min, np.iinfo("int16").max, [1], "int16"), + 4: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [272], "uint8"), + }, + }, ] def extract_call_extern_list(mod): @@ -747,24 +781,36 @@ def _check_buffer(address, region, length, buffer_var): 4: tir_to_cs_translator.BufferType.output, } buffer_type = inverse_region_map[region] + buffer_dtype = buffer_var.type_annotation.element_type.dtype + dtype_bytes = np.iinfo(np.dtype(buffer_dtype)).bits // 8 if buffer_type == tir_to_cs_translator.BufferType.constant: ref = buffer_info[buffer_var].values - assert (constant_tensor[address : address + length] == ref).all() + hex_from = address * dtype_bytes * 2 + hex_to = hex_from + length * dtype_bytes * 2 + constant_hex = constant_hex_string[hex_from:hex_to] + constant_tensor = np.frombuffer(bytearray.fromhex(constant_hex), dtype=buffer_dtype) + np.array_equal(constant_tensor, ref) # Every buffer is adjusted to align to 16 bytes length = util.round_up(length, 16) # Mark these constants are read at least once - constant_tensor_read_mask[address : address + length] = np.ones(length, dtype="uint8") + constant_tensor_read_mask[address : address + length] = np.ones( + length, dtype=buffer_dtype + ) elif buffer_type == tir_to_cs_translator.BufferType.scratch: shape = list(buffer_info[buffer_var].shape) assert length == np.prod(shape) assert address < scratch_size + + size_in_bytes = int(np.prod(shape)) * dtype_bytes # Every buffer is adjusted to align to 16 bytes - length = util.round_up(length, 16) - assert address + length <= scratch_size + size_in_bytes = util.round_up(size_in_bytes, 16) + assert address + size_in_bytes <= scratch_size # The scratch area should not be used by anyother buffer - assert not scratch_allocation_mask[address : address + length].any() + assert not scratch_allocation_mask[address : address + size_in_bytes].any() # The scratch area is marked as used - scratch_allocation_mask[address : address + length] = np.ones(length, dtype="uint8") + scratch_allocation_mask[address : address + size_in_bytes] = np.ones( + size_in_bytes, dtype="uint8" + ) elif buffer_type == tir_to_cs_translator.BufferType.input: assert address == 0 else: @@ -841,11 +887,11 @@ def check_buffer(address, region, length, buffer_var): for extern_call in extern_calls: _npu_ops.append(tir_to_cs_translator.translate_ethosu_tir_call_extern(extern_call)) npu_op_tir_buffers = 
collect_tir_buffer_info(_npu_ops) - _npu_ops, constant_tensor, scratch_size = tir_to_cs_translator.assign_addresses( + _npu_ops, constant_hex_string, scratch_size = tir_to_cs_translator.assign_addresses( buffer_info, _npu_ops ) scratch_allocation_mask = np.zeros(scratch_size, dtype="uint8") - constant_tensor_read_mask = np.zeros(constant_tensor.size, dtype="uint8") + constant_tensor_read_mask = np.zeros(len(constant_hex_string) // 2, dtype="uint8") verify(_npu_ops) # This will be only 1 if all allocated scratch is used. assert np.prod(scratch_allocation_mask) == 1 From 3a088df16577aaaa7f8a7b65f2649db09cdcc28c Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 2 Dec 2021 10:57:43 +0000 Subject: [PATCH 2/3] dont calculate dtype bytes for input or output Change-Id: I8432f3d9dcc1001bbad40b76127075a1019197d4 --- .../tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index a22fb11e7093..7136776e234e 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -280,13 +280,13 @@ def classify_io(buffer): total_constant_len = 0 buffer_addresses = dict() for _buffer, info in buffer_info.items(): - dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8 if info.values is not None: assert info.btype == BufferType.constant assert len(info.shape) == 1 buffer_addresses[_buffer] = ( (total_constant_len, info.btype) if constant_hex_data else (0, info.btype) ) + dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8 size_in_bytes = dtype_bytes * np.prod(list(info.shape)) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) @@ -306,6 +306,7 @@ def classify_io(buffer): address = arch_config.lut_start_address buffer_addresses[_buffer] = (address, info.btype) else: + dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8 size_in_bytes = int(dtype_bytes * np.prod(list(info.shape))) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) From d785a1625adcea55ed6ba585663444f6165a5b4c Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 2 Dec 2021 17:13:19 +0000 Subject: [PATCH 3/3] remove print Change-Id: I9b8b9df3af27023f60ef4934a918d40140a9534f --- python/tvm/relay/backend/contrib/ethosu/legalize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py index 04635efb0fd7..0e99c8aa479e 100644 --- a/python/tvm/relay/backend/contrib/ethosu/legalize.py +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -1156,7 +1156,6 @@ def transform_module( mod = LegalizeReshape()(mod) mod = LegalizeStridedSlice()(mod) mod = LegalizeNoOps()(mod) - print(mod) return mod def __call__(self, *args, **kwargs):
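
---

Addendum (illustration only, not part of the patch series): a minimal, self-contained sketch of the constant-packing scheme that assign_addresses moves to in patch 1. It assumes only numpy; the helper names round_up_to and pack_constants are hypothetical stand-ins for the TVM utilities. Each constant is serialised with its own element size, padded out to the NPU's 16-byte alignment, appended to a single hex string, and located by its byte offset.

import numpy as np


def round_up_to(value, alignment):
    # Smallest multiple of `alignment` that is >= `value`.
    return ((value + alignment - 1) // alignment) * alignment


def pack_constants(constants, alignment=16):
    """Pack 1-D constant tensors of mixed dtypes into one hex string.

    Returns the hex string plus the byte offset of each constant; every
    constant is padded so that the next one starts on an `alignment`
    boundary, matching the NPU requirement that addresses are 16-byte
    aligned.
    """
    hex_chunks = []
    byte_offsets = []
    total_bytes = 0
    for values in constants:
        byte_offsets.append(total_bytes)
        dtype_bytes = np.iinfo(values.dtype).bits // 8
        size_in_bytes = round_up_to(dtype_bytes * values.size, alignment)
        # np.resize fills the padding by repeating the data; the extra
        # elements are never addressed, so their contents do not matter.
        padded = np.resize(values, size_in_bytes // dtype_bytes)
        chunk = padded.tobytes().hex()
        hex_chunks.append(chunk)
        total_bytes += len(chunk) // 2  # two hex characters per byte
    return "".join(hex_chunks), byte_offsets


if __name__ == "__main__":
    # A uint8 block followed by an int16 scalar -- the kind of datatype
    # mix exercised by the MixedConstantDatatypes test case above.
    scale_bias = np.random.randint(0, 255, size=[20], dtype="uint8")
    scalar = np.ones([1], dtype="int16")
    blob, offsets = pack_constants([scale_bias, scalar])
    assert offsets == [0, 32]          # 20 uint8 bytes round up to 32
    assert len(blob) // 2 == 32 + 16   # the int16 scalar fills one aligned block

Keeping the packed constants as a hex string rather than a single uint8 numpy array is what lets buffers of different datatypes share one constant region without first being reinterpreted as uint8, which is the behaviour the old uint8-only asserts enforced.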