From 582f90cbab430b254a60f41c6ebfa1ac3a36f4e8 Mon Sep 17 00:00:00 2001
From: ibsidorenko
Date: Wed, 4 May 2022 12:59:05 +0300
Subject: [PATCH 1/4] [QNN] Enable constant folding for QNN operations.

This commit enables constant folding for QNN operations. This
functionality is disabled by default; pass fold_qnn=True to enable it.

Co-authored-by: Alexander Peskov
---
 include/tvm/relay/transform.h                 |  4 ++-
 python/tvm/relay/transform/transform.py       | 15 +++++++---
 src/relay/backend/interpreter.cc              |  3 +-
 src/relay/transforms/fold_constant.cc         | 27 +++++++++++------
 tests/python/relay/test_pass_fold_constant.py | 29 +++++++++++++++++++
 5 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 4a6b06f14f94..13162d804620 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -105,9 +105,11 @@ TVM_DLL Pass LazyGradientInit();
 /*!
  * \brief Fold constant expressions.
  *
+ * \param fold_qnn Whether to fold constants for QNN operations.
+ *
  * \return The pass.
  */
-TVM_DLL Pass FoldConstant();
+TVM_DLL Pass FoldConstant(bool fold_qnn = false);
 
 /*!
  * \brief Split function with huge number of arguments to smaller pieces.
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 566d0ffa2bfa..2bd071cc058a 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -261,7 +261,7 @@ def LazyGradientInit():
     return _ffi_api.LazyGradientInit()
 
 
-def FoldConstantExpr(expr, mod):
+def FoldConstantExpr(expr, mod, fold_qnn=False):
     """Fold the constant expressions in a Relay program.
     Parameters
     ----------
@@ -269,24 +269,31 @@ def FoldConstantExpr(expr, mod, fold_qnn=False):
         The expression to fold
     mod: IRModule
         The module the expr lives in (for global calls)
+    fskip: bool
+        Whether to fold constants for QNN operations.
 
     Returns
     -------
     new_expr: Expr
         The expr after Constant Folding
     """
-    return _ffi_api.FoldConstantExpr(expr, mod)
+    return _ffi_api.FoldConstantExpr(expr, mod, fold_qnn)
 
 
-def FoldConstant():
+def FoldConstant(fold_qnn=False):
     """Fold the constant expressions in a Relay program.
 
+    Parameters
+    ----------
+    fskip: bool
+        Whether to fold constants for QNN operations.
+
     Returns
     -------
     ret : tvm.transform.Pass
         The registered pass for constant folding.
     """
-    return _ffi_api.FoldConstant()
+    return _ffi_api.FoldConstant(fold_qnn)
 
 
 def FuseOps(fuse_opt_level=-1):
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index 673a547d2df0..65ef29651695 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <tvm/relay/qnn/transform.h>
 #include
 #include
 #include
@@ -948,7 +949,7 @@ IRModule Prepare(IRModule mod, CompilationConfig config) {
   VirtualDevice host_virtual_device = config->host_virtual_device;
   // Run minimal transforms on module to establish invariants needed by interpreter.
   transform::Sequential seq(
-      {transform::SimplifyInference(),
+      {transform::SimplifyInference(), qnn::transform::Legalize(),
       // Figure out which devices should be used to execute.
      // TODO(mbs): Should ignore all existing annotations when constant folding
       transform::PlanDevices(std::move(config)),
diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc
index c1bbbb331139..9dec840be0a7 100644
--- a/src/relay/transforms/fold_constant.cc
+++ b/src/relay/transforms/fold_constant.cc
@@ -67,8 +67,9 @@ bool IsComplexConstant(const Expr& expr) {
 // or make a more powerful partial evaluator.
 class ConstantFolder : public MixedModeMutator {
  public:
-  explicit ConstantFolder(IRModule module)
+  explicit ConstantFolder(IRModule module, bool fold_qnn)
       : module_(std::move(module)),
+        fold_qnn_(fold_qnn),
         device_copy_op_(Op::Get("device_copy")),
         shape_of_op_(Op::Get("shape_of")),
         vm_shape_of_op_(Op::Get("vm.shape_of")),
@@ -158,8 +159,6 @@ class ConstantFolder : public MixedModeMutator {
       return std::move(pre_call);
     }
 
-    static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational");
-
     const auto* op_node = post_call->op.as<OpNode>();
     if (op_node == nullptr) {
       // Only evaluate primitives.
@@ -182,8 +181,15 @@ class ConstantFolder : public MixedModeMutator {
     if (Optional<Expr> opt_result = EvaluateNdarraySize(pre_call)) {
       return opt_result.value();
     }
-    if ((fnoncomputational.count(op) && fnoncomputational[op]) || op == device_copy_op_ ||
-        op == shape_of_op_ || op == vm_shape_of_op_ || op == ndarray_size_op_) {
+    static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational");
+    static auto qnn_canonicalize = Op::GetAttrMap<FTVMLegalize>("FTVMQnnCanonicalize");
+    bool is_no_qnn_canonicalized = !qnn_canonicalize.count(op);
+    bool is_no_computational = fnoncomputational.count(op) && fnoncomputational[op];
+    if (is_no_computational && (is_no_qnn_canonicalized || !fold_qnn_)) {
+      return std::move(post_call);
+    }
+    if (op == device_copy_op_ || op == shape_of_op_ || op == vm_shape_of_op_ ||
+        op == ndarray_size_op_) {
       // We should think about potentially constant evaluation over these ops too.
       return std::move(post_call);
     }
@@ -387,6 +393,9 @@ class ConstantFolder : public MixedModeMutator {
   // Module
   IRModule module_;
 
+  // Whether to fold constants for QNN operations.
+  bool fold_qnn_;
+
   // The kDLCPU device assumed to be available to the compiler. Used only when evaluating
   // sub-expressions.
   Device eval_cpu_dev_{kDLCPU, /*device_id=*/0};
@@ -417,20 +426,20 @@ TVM_REGISTER_GLOBAL("relay.analysis.check_constant").set_body_typed(IsComplexCon
 * from their p.o.v. Furthermore, this function can be called before conversion to ANF so
 * we must avoid all recursion.
 */
-Expr FoldConstantExpr(const Expr& expr, const IRModule& mod) {
+Expr FoldConstantExpr(const Expr& expr, const IRModule& mod, bool fold_qnn) {
   VLOG_CONTEXT << "FoldConstantExpr";
   VLOG(1) << "folding:" << std::endl << PrettyPrint(expr);
-  Expr result = ConstantFolder(mod).VisitExpr(expr);
+  Expr result = ConstantFolder(mod, fold_qnn).VisitExpr(expr);
   VLOG(1) << "folded to:" << std::endl << PrettyPrint(result);
   return result;
 }
 
 TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstantExpr);
 
-Pass FoldConstant() {
+Pass FoldConstant(bool fold_qnn) {
   runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
       [=](Function f, IRModule m, PassContext pc) {
-        return Downcast<Function>(FoldConstantExpr(f, m));
+        return Downcast<Function>(FoldConstantExpr(f, m, fold_qnn));
       };
   return CreateFunctionPass(pass_func, 2, "FoldConstant", {});
 }
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 298c4f177fd1..8e74f152e0f1 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -370,6 +370,35 @@ def before():
     tvm.ir.assert_structural_equal(run_infer_type(before_mod["main"]), after_mod["main"])
 
 
+def test_fold_qnn_const():
+    qx = relay.var("x", shape=[2, 3], dtype="int8")
+
+    def before():
+        # Quantized INT8 weights
+        qw = relay.const(np.array([[1, 3, 5], [2, 4, 6]], dtype="int8"), "int8")
+        op = relay.op.nn.dense(
+            relay.qnn.op.dequantize(qx, relay.const(2.0), relay.const(0)),
+            relay.qnn.op.dequantize(qw, relay.const(2.0), relay.const(0)),
+        )
+        return relay.Function([qx], op)
+
+    def expected():
+        # FP32 weights
+        w = relay.const(np.array([[2.0, 6.0, 10.0], [4.0, 8.0, 12.0]], dtype="float32"), "float32")
+        op = relay.op.nn.dense(relay.qnn.op.dequantize(qx, relay.const(2.0), relay.const(0)), w)
+        return relay.Function([qx], op)
+
+    # Nothing changed after applying FoldConstant
+    a = run_opt_pass(before(), transform.FoldConstant())
+    b = run_opt_pass(before(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+    # Fold QNN constants
+    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
+    b = run_opt_pass(expected(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+
 def test_pass_link_params():
     """
     This test checks ensures that proper executor is passed to interpreter instance

From 66ddb3eea392f60e8c4bc20252917149b62a14ae Mon Sep 17 00:00:00 2001
From: ibsidorenko
Date: Mon, 9 May 2022 20:21:30 +0300
Subject: [PATCH 2/4] [NFC] Fixed comments

---
 python/tvm/relay/transform/transform.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 2bd071cc058a..1c715f785533 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -269,7 +269,7 @@ def FoldConstantExpr(expr, mod, fold_qnn=False):
         The expression to fold
     mod: IRModule
         The module the expr lives in (for global calls)
-    fskip: bool
+    fold_qnn: bool
         Whether to fold constants for QNN operations.
 
     Returns
@@ -285,7 +285,7 @@ def FoldConstant(fold_qnn=False):
 
     Parameters
     ----------
-    fskip: bool
+    fold_qnn: bool
         Whether to fold constants for QNN operations.
 
     Returns

From b455d849a122011caab91eb112c6bb6c7ad99051 Mon Sep 17 00:00:00 2001
From: ibsidorenko
Date: Wed, 11 May 2022 18:49:55 +0300
Subject: [PATCH 3/4] Added more unit tests for QNN operations in constant folding pass.
---
 tests/python/relay/test_pass_fold_constant.py | 143 ++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 8e74f152e0f1..270759b1f0d6 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -399,6 +399,149 @@ def expected():
     tvm.ir.assert_structural_equal(a, b)
 
 
+def test_fold_quantize():
+    t = relay.TensorType([1, 2, 3], "int8")
+
+    def before():
+        data = tvm.nd.array(np.array([1.0, 2.0, 3.0], dtype="float32"))
+        const_fp = relay.const(data, dtype="float32")
+        const_i8 = relay.qnn.op.quantize(
+            const_fp, output_scale=relay.const(0.5), output_zero_point=relay.const(0)
+        )
+        x = relay.var("x", t)
+        add = relay.op.subtract(x, const_i8)
+        func = relay.Function([x], add)
+        return func
+
+    def expected():
+        data = tvm.nd.array(np.array([2, 4, 6], dtype="int8"))
+        const_i8 = relay.const(data, dtype="int8")
+        x = relay.var("x", t)
+        add = relay.op.subtract(x, const_i8)
+        func = relay.Function([x], add)
+        return func
+
+    # Nothing changed after applying FoldConstant
+    a = run_opt_pass(before(), transform.FoldConstant())
+    b = run_opt_pass(before(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+    # Fold QNN constants
+    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
+    b = run_opt_pass(expected(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+
+def test_fold_qnn_add():
+    dtype = "uint8"
+
+    def before():
+        add = relay.qnn.op.add(
+            relay.const(np.ones((2, 3), dtype=dtype), dtype=dtype),
+            relay.const(np.ones((2, 3), dtype=dtype), dtype=dtype),
+            relay.const(2.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+            relay.const(2.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+            relay.const(1.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+        )
+        func = relay.Function([], add)
+        return func
+
+    def expected():
+        data = relay.const(np.array([[4, 4, 4], [4, 4, 4]], dtype=dtype), dtype)
+        func = relay.Function([], data)
+        return func
+
+    # Nothing changed after applying FoldConstant
+    a = run_opt_pass(before(), transform.FoldConstant())
+    b = run_opt_pass(before(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+    # Fold QNN constants
+    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
+    b = run_opt_pass(expected(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+
+def test_fold_qnn_conv2d_qnn_mul():
+    def before():
+        dtype = "uint8"
+        op0 = relay.qnn.op.conv2d(
+            relay.const(np.ones((1, 1, 2, 2), dtype=dtype), dtype=dtype),
+            relay.const(np.ones((1, 1, 2, 2), dtype=dtype), dtype=dtype),
+            input_zero_point=relay.const(0, "int32"),
+            kernel_zero_point=relay.const(0, "int32"),
+            input_scale=relay.const(1.0, "float32"),
+            kernel_scale=relay.const(1.0, "float32"),
+            kernel_size=(2, 2),
+            channels=1,
+        )
+        op = relay.qnn.op.mul(
+            op0,
+            relay.const(np.array([10], dtype="int32"), dtype="int32"),
+            relay.const(1.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+            relay.const(1.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+            relay.const(1.0, dtype="float32"),
+            relay.const(0, dtype="int32"),
+        )
+        func = relay.Function([], op)
+        return func
+
+    def expected():
+        data = relay.const(np.array([[[[40]]]], dtype="int32"), dtype="int32")
+        func = relay.Function([], data)
+        return func
+
+    # Nothing changed after applying FoldConstant
+    a = run_opt_pass(before(), transform.FoldConstant())
+    b = run_opt_pass(before(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+    # Fold QNN constants
+    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
+    b = run_opt_pass(expected(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+
+def test_fold_requantize():
+    def before():
+        data = tvm.nd.array(np.array([1, 2, 3], dtype="int8"))
+        const_i8 = relay.const(data, dtype="int8")
+        op = relay.qnn.op.requantize(
+            const_i8,
+            input_scale=relay.const(2.0, dtype="float32"),
+            input_zero_point=relay.const(1, dtype="int32"),
+            output_scale=relay.const(1.0, dtype="float32"),
+            output_zero_point=relay.const(1, dtype="int32"),
+        )
+        x = relay.var("x", relay.TensorType([3], "int8"))
+        add = relay.op.add(op, x)
+        func = relay.Function([x], add)
+        return func
+
+    def expected():
+        data = tvm.nd.array(np.array([1, 3, 5], dtype="int8"))
+        const_i8 = relay.const(data, dtype="int8")
+        x = relay.var("x", relay.TensorType([3], "int8"))
+        add = relay.op.add(const_i8, x)
+        func = relay.Function([x], add)
+        return func
+
+    # Nothing changed after applying FoldConstant
+    a = run_opt_pass(before(), transform.FoldConstant())
+    b = run_opt_pass(before(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+    # Fold QNN constants
+    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
+    b = run_opt_pass(expected(), transform.InferType())
+    tvm.ir.assert_structural_equal(a, b)
+
+
 def test_pass_link_params():
     """
     This test checks ensures that proper executor is passed to interpreter instance

From e3d9f6da2a12866949391a403ca7b7c06063b667 Mon Sep 17 00:00:00 2001
From: ibsidorenko
Date: Fri, 13 May 2022 13:48:19 +0300
Subject: [PATCH 4/4] Address PR feedback

---
 include/tvm/relay/transform.h                 |   6 +
 python/tvm/relay/transform/transform.py       |   6 +
 tests/python/relay/test_pass_fold_constant.py | 109 ++++++++++--------
 3 files changed, 72 insertions(+), 49 deletions(-)

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 13162d804620..0d518e4ed547 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -105,6 +105,12 @@ TVM_DLL Pass LazyGradientInit();
 /*!
  * \brief Fold constant expressions.
  *
+ * For backward compatibility this pass skips QNN primitives by default. Some transformation
+ * passes, such as FakeQuantizationToInteger, require QNN primitives to be kept for constant
+ * subgraphs, and uncontrolled constant folding of QNN primitives may make
+ * FakeQuantizationToInteger inapplicable. We suggest using FoldConstant with the non-default
+ * fold_qnn=True value only after all other QNN-sensitive passes have already been applied.
+ *
  * \param fold_qnn Whether to fold constants for QNN operations.
  *
  * \return The pass.
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 1c715f785533..9f253f8e88ba 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -283,6 +283,12 @@ def FoldConstantExpr(expr, mod, fold_qnn=False):
 def FoldConstant(fold_qnn=False):
     """Fold the constant expressions in a Relay program.
 
+    For backward compatibility this pass skips QNN primitives by default. Some transformation
+    passes, such as FakeQuantizationToInteger, require QNN primitives to be kept for constant
+    subgraphs, and uncontrolled constant folding of QNN primitives may make
+    FakeQuantizationToInteger inapplicable. We suggest using FoldConstant with the non-default
+    fold_qnn=True value only after all other QNN-sensitive passes have already been applied.
+
     Parameters
     ----------
     fold_qnn: bool
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
index 270759b1f0d6..e7235d6fcfd2 100644
--- a/tests/python/relay/test_pass_fold_constant.py
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -371,22 +371,66 @@ def before():
 
 
 def test_fold_qnn_const():
-    qx = relay.var("x", shape=[2, 3], dtype="int8")
-
     def before():
-        # Quantized INT8 weights
-        qw = relay.const(np.array([[1, 3, 5], [2, 4, 6]], dtype="int8"), "int8")
-        op = relay.op.nn.dense(
-            relay.qnn.op.dequantize(qx, relay.const(2.0), relay.const(0)),
-            relay.qnn.op.dequantize(qw, relay.const(2.0), relay.const(0)),
+        # QNN op with 2 constant arguments.
+        add = relay.qnn.op.add(
+            relay.const(np.ones((2, 3), dtype="uint8"), dtype="uint8"),
+            relay.const(np.ones((2, 3), dtype="uint8"), dtype="uint8"),
+            lhs_scale=relay.const(2.0),
+            lhs_zero_point=relay.const(0),
+            rhs_scale=relay.const(2.0),
+            rhs_zero_point=relay.const(0),
+            output_scale=relay.const(1.0),
+            output_zero_point=relay.const(0),
+        )
+        # QNN op with 1 constant and 1 non-constant arguments.
+        a = relay.var("a", shape=[2, 3], dtype="float32")
+        dense = relay.qnn.op.dense(
+            relay.qnn.op.quantize(a, relay.const(1.0), relay.const(0)),
+            add,
+            input_zero_point=relay.const(0),
+            kernel_zero_point=relay.const(0),
+            input_scale=relay.const(2.0),
+            kernel_scale=relay.const(2.0),
+            units=None,
         )
-        return relay.Function([qx], op)
+        # QNN op with 2 non-constant arguments.
+        b = relay.var("b", shape=[2], dtype="float32")
+        bias = relay.qnn.op.add(
+            dense,
+            relay.qnn.op.quantize(b, relay.const(1.0), relay.const(0), out_dtype="int32"),
+            lhs_scale=relay.const(2.0),
+            lhs_zero_point=relay.const(0),
+            rhs_scale=relay.const(2.0),
+            rhs_zero_point=relay.const(0),
+            output_scale=relay.const(1.0),
+            output_zero_point=relay.const(0),
+        )
+        return relay.Function([a, b], bias)
 
     def expected():
-        # FP32 weights
-        w = relay.const(np.array([[2.0, 6.0, 10.0], [4.0, 8.0, 12.0]], dtype="float32"), "float32")
-        op = relay.op.nn.dense(relay.qnn.op.dequantize(qx, relay.const(2.0), relay.const(0)), w)
-        return relay.Function([qx], op)
+        a = relay.var("a", shape=[2, 3], dtype="float32")
+        dense = relay.qnn.op.dense(
+            relay.qnn.op.quantize(a, relay.const(1.0), relay.const(0)),
+            relay.const(np.array([[4, 4, 4], [4, 4, 4]], dtype="uint8"), dtype="uint8"),
+            input_zero_point=relay.const(0),
+            kernel_zero_point=relay.const(0),
+            input_scale=relay.const(2.0),
+            kernel_scale=relay.const(2.0),
+            units=None,
+        )
+        b = relay.var("b", shape=[2], dtype="float32")
+        bias = relay.qnn.op.add(
+            dense,
+            relay.qnn.op.quantize(b, relay.const(1.0), relay.const(0), out_dtype="int32"),
+            lhs_scale=relay.const(2.0),
+            lhs_zero_point=relay.const(0),
+            rhs_scale=relay.const(2.0),
+            rhs_zero_point=relay.const(0),
+            output_scale=relay.const(1.0),
+            output_zero_point=relay.const(0),
+        )
+        return relay.Function([a, b], bias)
 
     # Nothing changed after applying FoldConstant
     a = run_opt_pass(before(), transform.FoldConstant())
@@ -409,49 +453,16 @@ def before():
             const_fp, output_scale=relay.const(0.5), output_zero_point=relay.const(0)
         )
         x = relay.var("x", t)
-        add = relay.op.subtract(x, const_i8)
-        func = relay.Function([x], add)
+        sub = relay.op.subtract(x, const_i8)
+        func = relay.Function([x], sub)
         return func
 
     def expected():
         data = tvm.nd.array(np.array([2, 4, 6], dtype="int8"))
         const_i8 = relay.const(data, dtype="int8")
         x = relay.var("x", t)
-        add = relay.op.subtract(x, const_i8)
-        func = relay.Function([x], add)
+        sub = relay.op.subtract(x, const_i8)
+        func = relay.Function([x], sub)
         return func
-
-    # Nothing changed after applying FoldConstant
-    a = run_opt_pass(before(), transform.FoldConstant())
-    b = run_opt_pass(before(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-    # Fold QNN constants
-    a = run_opt_pass(before(), transform.FoldConstant(fold_qnn=True))
-    b = run_opt_pass(expected(), transform.InferType())
-    tvm.ir.assert_structural_equal(a, b)
-
-
-def test_fold_qnn_add():
-    dtype = "uint8"
-
-    def before():
-        add = relay.qnn.op.add(
-            relay.const(np.ones((2, 3), dtype=dtype), dtype=dtype),
-            relay.const(np.ones((2, 3), dtype=dtype), dtype=dtype),
-            relay.const(2.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-            relay.const(2.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-            relay.const(1.0, dtype="float32"),
-            relay.const(0, dtype="int32"),
-        )
-        func = relay.Function([], add)
-        return func
-
-    def expected():
-        data = relay.const(np.array([[4, 4, 4], [4, 4, 4]], dtype=dtype), dtype)
-        func = relay.Function([], data)
+        sub = relay.op.subtract(x, const_i8)
+        func = relay.Function([x], sub)
         return func
 
     # Nothing changed after applying FoldConstant
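
A minimal usage sketch of the fold_qnn flag introduced by this series follows. It is not part of the patches: the graph, shapes, and constant values are illustrative assumptions, and it presumes a TVM build that already contains these changes. Only the FoldConstant(fold_qnn=...) behaviour itself comes from the patches above.

# Sketch: a dequantized constant weight is left intact by the default FoldConstant()
# and folded into a float32 constant when fold_qnn=True (similar in spirit to the
# test_fold_qnn_const test added in patch 1/4).
import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(2, 3), dtype="float32")
qw = relay.const(np.array([[1, 3, 5], [2, 4, 6]], dtype="int8"), "int8")
# dequantize(const) is a QNN primitive, so the default pass does not touch it.
w_fp32 = relay.qnn.op.dequantize(qw, relay.const(2.0), relay.const(0))
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.dense(x, w_fp32)))
mod = relay.transform.InferType()(mod)

kept = relay.transform.FoldConstant()(mod)                 # QNN dequantize preserved
folded = relay.transform.FoldConstant(fold_qnn=True)(mod)  # folded into fp32 weights
print(folded)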
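The doc comment added in patch 4/4 recommends running fold_qnn=True only after QNN-sensitive passes. A hedged sketch of that ordering is below; the pass names are existing TVM passes, but the exact pipeline is an assumption rather than something prescribed by this series, and it reuses the mod built in the previous sketch.

# Sketch of the suggested ordering: QNN-sensitive passes (e.g. FakeQuantizationToInteger)
# run first while QNN primitives are still present, then QNN constants are folded.
from tvm import relay, transform

seq = transform.Sequential(
    [
        relay.transform.InferType(),
        relay.transform.FakeQuantizationToInteger(),  # needs QNN primitives intact
        relay.transform.FoldConstant(fold_qnn=True),  # safe to fold QNN constants now
    ]
)
mod = seq(mod)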