diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index e43097b95180..bc25b2df2715 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -218,6 +218,7 @@ struct ArmIntrinsic { MangleRetArgs = 1 << 4, // Most intrinsics only mangle the return type. Some mangle the return type and arguments instead. ScalarsAreVectors = 1 << 5, // Some intrinsics have scalar arguments that are vector parameters :( SplitArg0 = 1 << 6, // This intrinsic requires splitting the argument into the low and high halves. + NoPrefix = 1 << 7, // Don't prefix the intrinsic with llvm.* }; }; @@ -546,6 +547,18 @@ const ArmIntrinsic intrinsic_defs[] = { {nullptr, "sdot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle}, {nullptr, "udot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, {nullptr, "udot.v4i32.v16i8", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, + + // ABDL - Widening absolute difference + // Need to be able to handle both signed and unsigned outputs for signed inputs. + {"vabdl_i8x8", "vabdl_i8x8", Int(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i8x8", "vabdl_i8x8", UInt(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_u8x8", "vabdl_u8x8", UInt(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i16x4", "vabdl_i16x4", Int(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i16x4", "vabdl_i16x4", UInt(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_u16x4", "vabdl_u16x4", UInt(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i32x2", "vabdl_i32x2", Int(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i32x2", "vabdl_i32x2", UInt(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_u32x2", "vabdl_u32x2", UInt(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, }; // clang-format on @@ -606,7 +619,7 @@ void CodeGen_ARM::init_module() { continue; } string full_name = intrin_name; - if (!starts_with(full_name, "llvm.")) { + if (!starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoPrefix) == 0) { full_name = prefix + full_name; } @@ -671,6 +684,8 @@ void CodeGen_ARM::init_module() { intrin_impl = get_llvm_intrin(ret_type, mangled_name, arg_types, scalars_are_vectors); } + intrin_impl->addFnAttr(llvm::Attribute::ReadNone); + intrin_impl->addFnAttr(llvm::Attribute::NoUnwind); declare_intrin_overload(intrin.name, ret_type, intrin_impl, arg_types); if (intrin.flags & ArmIntrinsic::AllowUnsignedOp1) { // Also generate a version of this intrinsic where the second operand is unsigned. @@ -711,10 +726,7 @@ void CodeGen_ARM::visit(const Cast *op) { (op->value.type().is_int() || op->value.type().is_uint()) && t.bits() == op->value.type().bits() * 2) { if (const Call *absd = Call::as_intrinsic(op->value, {Call::absd})) { - ostringstream ss; - int intrin_lanes = 128 / t.bits(); - ss << "vabdl_" << (absd->args[0].type().is_int() ? "i" : "u") << t.bits() / 2 << "x" << intrin_lanes; - value = call_intrin(t, intrin_lanes, ss.str(), absd->args); + value = call_overloaded_intrin(t, "widening_absd", absd->args); return; } } diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 1f033fdcc33c..91d13b13596d 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -89,7 +89,6 @@ class CodeGen_Hexagon : public CodeGen_Posix { * return null if the maybe option is true and the intrinsic is * not found. */ ///@{ - using CodeGen_LLVM::call_intrin; llvm::Value *call_intrin(Type t, const std::string &name, std::vector, bool maybe = false); llvm::Value *call_intrin(llvm::Type *t, const std::string &name, @@ -1791,8 +1790,10 @@ Value *CodeGen_Hexagon::call_intrin(Type result_type, const string &name, fn = fn2; } } - return call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), - get_llvm_function_name(fn), std::move(args)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); + return CodeGen_Posix::call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), + fn, std::move(args)); } Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name, @@ -1812,8 +1813,10 @@ Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name, fn = fn2; } } - return call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), - get_llvm_function_name(fn), std::move(args)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); + return CodeGen_Posix::call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), + fn, std::move(args)); } string CodeGen_Hexagon::mcpu() const { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index eda89aca82da..7da41a7211fa 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -4661,10 +4661,11 @@ llvm::Function *CodeGen_LLVM::get_llvm_intrin(const Type &ret_type, const std::s return get_llvm_intrin(llvm_ret_type, name, llvm_arg_types); } -void CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors) { +llvm::Function *CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors) { llvm::Function *intrin = get_llvm_intrin(ret_type, impl_name, arg_types, scalars_are_vectors); internal_assert(intrin); intrinsics[name].emplace_back(ret_type, std::move(arg_types), intrin); + return intrin; } void CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector arg_types) { @@ -4893,10 +4894,6 @@ Value *CodeGen_LLVM::call_intrin(llvm::Type *result_type, int intrin_lanes, } CallInst *call = builder->CreateCall(intrin, arg_values); - - call->setDoesNotAccessMemory(); - call->setDoesNotThrow(); - return call; } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 092bc7713b5b..54a9f393fd8e 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -446,7 +446,7 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector &arg_types, bool scalars_are_vectors = false); llvm::Function *get_llvm_intrin(llvm::Type *ret_type, const std::string &name, const std::vector &arg_types); /** Declare an intrinsic function that participates in overload resolution. */ - void declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors = false); + llvm::Function *declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors = false); void declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector arg_types); /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. */ llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args); diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 26822cda2296..3b354e51e342 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -228,14 +228,29 @@ void CodeGen_PTX_Dev::init_module() { module = get_initial_module_for_ptx_device(target, context); - declare_intrin_overload("dp4a", Int(32), "dp4a_s32_s32", {Int(8, 4), Int(8, 4), Int(32)}); - declare_intrin_overload("dp4a", Int(32), "dp4a_s32_u32", {Int(8, 4), UInt(8, 4), Int(32)}); - declare_intrin_overload("dp4a", Int(32), "dp4a_u32_s32", {UInt(8, 4), Int(8, 4), Int(32)}); - declare_intrin_overload("dp4a", UInt(32), "dp4a_u32_u32", {UInt(8, 4), UInt(8, 4), UInt(32)}); - declare_intrin_overload("dp2a", Int(32), "dp2a_s32_s32", {Int(16, 4), Int(8, 4), Int(32)}); - declare_intrin_overload("dp2a", Int(32), "dp2a_s32_u32", {Int(16, 4), UInt(8, 4), Int(32)}); - declare_intrin_overload("dp2a", Int(32), "dp2a_u32_s32", {UInt(16, 4), Int(8, 4), Int(32)}); - declare_intrin_overload("dp2a", UInt(32), "dp2a_u32_u32", {UInt(16, 4), UInt(8, 4), UInt(32)}); + struct Intrinsic { + const char *name; + Type ret_type; + const char *intrin_name; + vector arg_types; + }; + + Intrinsic ptx_intrins[] = { + {"dp4a", Int(32), "dp4a_s32_s32", {Int(8, 4), Int(8, 4), Int(32)}}, + {"dp4a", Int(32), "dp4a_s32_u32", {Int(8, 4), UInt(8, 4), Int(32)}}, + {"dp4a", Int(32), "dp4a_u32_s32", {UInt(8, 4), Int(8, 4), Int(32)}}, + {"dp4a", UInt(32), "dp4a_u32_u32", {UInt(8, 4), UInt(8, 4), UInt(32)}}, + {"dp2a", Int(32), "dp2a_s32_s32", {Int(16, 4), Int(8, 4), Int(32)}}, + {"dp2a", Int(32), "dp2a_s32_u32", {Int(16, 4), UInt(8, 4), Int(32)}}, + {"dp2a", Int(32), "dp2a_u32_s32", {UInt(16, 4), Int(8, 4), Int(32)}}, + {"dp2a", UInt(32), "dp2a_u32_u32", {UInt(16, 4), UInt(8, 4), UInt(32)}}, + }; + + for (auto &&i : ptx_intrins) { + auto *fn = declare_intrin_overload(i.name, i.ret_type, i.intrin_name, std::move(i.arg_types)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); + } } void CodeGen_PTX_Dev::visit(const Call *op) { diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp index 312649280f62..42dec77fd75d 100644 --- a/src/CodeGen_PowerPC.cpp +++ b/src/CodeGen_PowerPC.cpp @@ -1,5 +1,7 @@ #include "CodeGen_Posix.h" +#include "LLVM_Headers.h" + namespace Halide { namespace Internal { @@ -113,7 +115,9 @@ void CodeGen_PowerPC::init_module() { arg_types.emplace_back(j); } - declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + auto *fn = declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); } } diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index fe7ee2fe6db0..93183780ccf8 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -1,5 +1,6 @@ #include "CodeGen_Posix.h" +#include "LLVM_Headers.h" #include namespace Halide { @@ -87,7 +88,9 @@ void CodeGen_WebAssembly::init_module() { arg_types.emplace_back(i); } - declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + auto *fn = declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); } } diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index fbfc4881e70d..2038dcce75c8 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -205,7 +205,9 @@ void CodeGen_X86::init_module() { arg_types.emplace_back(j); } - declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + auto *fn = declare_intrin_overload(i.name, ret_type, i.intrin_name, std::move(arg_types)); + fn->addFnAttr(llvm::Attribute::ReadNone); + fn->addFnAttr(llvm::Attribute::NoUnwind); } }