diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 119d0f7fd339..2a06856fea89 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -110,9 +110,10 @@ class NDArray : public ObjectRef { /*! * \brief Copy the data to another device. * \param dev The target device. + * \param mem_scope The memory scope of the target array. * \return The array under another device. */ - inline NDArray CopyTo(const Device& dev) const; + inline NDArray CopyTo(const Device& dev, Optional mem_scope = NullOpt) const; /*! * \brief Load NDArray from stream * \param stream The input data stream @@ -398,10 +399,11 @@ inline void NDArray::CopyTo(const NDArray& other) const { CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor)); } -inline NDArray NDArray::CopyTo(const Device& dev) const { +inline NDArray NDArray::CopyTo(const Device& dev, Optional mem_scope) const { ICHECK(data_ != nullptr); const DLTensor* dptr = operator->(); - NDArray ret = Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev); + NDArray ret = + Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev, mem_scope); this->CopyTo(ret); return ret; } diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h index 2fe855f9645b..637c1e70a79f 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -157,6 +157,8 @@ struct Instruction { struct /* LoadConst Operands */ { /* \brief The index into the constant pool. */ Index const_index; + /*! \brief The index of the device on which the load will be made. */ + Index device_index; }; struct /* LoadConsti Operands */ { /* \brief The index into the constant pool. */ @@ -195,12 +197,18 @@ struct Instruction { RegName* free_vars; }; struct /* AllocStorage Operands */ { - /*! \brief The size of the allocation. */ - RegName allocation_size; /*! \brief The alignment of the allocation. */ Index alignment; /*! 
\brief The hint of the dtype. */ DLDataType dtype_hint; + /*! \brief The number of dimensions. */ + uint32_t ndim; + union { + /*! \brief The shape of tensor. */ + int64_t* shape; + /*! \brief The size of the allocation. */ + RegName allocation_size; + }; /*! \brief The index of the device on which the allocation will be made. */ Index device_index; } alloc_storage; @@ -332,10 +340,11 @@ struct Instruction { /*! * \brief Construct a load constant instruction. * \param const_index The index of the constant. + * \param device_index The index of the device to load on. * \param dst The destination register. * \return The load constant instruction. */ - static Instruction LoadConst(Index const_index, RegName dst); + static Instruction LoadConst(Index const_index, Index device_index, RegName dst); /*! * \brief Construct a load_constanti instruction. * \param val The interger constant value. @@ -356,11 +365,13 @@ struct Instruction { * \param alignment The allocation's alignment. * \param dtype_hint The data type hint for the allocator. * \param device_index The index of the device to allocate on. + * \param shape The shape of the allocation. * \param dst The destination to place the storage. * \return The alloc storage instruction. */ static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint, - Index device_index, RegName dst); + Index device_index, const std::vector& shape, + RegName dst); /*! * \brief Get the shape of an input tensor. * \param tensor The input tensor. diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 071484740074..d4872837b0c6 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace tvm { @@ -262,9 +263,9 @@ class TVM_DLL Executable : public ModuleNode { /*! * \brief The (compile-time, virtual) devices corresponding to each device index. 
- * Currently we only support at most one device per device type. + * This vector contains a pair Device and its memory_scope. */ - std::vector virtual_devices; + std::vector> virtual_devices; /*! * \brief The device index corresponding to the 'host' device. That will hold and evaluate * shape-related data and code. diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index c5b6c7f2f040..848c23eba63b 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -352,19 +352,6 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { return 0; } - // However, otherwise we allow at most one VirtualDevice per device type. - // TODO(mbs): This will eventually need to account for memory scopes somehow so device_copy - // instructions can do the right thing. - itr = std::find_if(context_->virtual_devices_.begin() + 1, context_->virtual_devices_.end(), - [&virtual_device](const VirtualDevice& existing_virtual_device) { - return existing_virtual_device->device_type() == - virtual_device->device_type(); - }); - CHECK(itr == context_->virtual_devices_.end()) - << "The VM does not currently support using more than one device with the same device type " - "for primitives, however the program is using the distinct scopes " - << virtual_device << " and " << *itr << " of device type " << virtual_device->device_type(); - ICHECK(virtual_device != host_virtual_device_); Index index = context_->virtual_devices_.size(); VLOG(2) << "virtual_device[" << index << "] = " << virtual_device; @@ -384,7 +371,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { VLOG(2) << "constant[" << const_index << "] on device[" << device_index << "]"; context_->const_device_indexes.push_back(device_index); context_->constants.push_back(const_node->data); - Emit(Instruction::LoadConst(const_index, NewRegister())); + Emit(Instruction::LoadConst(const_index, device_index, NewRegister())); } void VisitExpr_(const VarNode* var_node) final { @@ -602,13 
+589,21 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { }) .Match("memory.alloc_storage", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { - ICHECK_EQ(args.size(), 2); + ICHECK_EQ(args.size(), 3); // Compute the size of the allocation. this->VisitExpr(args[0]); auto size_register = last_register_; - ICHECK(args[1].as()); // Always a literal. - NDArray alignment_arr = args[1].as()->data; + auto const_shape = AsIgnoringOnDevice(args[1]); + std::vector raw_shape; + if (const_shape) { + NDArray shape = const_shape->data; + // TODO(@jroesch): we need to get an RFC done to standardize shape dtype + raw_shape = ToAllocTensorShape(shape); + } + + ICHECK(args[2].as()); // Always a literal. + NDArray alignment_arr = args[2].as()->data; ICHECK_EQ(alignment_arr->dtype.code, 0U) << "The dtype of constant shape must be int32 or int64, but got " << DLDataType2String(alignment_arr->dtype); @@ -622,7 +617,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { Emit(Instruction::AllocStorage(size_register, alignment, dtype, GetDeviceIndex(alloc_attrs->virtual_device), - NewRegister())); + raw_shape, NewRegister())); }) .Match("vm.shape_of", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { @@ -739,7 +734,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { /*! * \brief Compile a match value - * Generate byte code that compute the value specificed in val + * Generate byte code that compute the value specified in val * * \return The register number assigned for the final value */ @@ -946,9 +941,10 @@ void VMCompiler::LowerImpl(IRModule mod) { for (const auto& virtual_device : context_.virtual_devices_) { ICHECK(!virtual_device->IsFullyUnconstrained()); ICHECK_GT(virtual_device->device_type(), 0); - // TODO(mbs): We forget the memory scope. 
- exec_->virtual_devices.push_back(Device{/*device_type=*/virtual_device->device_type(), - /*device_id=*/virtual_device->virtual_device_id}); + exec_->virtual_devices.push_back( + std::make_pair(Device{/*device_type=*/virtual_device->device_type(), + /*device_id=*/virtual_device->virtual_device_id}, + virtual_device->memory_scope)); } exec_->host_device_index = kHostDeviceIndex; @@ -1068,6 +1064,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) { } pass_seqs.push_back(transform::FuseOps()); + pass_seqs.push_back(transform::AnnotateMemoryScope()); // Do layout rewrite for auto-scheduler. transform::PassContext pass_ctx = PassContext::Current(); diff --git a/src/relay/backend/vm/manifest_lifetimes.cc b/src/relay/backend/vm/manifest_lifetimes.cc index 7028c88f2e15..892648d67844 100644 --- a/src/relay/backend/vm/manifest_lifetimes.cc +++ b/src/relay/backend/vm/manifest_lifetimes.cc @@ -167,7 +167,9 @@ class AliasEliminator : public MixedModeMutator { if (copy_props.src_virtual_device->device_type() == copy_props.dst_virtual_device->device_type() && copy_props.src_virtual_device->virtual_device_id == - copy_props.dst_virtual_device->virtual_device_id) { + copy_props.dst_virtual_device->virtual_device_id && + copy_props.src_virtual_device->memory_scope == + copy_props.dst_virtual_device->memory_scope) { Expr to_copy = Downcast(unwrapped)->args[0]; if (const VarNode* alias_of_n = to_copy.as()) { alias_[var] = Downcast(VisitExpr_(alias_of_n)); diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 65351562054e..008dbff841f3 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -50,25 +50,32 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs); // The passing value in attrs and args doesn't seem super great. // We should consider a better solution, i.e the type relation // being able to see the arguments as well? 
-Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint) { +Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device, + DataType dtype_hint) { auto attrs = make_object(); attrs->dtype = dtype_hint; attrs->virtual_device = std::move(virtual_device); static const Op& op = Op::Get("memory.alloc_storage"); - return Call(op, {std::move(size), std::move(alignment)}, Attrs(std::move(attrs)), {}); + return Call(op, {std::move(size), std::move(shape), std::move(alignment)}, + Attrs(std::move(attrs)), {}); } TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage); bool AllocStorageRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - ICHECK_EQ(types.size(), 3u); + ICHECK_EQ(types.size(), 4u); auto size_type = types[0]; auto tensor_type = size_type.as(); ICHECK(tensor_type != nullptr); ICHECK_EQ(tensor_type->dtype, DataType::Int(64)); ICHECK_EQ(tensor_type->shape.size(), 0); - auto align_type = types[1]; + + // Tensor shape + auto tt = types[1].as(); + ICHECK(tt != nullptr) << "must be tensor type"; + + auto align_type = types[2]; auto align_ttype = align_type.as(); ICHECK(align_ttype != nullptr); ICHECK_EQ(align_ttype->dtype, DataType::Int(64)); @@ -77,14 +84,15 @@ bool AllocStorageRel(const Array& types, int num_inputs, const Attrs& attr ICHECK(mod.defined()); auto storage_name = mod->GetGlobalTypeVar("Storage"); auto storage = TypeCall(storage_name, {}); - reporter->Assign(types[2], storage); + reporter->Assign(types[3], storage); return true; } RELAY_REGISTER_OP("memory.alloc_storage") .describe(R"code(Explicitly allocate storage to be used by tensors.)code" TVM_ADD_FILELINE) - .set_num_inputs(2) + .set_num_inputs(3) .add_argument("size", "Tensor", "The size of the storage to allocate.") + .add_argument("shape", "Tensor", "The shape of the storage to allocate.") .add_argument("alignment", "Tensor", "The alignment of the storage.") 
.add_type_rel("AllocStorage", AllocStorageRel) .set_attrs_type_key("relay.attrs.AllocStorageAttrs") diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h index 690854c38293..5533553393ec 100644 --- a/src/relay/op/memory/memory.h +++ b/src/relay/op/memory/memory.h @@ -34,10 +34,11 @@ namespace tvm { namespace relay { -Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint); +Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device, + DataType dtype_hint); /*! \brief Returns the "memory.alloc_tensor" operator. */ const Op& MemoryAllocTensorOp(); -Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, +Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype, Array assert_shape); Expr ToTupleType(const Type& ty, const std::vector& exprs); std::vector FromTupleType(const Type& type, const Expr& expr); diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index d3748449adaa..4921cef4c8c2 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -407,6 +407,15 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { if (pattern <= kCommReduce) { if (const auto* ttype = call->checked_type().as()) { if (ttype->shape.size() == 5) { + auto node0 = ttype->shape[0].as(); + auto node1 = ttype->shape[1].as(); + auto node2 = ttype->shape[2].as(); + auto node3 = ttype->shape[3].as(); + auto node4 = ttype->shape[4].as(); + // if tensor has any dimension then textures are not supported + if (!node0 || !node1 || !node2 || !node3 || !node4) { + return false; + } supports_texture_storage = true; } } diff --git a/src/relay/transforms/device_domains.cc b/src/relay/transforms/device_domains.cc index e7d3a65dfe68..e2af20022a40 100644 --- a/src/relay/transforms/device_domains.cc +++ b/src/relay/transforms/device_domains.cc @@ -236,12 
+236,13 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { args_and_result.emplace_back(ForVirtualDevice(device_copy_props.body->checked_type(), device_copy_props.dst_virtual_device)); } else if (call->op == alloc_storage_op) { - ICHECK_EQ(call->args.size(), 2U); - // alloc_storage(size, alignment, virtual_device=) - // alloc_storage: fn(, ): + ICHECK_EQ(call->args.size(), 3U); + // alloc_storage(size, shape, alignment, virtual_device=) + // alloc_storage: fn(, , ): const auto* attrs = call->attrs.as(); args_and_result.emplace_back(host_domain_); args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(host_domain_); args_and_result.emplace_back(ForVirtualDevice(call->checked_type(), attrs->virtual_device)); } else if (call->op == alloc_tensor_op) { ICHECK_EQ(call->args.size(), 3U); diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 5b584e199dc7..fcf8a784a9e7 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -260,7 +260,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { Expr alignment = ComputeAlignment(type->dtype); // Run type inference later to get the correct type. Var var("storage_" + name_hint, Type(nullptr)); - Expr value = AllocStorage(size, alignment, virtual_device, type->dtype); + Expr value = AllocStorage(size, shape, alignment, virtual_device, type->dtype); auto sto = scope->Push(var, MaybeOnDeviceFixed(value, virtual_device)); // TODO(@jroesch): There is a bug with typing based on the constant shape. @@ -366,7 +366,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { // Alignment is directly captured in the instruction so don't wrap in "on_device". 
auto alignment = ComputeAlignment(out_type->dtype); Var sto_var("storage_" + std::to_string(i), Type(nullptr)); - auto val = AllocStorage(size, alignment, virtual_device, out_type->dtype); + auto val = AllocStorage(size, out_shape, alignment, virtual_device, out_type->dtype); storages.push_back(scope->Push(sto_var, MaybeOnDeviceFixed(val, virtual_device))); } diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 0132e9009c15..d7739b7b2225 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -152,7 +152,7 @@ static size_t GetDataAlignment(const DLDataType dtype) { void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { - if (!mem_scope.defined() || mem_scope.value() == "global") { + if (!mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global") { // by default, we can always redirect to the flat memory allocations DLTensor temp; temp.data = nullptr; diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 0d1f4af2bbf1..35e77eb6d1f6 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -239,7 +239,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { - if (!mem_scope.defined() || mem_scope.value() == "global") { + if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") { return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } ICHECK(IsTextureStorage(std::string(mem_scope.value()))) diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index 424dfe87c79b..dc52e8c8f01d 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -99,6 +99,7 @@ Instruction::Instruction(const Instruction& instr) { return; case 
Opcode::LoadConst: this->const_index = instr.const_index; + this->device_index = instr.device_index; return; case Opcode::LoadConsti: this->load_consti = instr.load_consti; @@ -114,7 +115,15 @@ Instruction::Instruction(const Instruction& instr) { this->pc_offset = instr.pc_offset; return; case Opcode::AllocStorage: - this->alloc_storage = instr.alloc_storage; + this->alloc_storage.allocation_size = instr.alloc_storage.allocation_size; + this->alloc_storage.alignment = instr.alloc_storage.alignment; + this->alloc_storage.dtype_hint = instr.alloc_storage.dtype_hint; + this->alloc_storage.device_index = instr.alloc_storage.device_index; + this->alloc_storage.ndim = instr.alloc_storage.ndim; + if (this->alloc_storage.ndim > 0) { + this->alloc_storage.shape = + Duplicate(instr.alloc_storage.shape, instr.alloc_storage.ndim); + } return; case Opcode::ShapeOf: this->shape_of.tensor = instr.shape_of.tensor; @@ -207,6 +216,7 @@ Instruction& Instruction::operator=(const Instruction& instr) { return *this; case Opcode::LoadConst: this->const_index = instr.const_index; + this->device_index = instr.device_index; return *this; case Opcode::GetField: this->object = instr.object; @@ -219,7 +229,15 @@ Instruction& Instruction::operator=(const Instruction& instr) { this->pc_offset = instr.pc_offset; return *this; case Opcode::AllocStorage: - this->alloc_storage = instr.alloc_storage; + this->alloc_storage.allocation_size = instr.alloc_storage.allocation_size; + this->alloc_storage.alignment = instr.alloc_storage.alignment; + this->alloc_storage.dtype_hint = instr.alloc_storage.dtype_hint; + this->alloc_storage.device_index = instr.alloc_storage.device_index; + this->alloc_storage.ndim = instr.alloc_storage.ndim; + if (this->alloc_storage.ndim > 0) { + this->alloc_storage.shape = + Duplicate(instr.alloc_storage.shape, instr.alloc_storage.ndim); + } return *this; case Opcode::ShapeOf: this->shape_of.tensor = instr.shape_of.tensor; @@ -250,13 +268,17 @@ Instruction::~Instruction() { 
case Opcode::GetTag: case Opcode::Goto: case Opcode::LoadConsti: - case Opcode::AllocStorage: case Opcode::ShapeOf: case Opcode::ReshapeTensor: case Opcode::DeviceCopy: case Opcode::Fatal: case Opcode::KillRegister: return; + case Opcode::AllocStorage: + if (this->alloc_storage.ndim > 0) { + delete[] this->alloc_storage.shape; + } + return; case Opcode::AllocTensor: delete[] this->alloc_tensor.shape; return; @@ -338,7 +360,8 @@ Instruction Instruction::AllocTensorReg(RegName storage, RegName offset, RegName } Instruction Instruction::AllocStorage(RegName size, Index alignment, DLDataType dtype_hint, - Index device_index, RegName dst) { + Index device_index, const std::vector& shape, + RegName dst) { Instruction instr; instr.op = Opcode::AllocStorage; instr.dst = dst; @@ -346,6 +369,13 @@ Instruction Instruction::AllocStorage(RegName size, Index alignment, DLDataType instr.alloc_storage.alignment = alignment; instr.alloc_storage.dtype_hint = dtype_hint; instr.alloc_storage.device_index = device_index; + instr.alloc_storage.ndim = static_cast(shape.size()); + if (instr.alloc_storage.ndim > 0) { + instr.alloc_storage.shape = new int64_t[shape.size()]; + for (size_t i = 0; i < shape.size(); ++i) { + instr.alloc_storage.shape[i] = shape[i]; + } + } return instr; } @@ -474,11 +504,12 @@ Instruction Instruction::InvokeClosure(RegName closure, const std::vector 0) { + os << "[" << StrJoin(instr.alloc_storage.shape, 0, instr.alloc_storage.ndim) + << "] "; + } else { + os << "$" << instr.alloc_storage.allocation_size << " " << instr.alloc_storage.alignment + << " "; + } + os << DLDataType2String(instr.alloc_storage.dtype_hint) << " " << instr.alloc_storage.device_index; break; } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 2b3119b16965..58c509f8d967 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -183,7 +183,7 @@ std::string Executable::GetConstants() const { const auto& constant = constants[i]; auto ndarray = 
Downcast(constant); oss << "VM Const[" << i - << "]: " << RuntimeObject2String(ndarray, virtual_devices[host_device_index]) + << "]: " << RuntimeObject2String(ndarray, virtual_devices[host_device_index].first) << " on device index " << const_device_indexes[i] << std::endl; } return oss.str(); @@ -192,9 +192,9 @@ std::string Executable::GetConstants() const { std::string Executable::GetVirtualDevices() const { std::ostringstream oss; for (size_t i = 0; i < virtual_devices.size(); ++i) { - const auto& device = virtual_devices[i]; - oss << "VM VirtualDevice[" << i << "]: device type " << device.device_type << " and id " - << device.device_id << std::endl; + const auto& [device, scope] = virtual_devices[i]; + oss << "VM VirtualDevice[" << i << "]: device type " << device.device_type << ", id " + << device.device_id << " and mem_scope " << scope << std::endl; } return oss.str(); } @@ -596,7 +596,13 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { fields.push_back(dtype.bits); fields.push_back(dtype.lanes); fields.push_back(instr.alloc_storage.device_index); + fields.push_back(instr.alloc_storage.ndim); fields.push_back(instr.dst); + + // Save the shape of the tensor. + // Note that this field is rotated to the end of the list. 
+ fields.insert(fields.end(), instr.alloc_storage.shape, + instr.alloc_storage.shape + instr.alloc_storage.ndim); break; } case Opcode::AllocADT: { @@ -639,8 +645,8 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { break; } case Opcode::LoadConst: { - // Number of fields = 2 - fields.assign({instr.const_index, instr.dst}); + // Number of fields = 3 + fields.assign({instr.const_index, instr.device_index, instr.dst}); break; } case Opcode::LoadConsti: { @@ -910,8 +916,8 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { return Instruction::AllocClosure(clo_index, num_freevar, free_vars, dst); } case Opcode::AllocStorage: { - // Number of fields = 7 - DCHECK_GE(instr.fields.size(), 7U); + // Number of fields = 9 + DCHECK_GE(instr.fields.size(), 9U); Index allocation_size = instr.fields[0]; Index alignment = instr.fields[1]; @@ -921,9 +927,11 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { dtype.lanes = instr.fields[4]; Index device_type = instr.fields[5]; - RegName dst = instr.fields[6]; + Index ndim = instr.fields[6]; + RegName dst = instr.fields[7]; + std::vector shape = ExtractFields(instr.fields, 8, ndim); - return Instruction::AllocStorage(allocation_size, alignment, dtype, device_type, dst); + return Instruction::AllocStorage(allocation_size, alignment, dtype, device_type, shape, dst); } case Opcode::If: { // Number of fields = 4 @@ -960,9 +968,9 @@ Instruction DeserializeInstruction(const VMInstructionSerializer& instr) { return Instruction::InvokeClosure(closure, args, dst); } case Opcode::LoadConst: { - // Number of fields = 2 - DCHECK_EQ(instr.fields.size(), 2U); - return Instruction::LoadConst(instr.fields[0], instr.fields[1]); + // Number of fields = 3 + DCHECK_EQ(instr.fields.size(), 3U); + return Instruction::LoadConst(instr.fields[0], instr.fields[1], instr.fields[2]); } case Opcode::LoadConsti: { // Number of fields = 2 diff --git a/src/runtime/vm/profiler/vm.cc 
b/src/runtime/vm/profiler/vm.cc index 360185aac53f..7df6b928a32e 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -129,9 +129,21 @@ void VirtualMachineDebug::OpStartHook(Instruction instr) { {{"Argument Shapes", profiling::ShapeString(shape_tensor, instr.alloc_tensor_reg.dtype)}}); } else if (instr.op == Opcode::AllocStorage) { - auto size = LoadScalarInt(instr.alloc_storage.allocation_size); std::ostringstream shape; - shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]"; + if (instr.alloc_storage.ndim > 0) { + std::string shape_str = "["; + for (uint32_t i = 0; i < instr.alloc_storage.ndim; ++i) { + if (i > 0) { + shape_str += ", "; + } + shape_str += std::to_string(instr.alloc_storage.shape[i]); + } + shape_str += "]"; + shape << DLDataType2String(instr.alloc_storage.dtype_hint) << shape_str; + } else { + auto size = LoadScalarInt(instr.alloc_storage.allocation_size); + shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]"; + } Device dev = GetDevice(instr.alloc_storage.device_index); prof_.operator*().StartCall("VM::AllocStorage", dev, {{"VM::Argument Shapes", String(shape.str())}}); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 50c757f8fb75..188a4153e1c0 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -66,7 +66,7 @@ std::ostream& operator<<(std::ostream& os, const VMFunction& vm_func) { return os; } -inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { +inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev, Optional mem_scope = NullOpt) { if (src->IsInstance()) { auto nd_array = Downcast(src); // TODO(mbs): Should respect device id also. 
@@ -79,7 +79,7 @@ inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { VLOG(2) << "copying from " << nd_array->device.device_type << "[" << nd_array->device.device_id << "] to " << dev.device_type << "[" << dev.device_id << "]"; - return nd_array.CopyTo(dev); + return nd_array.CopyTo(dev, mem_scope); } return src; } else { @@ -88,7 +88,7 @@ inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) { std::vector ret; ADT adt = Downcast(src); for (size_t i = 0; i < adt.size(); i++) { - ret.push_back(CopyTo(adt[i], dev)); + ret.push_back(CopyTo(adt[i], dev, mem_scope)); } return ADT(adt->tag, ret.begin(), ret.end()); } @@ -532,7 +532,7 @@ void VirtualMachine::Init(const std::vector& physical_devices, for (size_t device_index = 0; device_index < num_virtual_devices; ++device_index) { // We'll retain the legacy behaviour and just match by device type. // TODO(mbs): Generalize. - DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].device_type; + DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].first.device_type; auto itr = std::find_if(physical_devices.begin(), physical_devices.end(), [virtual_device_type](const Device& physical_device) { return physical_device.device_type == virtual_device_type; @@ -658,8 +658,9 @@ void VirtualMachine::RunLoop(const std::vector& output_tensor_reg_indices } if (!const_pool_[instr.const_index].defined()) { - Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); - const_pool_[instr.const_index] = CopyTo(constant_obj, dev); + auto& [dev, mem_scope] = + exec_->virtual_devices[exec_->const_device_indexes[instr.const_index]]; + const_pool_[instr.const_index] = CopyTo(constant_obj, dev, String(mem_scope)); } WriteRegister(instr.dst, const_pool_[instr.const_index]); if (is_not_cached) { @@ -819,17 +820,36 @@ void VirtualMachine::RunLoop(const std::vector& output_tensor_reg_indices } case Opcode::AllocStorage: { OpStartHook(instr); - auto size = 
LoadScalarInt(instr.alloc_storage.allocation_size); - auto alignment = instr.alloc_storage.alignment; auto storage_obj = SimpleObjAllocator().make_object(); Allocator* allocator = GetAllocator(instr.alloc_storage.device_index); ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?"; - VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment - << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) - << ", device_index=" << instr.alloc_storage.device_index; - storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint); + if (instr.alloc_storage.ndim > 0) { + std::string shape = "["; + for (uint32_t i = 0; i < instr.alloc_storage.ndim; ++i) { + if (i > 0) { + shape += ", "; + } + shape += std::to_string(instr.alloc_storage.shape[i]); + } + shape += "]"; + std::string mem_scope = exec_->virtual_devices[instr.alloc_storage.device_index].second; + VLOG(2) << "allocating with ndims=" << instr.alloc_storage.ndim << ", shape=" << shape + << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) + << ", device_index=" << instr.alloc_storage.device_index + << ", memory_scope=" << mem_scope; + storage_obj->buffer = + allocator->Alloc(instr.alloc_storage.ndim, instr.alloc_storage.shape, + instr.alloc_storage.dtype_hint, mem_scope); + } else { + auto size = LoadScalarInt(instr.alloc_storage.allocation_size); + auto alignment = instr.alloc_storage.alignment; + VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment + << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) + << ", device_index=" << instr.alloc_storage.device_index; + storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint); + } Storage storage(storage_obj); WriteRegister(instr.dst, storage); OpStopHook(); @@ -899,8 +919,9 @@ void VirtualMachine::RunLoop(const std::vector& output_tensor_reg_indices 
ICHECK_EQ(actual_src_dev.device_type, inst_src_dev.device_type); ICHECK_EQ(actual_src_dev.device_id, inst_src_dev.device_id); Device dst_dev = GetDevice(instr.device_copy.dst_device_index); + auto mem_scope = exec_->virtual_devices[instr.device_copy.dst_device_index].second; - NDArray dst_data = src_data.CopyTo(dst_dev); + NDArray dst_data = src_data.CopyTo(dst_dev, String(mem_scope)); WriteRegister(instr.dst, dst_data); OpStopHook(); pc_++; diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py index 3476037946ff..3c9c3f2caf1e 100644 --- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py +++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py @@ -21,16 +21,17 @@ from tvm import relay from tvm.relay import testing from tvm.contrib import utils -from utils.adreno_utils import gpu_preprocess, build_run_compare +from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm import pytest +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype): +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, executor_type, dtype): input_shape = (1, 32, 42, 42) filter_shape = (96, 32, 3, 3) bias_shape = (1, 96, 1, 1) @@ -65,14 +66,19 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) 
@tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype): +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, executor_type, dtype): input_shape = (1, 32, 40, 40) filter_shape = (96, 32, 2, 2) bias_shape = (1, 96, 1, 1) @@ -107,14 +113,19 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype) "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype): +def test_conv2d_inceptionv3_35_35_strides(remote, target, executor_type, dtype): input_shape = (1, 48, 35, 35) filter_shape = (64, 48, 5, 5) bias_shape = (1, 64, 1, 1) @@ -149,14 +160,19 @@ def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_resnet50_v2_nchw_3c(remote, target, dtype): +def test_conv2d_resnet50_v2_nchw_3c(remote, target, executor_type, dtype): input_shape = (1, 3, 224, 224) filter_shape = (64, 3, 7, 7) 
bias_shape = (1, 64, 1, 1) @@ -192,12 +208,15 @@ def test_conv2d_resnet50_v2_nchw_3c(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_nchw_3c(remote, target, dtype): +def test_conv2d_inceptionv3_nchw_3c(remote, target, executor_type, dtype): input_shape = (1, 3, 299, 299) filter_shape = (64, 3, 3, 3) bias_shape = (1, 64, 1, 1) @@ -232,12 +251,15 @@ def test_conv2d_inceptionv3_nchw_3c(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_1x1_16c16spatial(remote, target, dtype): +def test_conv2d_1x1_16c16spatial(remote, target, executor_type, dtype): input_shape = (1, 16, 256, 256) filter_shape = (32, 16, 4, 4) bias_shape = (1, 32, 1, 1) @@ -272,12 +294,15 @@ def test_conv2d_1x1_16c16spatial(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl 
-device=adreno") -def test_conv2d_4x4_16c16pad(remote, target, dtype): +def test_conv2d_4x4_16c16pad(remote, target, executor_type, dtype): input_shape = (1, 32, 256, 256) filter_shape = (32, 32, 4, 4) bias_shape = (1, 32, 1, 1) @@ -312,12 +337,15 @@ def test_conv2d_4x4_16c16pad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_4x4x4_16c16pad(remote, target, dtype): +def test_conv2d_4x4x4_16c16pad(remote, target, executor_type, dtype): input_shape = (1, 32, 256, 256) filter_shape = (4, 32, 4, 4) bias_shape = (1, 4, 1, 1) @@ -352,12 +380,15 @@ def test_conv2d_4x4x4_16c16pad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_yolov3_v2_nchw_3c(remote, target, dtype): +def test_conv2d_yolov3_v2_nchw_3c(remote, target, executor_type, dtype): input_shape = (1, 1024, 13, 13) filter_shape = (255, 1024, 1, 1) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -385,12 +416,15 @@ def test_conv2d_yolov3_v2_nchw_3c(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params, {"data": 
input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_vgg16_winograd_4d(remote, target, dtype): +def test_conv2d_vgg16_winograd_4d(remote, target, executor_type, dtype): input_shape = (1, 512, 28, 28) filter_shape = (512, 512, 3, 3) bias_shape = (1, 512, 1, 1) @@ -429,16 +463,35 @@ def test_conv2d_vgg16_winograd_4d(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "{dtype}"], ["TENSOR", [512, 512, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_winograd_conv(remote, target, dtype): +def test_conv2d_winograd_conv(remote, target, executor_type, dtype): input_shape = (1, 4, 3, 3) A = relay.var("data", 
shape=input_shape, dtype=dtype) filter_shape3 = (8, 4, 3, 3) @@ -476,16 +529,35 @@ def test_conv2d_winograd_conv(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "{dtype}"], ["TENSOR", [8, 4, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_residual_block(remote, target, dtype): +def test_residual_block(remote, target, executor_type, dtype): """ - some kind of residual block followed by convolution to have texture after residual block - scalar data type verification which should be mapped to global memory scope @@ -602,14 +674,31 @@ def test_residual_block(remote, target, dtype): "", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": 
input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_concat(remote, target, dtype): +def test_concat(remote, target, executor_type, dtype): """ layout_transform (NCHW->NCHW4c) | <- buffer @@ -716,14 +805,31 @@ def test_concat(remote, target, dtype): static_memory_scope = [] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_pooling_branching_texture_params(remote, target, dtype): +def test_pooling_branching_texture_params(remote, target, executor_type, dtype): """ Verification of the pooling and many branches having textures layout_transform (NCHW->NCHW4c) @@ -844,14 +950,31 @@ def test_pooling_branching_texture_params(remote, target, dtype): "", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_branching_texture_params(remote, target, dtype): +def test_branching_texture_params(remote, target, 
executor_type, dtype): """ Verification of passing texture to several consumers markup of relay variables in primary functions + on_device @@ -970,15 +1093,32 @@ def test_branching_texture_params(remote, target, dtype): "", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) # function repeat, params scope are different in reused functions @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_different_lowering_same_op(remote, target, dtype): +def test_conv2d_different_lowering_same_op(remote, target, executor_type, dtype): """ Use case for verification of caching compiled functions Three convolutions following by each other in this case should be @@ -1054,14 +1194,31 @@ def test_conv2d_different_lowering_same_op(remote, target, dtype): "", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_winograd_non_rect(remote, target, dtype): +def test_conv2d_winograd_non_rect(remote, target, executor_type, dtype): input_shape = (1, 771, 36, 64) A = relay.var("data", shape=input_shape, dtype=dtype) filter_shape = (128, 771, 3, 3) @@ -1085,17 +1242,36 @@ def test_conv2d_winograd_non_rect(remote, target, 
dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 771, 36, 64], "{dtype}"], ["TENSOR", [128, 771, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 # function repeat, params scope are different in reused functions @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_injective_nwo_inputs1(remote, target, dtype): +def test_injective_nwo_inputs1(remote, target, executor_type, dtype): """ Use case for verification of stability of annotation primary functions having several ops accepting data outside of Primary function @@ -1186,15 +1362,32 @@ def test_injective_nwo_inputs1(remote, target, dtype): "global", "global", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": 
input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) # function repeat, params scope are different in reused functions @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_injective_nwo_inputs2(remote, target, dtype): +def test_injective_nwo_inputs2(remote, target, executor_type, dtype): """ Use case for verification of stability of annotation primary functions having several ops accepting data outside of Primary function @@ -1284,14 +1477,31 @@ def test_injective_nwo_inputs2(remote, target, dtype): "global.texture", "global", ] - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, static_memory_scope - ) + if executor_type == "ge": + build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) + else: + build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + static_memory_scope, + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_to_3_channels(remote, target, dtype): +def test_conv2d_to_3_channels(remote, target, executor_type, dtype): input_shape = (1, 256, 200, 200) filter_shape = (3, 256, 1, 1) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -1316,7 +1526,12 @@ def test_conv2d_to_3_channels(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [] + ) if __name__ == "__main__": diff --git 
a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py index 5f69e777d9ea..dc86a231877f 100644 --- a/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py +++ b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py @@ -22,16 +22,17 @@ from tvm import relay from tvm.relay import testing from tvm.contrib import utils -from utils.adreno_utils import gpu_preprocess, build_run_compare +from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm import pytest +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(remote, target, dtype): +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(remote, target, executor_type, dtype): input_shape = (1, 257, 257, 32) filter_shape = (1, 1, 32, 16) bias_shape = (filter_shape[-1],) @@ -63,12 +64,15 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(remote, target, dtype): +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(remote, target, executor_type, dtype): input_shape = (1, 257, 257, 32) filter_shape = (1, 1, 32, 16) bias_shape = (filter_shape[-1],) @@ -103,12 +107,15 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(remote, target, dt "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, 
{"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_4_35_35_32x3_3_144_16(remote, target, dtype): +def test_conv2d_4_35_35_32x3_3_144_16(remote, target, executor_type, dtype): input_shape = (4, 35, 35, 32) filter_shape = (3, 3, 32, 16) bias_shape = (filter_shape[-1],) @@ -141,12 +148,15 @@ def test_conv2d_4_35_35_32x3_3_144_16(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(remote, target, dtype): +def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(remote, target, executor_type, dtype): input_shape = (1, 513, 513, 3) filter_shape = (3, 3, 3, 32) bias_shape = (filter_shape[-1],) @@ -179,12 +189,15 @@ def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype): +def 
test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, executor_type, dtype): input_shape = (1, 42, 42, 32) filter_shape = (3, 3, 32, 96) bias_shape = (1, 1, 1, 96) @@ -219,14 +232,19 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype): +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, executor_type, dtype): input_shape = (1, 40, 40, 32) filter_shape = (2, 2, 32, 96) bias_shape = (1, 1, 1, 96) @@ -261,14 +279,19 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(remote, target, dtype) "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype): +def test_conv2d_inceptionv3_35_35_strides(remote, target, executor_type, dtype): input_shape = (1, 35, 35, 48) filter_shape = (5, 5, 48, 64) bias_shape = (1, 1, 1, 64) @@ -303,14 +326,19 @@ def test_conv2d_inceptionv3_35_35_strides(remote, target, dtype): "bias": 
tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_resnet50_v2_nhwc_3c(remote, target, dtype): +def test_conv2d_resnet50_v2_nhwc_3c(remote, target, executor_type, dtype): input_shape = (1, 224, 224, 3) filter_shape = (7, 7, 3, 64) bias_shape = (1, 1, 1, 64) @@ -346,12 +374,15 @@ def test_conv2d_resnet50_v2_nhwc_3c(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_inceptionv3_nhwc_3c(remote, target, dtype): +def test_conv2d_inceptionv3_nhwc_3c(remote, target, executor_type, dtype): input_shape = (1, 299, 299, 3) filter_shape = (3, 3, 3, 64) bias_shape = (1, 1, 1, 64) @@ -386,12 +417,15 @@ def test_conv2d_inceptionv3_nhwc_3c(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def 
test_conv2d_1x1_16c16spatial(remote, target, dtype): +def test_conv2d_1x1_16c16spatial(remote, target, executor_type, dtype): input_shape = (1, 128, 128, 16) filter_shape = (4, 4, 16, 32) bias_shape = (1, 1, 1, 32) @@ -426,12 +460,15 @@ def test_conv2d_1x1_16c16spatial(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_4x4_16c16pad(remote, target, dtype): +def test_conv2d_4x4_16c16pad(remote, target, executor_type, dtype): input_shape = (1, 256, 256, 32) filter_shape = (4, 4, 32, 32) bias_shape = (1, 1, 1, 32) @@ -466,12 +503,15 @@ def test_conv2d_4x4_16c16pad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_4x4x4_16c16pad(remote, target, dtype): +def test_conv2d_4x4x4_16c16pad(remote, target, executor_type, dtype): input_shape = (1, 256, 256, 32) filter_shape = (4, 4, 32, 4) bias_shape = (1, 1, 1, 4) @@ -505,12 +545,15 @@ def test_conv2d_4x4x4_16c16pad(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + 
build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_yolov3_v2_nhwc_3c(remote, target, dtype): +def test_conv2d_yolov3_v2_nhwc_3c(remote, target, executor_type, dtype): input_shape = (1, 13, 13, 1024) filter_shape = (1, 1, 1024, 255) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -538,12 +581,15 @@ def test_conv2d_yolov3_v2_nhwc_3c(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_vgg16_winograd_4d(remote, target, dtype): +def test_conv2d_vgg16_winograd_4d(remote, target, executor_type, dtype): input_shape = (1, 28, 28, 512) filter_shape = (3, 3, 512, 512) bias_shape = (1, 1, 1, 512) @@ -582,16 +628,35 @@ def test_conv2d_vgg16_winograd_4d(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "{dtype}"], ["TENSOR", [3, 3, 512, 512], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if 
executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_vgg16_winograd_4d_expand_spatial_dims(remote, target, dtype): +def test_conv2d_vgg16_winograd_4d_expand_spatial_dims(remote, target, executor_type, dtype): input_shape = (1, 28, 28, 1) filter_shape = (3, 3, 1, 64) bias_shape = (1, 1, 1, 64) @@ -629,16 +694,35 @@ def test_conv2d_vgg16_winograd_4d_expand_spatial_dims(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 1], "{dtype}"], ["TENSOR", [3, 3, 1, 64], "{dtype}"], [1, 1], [0, 0, 0, 0], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + 
stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_winograd_conv(remote, target, dtype): +def test_conv2d_winograd_conv(remote, target, executor_type, dtype): input_shape = (1, 3, 3, 4) A = relay.var("data", shape=input_shape, dtype=dtype) filter_shape3 = (3, 3, 4, 8) @@ -690,16 +774,35 @@ def test_conv2d_winograd_conv(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 3, 3, 4], "{dtype}"], ["TENSOR", [3, 3, 4, 8], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_winograd_non_rect(remote, target, dtype): +def test_conv2d_winograd_non_rect(remote, target, executor_type, dtype): input_shape = (1, 36, 64, 771) A = relay.var("data", shape=input_shape, 
dtype=dtype) filter_shape = (3, 3, 771, 128) @@ -730,16 +833,35 @@ def test_conv2d_winograd_non_rect(remote, target, dtype): f.write( f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256 -texture_spatial_limit=16384 -thread_warp_size=1", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 36, 64, 771], "{dtype}"], ["TENSOR", [3, 3, 771, 128], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 5399, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 16], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 8]], ["tile_rc", "sp", [-1, 193]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) - graph = build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, stat_file=stat_file - ) - matches = re.findall("winograd", graph) - assert len(matches) > 0 + if executor_type == "ge": + graph = build_run_compare( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", graph) + assert len(matches) > 0 + else: + vmc = build_run_compare_vm( + remote, + mod, + params1, + {"data": input_shape}, + {"data": dtype}, + target, + stat_file=stat_file, + ) + matches = re.findall("winograd", vmc.primitives) + assert len(matches) > 0 @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_to_3_channels(remote, target, dtype): +def test_conv2d_to_3_channels(remote, target, executor_type, dtype): input_shape = (1, 200, 200, 256) filter_shape = (1, 1, 256, 3) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -764,7 +886,12 @@ def test_conv2d_to_3_channels(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": 
input_shape}, {"data": dtype}, target, []) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [] + ) if __name__ == "__main__": diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py index 2c729a36ebe0..87e9542140d1 100644 --- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py +++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py @@ -20,14 +20,15 @@ import numpy as np from tvm import relay from tvm.relay import testing -from utils.adreno_utils import gpu_preprocess, build_run_compare +from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_bias_nchwc(remote, target, dtype): +def test_depthwise_conv2d_bias_nchwc(remote, target, executor_type, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ -64,14 +65,19 @@ def test_depthwise_conv2d_bias_nchwc(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_nchwc(remote, target, dtype): +def test_depthwise_conv2d_nchwc(remote, target, executor_type, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ 
-103,14 +109,19 @@ def test_depthwise_conv2d_nchwc(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare( - remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess - ) + if executor_type == "ge": + build_run_compare( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [], gpu_preprocess + ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_bias_nchw(remote, target, dtype): +def test_depthwise_conv2d_bias_nchw(remote, target, executor_type, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ -147,12 +158,15 @@ def test_depthwise_conv2d_bias_nchw(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_repack_bias_nchw(remote, target, dtype): +def test_depthwise_conv2d_repack_bias_nchw(remote, target, executor_type, dtype): input_shape = (1, 63, 112, 112) filter_shape = (63, 1, 3, 3) bias_shape = (1, 63, 1, 1) @@ -189,12 +203,15 @@ def test_depthwise_conv2d_repack_bias_nchw(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) 
@tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_to_3_channels(remote, target, dtype): +def test_conv2d_to_3_channels(remote, target, executor_type, dtype): input_shape = (1, 3, 200, 200) filter_shape = (3, 1, 1, 1) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -220,7 +237,12 @@ def test_conv2d_to_3_channels(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [] + ) if __name__ == "__main__": diff --git a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py index 28f0f4cefaee..782c99a96a8f 100644 --- a/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py +++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py @@ -20,14 +20,16 @@ import numpy as np from tvm import relay from tvm.relay import testing -from utils.adreno_utils import build_run_compare +from utils.adreno_utils import build_run_compare, build_run_compare_vm + +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(remote, target, dtype): +def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(remote, target, executor_type, dtype): input_shape = (1, 129, 129, 144) filter_shape = (3, 3, 144, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -62,12 +64,15 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(remote, target, dtyp "bias": tvm.nd.array(bias_data), } - 
build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(remote, target, dtype): +def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(remote, target, executor_type, dtype): input_shape = (4, 35, 35, 576) filter_shape = (3, 3, 576, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -102,12 +107,17 @@ def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(remote, target, dtype) "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(remote, target, dtype): +def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding( + remote, target, executor_type, dtype +): input_shape = (1, 129, 129, 144) filter_shape = (3, 3, 144, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -144,12 +154,15 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(remote, "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) 
@tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_1_513_513_7x3_3_7_1(remote, target, dtype): +def test_depthwise_conv2d_1_513_513_7x3_3_7_1(remote, target, executor_type, dtype): input_shape = (1, 513, 513, 7) filter_shape = (3, 3, 7, 1) bias_shape = (filter_shape[2],) @@ -183,12 +196,15 @@ def test_depthwise_conv2d_1_513_513_7x3_3_7_1(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_depthwise_conv2d_1_513_513_3x3_3_3_1(remote, target, dtype): +def test_depthwise_conv2d_1_513_513_3x3_3_3_1(remote, target, executor_type, dtype): input_shape = (1, 513, 513, 3) filter_shape = (3, 3, 3, 1) bias_shape = (filter_shape[2],) @@ -222,12 +238,15 @@ def test_depthwise_conv2d_1_513_513_3x3_3_3_1(remote, target, dtype): "bias": tvm.nd.array(bias_data), } - build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_conv2d_to_3_channels(remote, target, dtype): +def test_conv2d_to_3_channels(remote, target, executor_type, dtype): input_shape = (1, 200, 200, 3) filter_shape = (1, 1, 3, 1) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -253,7 +272,12 @@ def test_conv2d_to_3_channels(remote, target, dtype): "weight": tvm.nd.array(filter_data), } - 
build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + if executor_type == "ge": + build_run_compare(remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, []) + else: + build_run_compare_vm( + remote, mod, params1, {"data": input_shape}, {"data": dtype}, target, [] + ) if __name__ == "__main__": diff --git a/tests/python/relay/opencl_texture/test_injection_texture.py b/tests/python/relay/opencl_texture/test_injection_texture.py index 991983706fcb..31c082c99496 100644 --- a/tests/python/relay/opencl_texture/test_injection_texture.py +++ b/tests/python/relay/opencl_texture/test_injection_texture.py @@ -20,48 +20,56 @@ import tvm import numpy as np from tvm import relay -from tvm.relay import testing -from tvm.contrib import utils -from utils.adreno_utils import gpu_preprocess, build_run_compare +from utils.adreno_utils import build_run_compare, build_run_compare_vm +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_layout_transform_to_block_nchw4c(remote, target, dtype): +def test_layout_transform_to_block_nchw4c(remote, target, executor_type, dtype): """Verification of the case NCHW->NCHW4c""" input_shape = (1, 32, 720, 1280) A = relay.var("data", shape=input_shape, dtype=dtype) lt = relay.layout_transform(A, "NCHW", "NCHW4c") mod = relay.Function([A], lt) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_layout_transform_to_block_nchw(remote, target, dtype): +def test_layout_transform_to_block_nchw(remote, target, executor_type, dtype): 
"""Verification of the case NCHW4c->NCHW""" input_shape = (1, 36, 1, 1, 4) A = relay.var("data", shape=input_shape, dtype=dtype) lt = relay.layout_transform(A, "NCHW4c", "NCHW") mod = relay.Function([A], lt) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_layout_transform_to_block_nhwc4c(remote, target, dtype): +def test_layout_transform_to_block_nhwc4c(remote, target, executor_type, dtype): """Verification of the case NHWC->NHWC4c""" input_shape = (1, 1, 1, 144) A = relay.var("data", shape=input_shape, dtype=dtype) lt = relay.layout_transform(A, "NHWC", "NHWC4c") mod = relay.Function([A], lt) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @pytest.mark.skipif( @@ -69,7 +77,7 @@ def test_layout_transform_to_block_nhwc4c(remote, target, dtype): ) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_layout_transform_to_block_nhwc(remote, target, dtype): +def test_layout_transform_to_block_nhwc(remote, target, executor_type, dtype): """Verification of the case NHWC4c->NHWC""" input_shape = (1, 80, 80, 36, 4) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -78,7 +86,10 @@ def test_layout_transform_to_block_nhwc(remote, target, dtype): lt = relay.layout_transform(cast, "NHWC4c", "NHWC") mod = relay.Function([A], lt) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + 
build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) if __name__ == "__main__": diff --git a/tests/python/relay/opencl_texture/test_network.py b/tests/python/relay/opencl_texture/test_network.py index 1d0e996f9f97..2b2f3741cba2 100644 --- a/tests/python/relay/opencl_texture/test_network.py +++ b/tests/python/relay/opencl_texture/test_network.py @@ -24,10 +24,13 @@ from tvm.contrib import utils from tvm.relay import testing from tvm.relay.op import register_mixed_precision_conversion -from utils.adreno_utils import build_run_compare, get_model, gpu_preprocess +from utils.adreno_utils import build_run_compare, build_run_compare_vm, get_model, gpu_preprocess -def _test_mobilenet_v1(remote, target, calc_dtype, acc_dtype): +executor_type = tvm.testing.parameter("ge", "vm") + + +def _test_mobilenet_v1(remote, target, calc_dtype, executor_type, acc_dtype): mod, params, inputs, dtypes = get_model( "https://github.com/mlcommons/mobile_models/raw/main/v0_7/tflite/mobilenet_edgetpu_224_1.0_float.tflite", "mobilenet_edgetpu_224_1.0_float.tflite", @@ -46,29 +49,32 @@ def _test_mobilenet_v1(remote, target, calc_dtype, acc_dtype): }, ) - build_run_compare(remote, mod, params, inputs, dtypes, target, []) + if executor_type == "ge": + build_run_compare(remote, mod, params, inputs, dtypes, target, []) + else: + build_run_compare_vm(remote, mod, params, inputs, dtypes, target, []) @pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") @pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="CI doesn't support fp16(half datatypes)") -def test_mobilenet_v1_fp16(remote, target): - _test_mobilenet_v1(remote, target, "float16", "float16") +def test_mobilenet_v1_fp16(remote, target, executor_type): + _test_mobilenet_v1(remote, target, "float16", executor_type, 
"float16") @pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mobilenet_v1_fp32(remote, target): - _test_mobilenet_v1(remote, target, "float32", "float32") +def test_mobilenet_v1_fp32(remote, target, executor_type): + _test_mobilenet_v1(remote, target, "float32", executor_type, "float32") @pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/13443") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mobilenet_v1_fp16_acc32(remote, target): - _test_mobilenet_v1(remote, target, "float16", "float32") +def test_mobilenet_v1_fp16_acc32(remote, target, executor_type): + _test_mobilenet_v1(remote, target, "float16", executor_type, "float32") if __name__ == "__main__": diff --git a/tests/python/relay/opencl_texture/test_pool_texture.py b/tests/python/relay/opencl_texture/test_pool_texture.py index faeb121c800c..6190790a3dd6 100644 --- a/tests/python/relay/opencl_texture/test_pool_texture.py +++ b/tests/python/relay/opencl_texture/test_pool_texture.py @@ -17,15 +17,16 @@ import tvm from tvm import relay -from utils.adreno_utils import build_run_compare +from utils.adreno_utils import build_run_compare, build_run_compare_vm +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nchw_wide(remote, target, dtype): +def test_global_pool2d_nchw_wide(remote, target, executor_type, dtype): """ Use case of NCHW global pooling with big spatial valies """ @@ -34,12 +35,15 @@ def test_global_pool2d_nchw_wide(remote, target, dtype): C = relay.nn.global_avg_pool2d(A) mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, 
{"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nchw4c_wide(remote, target, dtype): +def test_global_pool2d_nchw4c_wide(remote, target, executor_type, dtype): """ Use case of blocked NCHW4c global pooling with big spatial valies """ @@ -48,12 +52,15 @@ def test_global_pool2d_nchw4c_wide(remote, target, dtype): C = relay.nn.global_avg_pool2d(A, layout="NCHW4c") mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nchw_deep(remote, target, dtype): +def test_global_pool2d_nchw_deep(remote, target, executor_type, dtype): """ Use case of NCHW deep global pooling """ @@ -62,12 +69,15 @@ def test_global_pool2d_nchw_deep(remote, target, dtype): C = relay.nn.global_avg_pool2d(A) mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nchw4c_deep(remote, target, dtype): +def test_global_pool2d_nchw4c_deep(remote, target, executor_type, dtype): """ Use case of blocked NCHW4c deep global pooling """ @@ -76,12 +86,15 @@ def test_global_pool2d_nchw4c_deep(remote, target, dtype): C = relay.nn.global_avg_pool2d(A, layout="NCHW4c") mod = relay.Function([A], 
C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nhwc(remote, target, dtype): +def test_global_pool2d_nhwc(remote, target, executor_type, dtype): """ Use case of NHWC global pooling with big spatial valies """ @@ -90,12 +103,15 @@ def test_global_pool2d_nhwc(remote, target, dtype): C = relay.nn.global_avg_pool2d(A, layout="NHWC") mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_pool2d_nhwc4c(remote, target, dtype): +def test_global_pool2d_nhwc4c(remote, target, executor_type, dtype): """ Use case of NHWC deep global pooling """ @@ -104,12 +120,15 @@ def test_global_pool2d_nhwc4c(remote, target, dtype): C = relay.nn.global_avg_pool2d(A, layout="NHWC4c") mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_max_pool2d_nchw_wide(remote, target, dtype): +def test_global_max_pool2d_nchw_wide(remote, target, executor_type, dtype): """ Use case of NCHW global pooling with big spatial 
valies """ @@ -118,12 +137,15 @@ def test_global_max_pool2d_nchw_wide(remote, target, dtype): C = relay.nn.global_max_pool2d(A) mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_global_max_pool2d_nchw4c_wide(remote, target, dtype): +def test_global_max_pool2d_nchw4c_wide(remote, target, executor_type, dtype): """ Use case of blocked NCHW4c global pooling with big spatial valies """ @@ -132,4 +154,11 @@ def test_global_max_pool2d_nchw4c_wide(remote, target, dtype): C = relay.nn.global_max_pool2d(A, layout="NCHW4c") mod = relay.Function([A], C) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relay/opencl_texture/test_reduction_texture.py b/tests/python/relay/opencl_texture/test_reduction_texture.py index 5728e6294f51..1016a7c88ec6 100644 --- a/tests/python/relay/opencl_texture/test_reduction_texture.py +++ b/tests/python/relay/opencl_texture/test_reduction_texture.py @@ -21,123 +21,151 @@ from tvm import relay from tvm.relay import testing from tvm.contrib import utils -from utils.adreno_utils import gpu_preprocess, build_run_compare +from utils.adreno_utils import gpu_preprocess, build_run_compare, build_run_compare_vm +executor_type = tvm.testing.parameter("ge", "vm") dtype = tvm.testing.parameter("float32") @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl 
-device=adreno") -def test_mean(remote, target, dtype): +def test_mean(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 720, 1280) A = relay.var("data", shape=input_shape, dtype=dtype) mean = relay.mean(A, axis=1, keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_argmax(remote, target, dtype): +def test_argmax(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 720, 1280) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.argmax(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_reduction_max(remote, target, dtype): +def test_reduction_max(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 720, 1280) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.max(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mean_nd4(remote, target, dtype): 
+def test_mean_nd4(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 729, 729) A = relay.var("data", shape=input_shape, dtype=dtype) mean = relay.mean(A, axis=1, keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_argmax_nd4(remote, target, dtype): +def test_argmax_nd4(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 729, 729) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.argmax(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_reduction_max_nd4(remote, target, dtype): +def test_reduction_max_nd4(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 729, 729) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.max(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mean_b4(remote, target, dtype): +def test_mean_b4(remote, target, 
executor_type, dtype): # NCHW input_shape = (1, 3, 720, 320, 4) A = relay.var("data", shape=input_shape, dtype=dtype) mean = relay.mean(A, axis=1, keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_argmax_b4(remote, target, dtype): +def test_argmax_b4(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 720, 320, 4) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.argmax(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_reduction_max_b4(remote, target, dtype): +def test_reduction_max_b4(remote, target, executor_type, dtype): # NCHW input_shape = (1, 3, 720, 320, 4) A = relay.var("data", shape=input_shape, dtype=dtype) argmax = relay.op.max(A, axis=[1]) mod = relay.Function([A], argmax) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mean_global_pooling(remote, target, dtype): +def test_mean_global_pooling(remote, target, 
executor_type, dtype): """ Use case of blocked NCHW4c global pooling with big spatial valies """ @@ -146,12 +174,15 @@ def test_mean_global_pooling(remote, target, dtype): mean = relay.mean(A, axis=[1, 2], keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_mean_global_pooling_block4(remote, target, dtype): +def test_mean_global_pooling_block4(remote, target, executor_type, dtype): """ Use case of blocked NCHW4c global pooling with big spatial valies """ @@ -160,12 +191,15 @@ def test_mean_global_pooling_block4(remote, target, dtype): mean = relay.mean(A, axis=[1, 2], keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl -device=adreno") -def test_max_global_pooling_block4(remote, target, dtype): +def test_max_global_pooling_block4(remote, target, executor_type, dtype): """ Use case of blocked NCHW4c global pooling with big spatial valies """ @@ -174,7 +208,10 @@ def test_max_global_pooling_block4(remote, target, dtype): mean = relay.max(A, axis=[1, 2], keepdims=True) mod = relay.Function([A], mean) - build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + if executor_type == "ge": + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + else: + build_run_compare_vm(remote, mod, 
{}, {"data": input_shape}, {"data": dtype}, target) @tvm.testing.requires_opencl diff --git a/tests/python/relay/opencl_texture/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py index e2a271d9f68d..de325d822c3c 100644 --- a/tests/python/relay/opencl_texture/utils/adreno_utils.py +++ b/tests/python/relay/opencl_texture/utils/adreno_utils.py @@ -26,6 +26,7 @@ from tvm.relay import testing from tvm.relay.transform import recast from tvm.contrib import graph_runtime +from tvm.runtime.vm import VirtualMachine import json @@ -122,6 +123,89 @@ def build_run_compare( return graph +def build_run_compare_vm( + remote, + tvm_mod, + params1, + input_shape, + dtypes, + target="llvm", + static_mem_scopes=[], + gpu_preprocess=None, + stat_file=None, +): + if remote is None: + target_host = "llvm" + else: + target_host = "llvm -mtriple=arm64-linux-android" + + if gpu_preprocess: + tvm_mod_nchwc = gpu_preprocess(tvm_mod) + else: + tvm_mod_nchwc = tvm_mod + + if isinstance(tvm_mod_nchwc, relay.Function): + module = tvm.IRModule({}) + module["main"] = tvm_mod_nchwc + tvm_mod_nchwc = module + + if stat_file is not None: + with autotvm.apply_history_best(stat_file): + with tvm.transform.PassContext(opt_level=3): + vmc = relay.vm.compile( + tvm_mod_nchwc, target=target, target_host=target_host, params=params1 + ) + else: + with tvm.transform.PassContext(opt_level=3): + vmc = relay.vm.compile( + tvm_mod_nchwc, target=target, target_host=target_host, params=params1 + ) + + # TODO(echuraev): enable scope checking + ## verification that storage_scope has expected textures scopes + # graph_json = json.loads(graph) + # if "storage_scope" in graph_json["attrs"]: + # assert ( + # len(static_mem_scopes) == len(graph_json["attrs"]["storage_scope"][1]) + # or len(static_mem_scopes) == 0 + # ) + # else: + # assert len(static_mem_scopes) == 0 + + # for i in range(0, len(static_mem_scopes)): + # assert static_mem_scopes[i] == graph_json["attrs"]["storage_scope"][1][i] + 
+ if remote is None: + dev = tvm.opencl() + vm = VirtualMachine(vmc, dev, "naive") + else: + temp = utils.tempdir() + dso_binary = "dev_lib_cl.so" + dso_binary_path = temp.relpath(dso_binary) + dev = remote.cl(0) + vmc.mod.export_library(dso_binary_path, ndk.create_shared) + remote.upload(dso_binary_path) + rlib = remote.load_module(dso_binary) + vm = VirtualMachine(rlib, dev, "naive") + data = {} + inputs = [] + for key in input_shape: + inputs.append(np.random.normal(size=input_shape[key]).astype(dtypes[key])) + data[key] = tvm.nd.array(inputs[-1], dev) + for k, v in params1.items(): + data[k] = tvm.nd.array(v, dev) + vm.set_input("main", **data) + vm.invoke_stateful("main") + + ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs) + for i, ref_output in enumerate(ref_outputs): + tvm_output = vm.get_outputs()[i] + output = tvm_output.asnumpy() + + np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1) + return vmc + + def gpu_preprocess(tvm_mod): layout_config = relay.transform.LayoutConfig() desired_layouts = {"nn.conv2d": ["NCHW4c", "OIHW4o"]} diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py index 68d2919ec38d..70dc1dd4f794 100644 --- a/tests/python/relay/test_pass_dead_code_elimination.py +++ b/tests/python/relay/test_pass_dead_code_elimination.py @@ -16,8 +16,10 @@ # under the License. 
import tvm import tvm.testing +from tvm import relay from tvm.relay import Function, transform from tvm.relay.testing import inception_v3 +import numpy as np import pytest cpu_scope = tvm.target.VirtualDevice(tvm.cpu(), tvm.target.Target("llvm")) @@ -228,6 +230,11 @@ def @main() { def test_impure_op(): + shape = np.array([64, 2]) + metatable = { + "VirtualDevice": [cpu_scope], + "relay.Constant": [relay.const(shape, dtype="int64")], + } """Don't elide calls to side-effecting operators.""" before_program = tvm.relay.parse( """ @@ -235,7 +242,7 @@ def test_impure_op(): def @main() { let %size: int64 = cast(1024, dtype="int64"); let %alignment: int64 = cast(64, dtype="int64"); - let %x = memory.alloc_storage(%size, %alignment, virtual_device=meta[VirtualDevice][0]); + let %x = memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][0]); let %_ = memory.kill(%x); 0 } @@ -250,6 +257,7 @@ def @main() { #[version = "0.0.5"] def @main() { %0 = memory.alloc_storage(cast(1024, dtype="int64"), + meta[relay.Constant][0], cast(64, dtype="int64"), virtual_device=meta[VirtualDevice][0]); let %_ = memory.kill(%0); @@ -267,6 +275,11 @@ def @main() { def test_impure_func(): + shape = np.array([64, 2]) + metatable = { + "VirtualDevice": [cpu_scope], + "relay.Constant": [relay.const(shape, dtype="int64")], + } """Don't elide calls to side-effecting functions.""" before_program = tvm.relay.parse( """ @@ -274,7 +287,7 @@ def test_impure_func(): def @f() -> int { let %size: int64 = cast(1024, dtype="int64"); let %alignment: int64 = cast(64, dtype="int64"); - let %x = memory.alloc_storage(%size, %alignment, virtual_device=meta[VirtualDevice][0]); + let %x = memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][0]); let %_ = memory.kill(%x); 0 } @@ -293,6 +306,7 @@ def @main() -> int { #[version = "0.0.5"] def @f() -> int { %0 = memory.alloc_storage(cast(1024, dtype="int64"), + 
meta[relay.Constant][0], cast(64, dtype="int64"), virtual_device=meta[VirtualDevice][0]); let %_ = memory.kill(%0); diff --git a/tests/python/relay/test_pass_plan_devices.py b/tests/python/relay/test_pass_plan_devices.py index c7f42103cab3..f654b4b4536a 100644 --- a/tests/python/relay/test_pass_plan_devices.py +++ b/tests/python/relay/test_pass_plan_devices.py @@ -761,14 +761,18 @@ def ref(x): def test_alloc_storage(): - metatable = {"VirtualDevice": [HOST, GPU]} + shape = np.array([3, 2]) + metatable = { + "VirtualDevice": [HOST, GPU], + "relay.Constant": [relay.const(shape, dtype="int64")], + } def input(): return tvm.relay.parse( """ #[version = "0.0.5"] def @main(%size: int64, %alignment: int64) { - memory.alloc_storage(%size, %alignment, virtual_device=meta[VirtualDevice][1]) + memory.alloc_storage(%size, meta[relay.Constant][0], %alignment, virtual_device=meta[VirtualDevice][1]) } """, "from_string", @@ -782,7 +786,8 @@ def expected(): #[version = "0.0.5"] def @main(%size {virtual_device=meta[VirtualDevice][0]}: int64, %alignment {virtual_device=meta[VirtualDevice][0]}: int64, virtual_device=meta[VirtualDevice][1]) { - memory.alloc_storage(%size, %alignment, virtual_device=meta[VirtualDevice][1]) + %0 = on_device(meta[relay.Constant][0], virtual_device=meta[VirtualDevice][0], constrain_result=True); + memory.alloc_storage(%size, %0, %alignment, virtual_device=meta[VirtualDevice][1]) } """, "from_string",