From 1e459509cce2b98f14b0449019794ae32fd18733 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 30 May 2024 16:50:39 -0700 Subject: [PATCH 01/22] WIP: IdModel-based indexing --- CMakeLists.txt | 2 + csrc/device_lower/lower2device.cpp | 2 +- csrc/device_lower/utils.cpp | 22 + csrc/device_lower/utils.h | 2 + csrc/id_model/id_model.h | 9 + csrc/id_model/indexing.cpp | 753 +++++++++++++++++++++++++++++ csrc/id_model/indexing.h | 80 +++ csrc/id_model/utils.h | 55 +++ csrc/ir/utils.h | 16 + tests/cpp/test_indexing.cpp | 123 +++++ 10 files changed, 1063 insertions(+), 1 deletion(-) create mode 100644 csrc/id_model/indexing.cpp create mode 100644 csrc/id_model/indexing.h create mode 100644 csrc/id_model/utils.h create mode 100644 tests/cpp/test_indexing.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ed87fd7bbbe..6be651ca58c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp + ${NVFUSER_SRCS_DIR}/id_model/indexing.cpp ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp ${NVFUSER_SRCS_DIR}/id_model/schedule.cpp ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp @@ -506,6 +507,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu_transpose.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_utils.cpp ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp + ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 0206ab7f885..6871e6a592d 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -390,7 +390,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 9e1883b3d91..657effa5442 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,6 +920,28 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } +// Returns true if expr is an expression that initializes a reduction +// buffer. +bool isReductionInitExpr(const Expr* expr) { + // False if its output isn't a TensorView + if (!ir_utils::isTvOp(expr)) { + return false; + } + // False if it doesn't have any reduction axis + const auto out_tv = ir_utils::getTvOutput(expr); + if (!out_tv->domain()->hasReduction()) { + return false; + } + // False if it has have TensorView inputs as initialization should + // never use TensorViews + const auto tv_filter_inp_view = + ir_utils::filterByType(expr->inputs()); + if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { + return false; + } + return true; +} + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index 77acc095750..0d1e0b5f96a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,6 +322,8 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); +bool isReductionInitExpr(const Expr* expr); + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 5b60ff474c6..b912978ca48 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -153,6 +153,15 @@ class IdModel : public PolymorphicBase { std::string toString() const; + bool empty() const { + return tvs_.empty(); + } + + Fusion* fusion() const { + NVF_ERROR(!tvs_.empty()); + return tvs_.at(0)->fusion(); + } + // Build all graphs, i.e., Exact, AlmostExact, Permissive and // LOOP. This is by default called from the constructor void buildAllGraphs(); diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp new file mode 100644 index 00000000000..9a912b4b4b1 --- /dev/null +++ b/csrc/id_model/indexing.cpp @@ -0,0 +1,753 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace { + +class IndexingTraversal : public ValGraphBFS { + public: + IndexingTraversal( + const ValGraph& graph, + std::vector from_groups, + std::vector to_groups) + : ValGraphBFS(graph, from_groups, to_groups) {} + + virtual ~IndexingTraversal() = default; + + static ExprPath getExprsBetween( + const std::vector& from_domains, + const std::vector& to_domains, + const ValGraph& graph) { + const ValGroups from_groups = graph.toGroups(from_domains); + const ValGroups to_groups = graph.toGroups(to_domains); + + IndexingTraversal traversal( + graph, + {from_groups.vector().begin(), from_groups.vector().end()}, + {to_groups.vector().begin(), to_groups.vector().end()}); + traversal.traverse(); + return traversal.getShortestExprPath(); + } + + using ValGraphBFS::isVisited; + + bool isDependencySatisfied(const GroupType& group) const override { + if (const ValGroup* vg = std::get_if(&group); + vg != nullptr && (*vg)->front()->as()->isBroadcast()) { + VERBOSE() << "Dependency satisfied as it's broadcast" << std::endl; + return true; + } + return ValGraphBFS::isDependencySatisfied(group); + } +}; + +// Get the promotion domain of a given loop domain. +IterDomain* getLoopPromotion(IterDomain* loop_id, const IdModel& id_model) { + const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); + const auto& loop_promotion_map = id_model.loopPromotionMap(); + const auto& loop_group = loop_graph.toGroup(loop_id); + + auto loop_promotion_map_it = loop_promotion_map.find(loop_group); + NVF_ERROR( + loop_promotion_map_it != loop_promotion_map.end(), + "No loop promotion found: ", + loop_id->toString(), + ". 
Loop group: ", + nvfuser::toString(loop_group)); + + return loop_promotion_map_it->second; +} + +// Get the loop domains of a given expr, which are (potentially +// promoted) loop domains of the consumer tensor. +std::vector getLoopDomains( + const Expr* expr, + const IdModel& id_model) { + // Assume consumer-based indexing. Needs to revisit for ops like + // scatter + auto loop_domains = ir_utils::getTvOutput(expr)->getLeafDomain(); + + // If this is an expr initializing a buffer for a reduction, there + // should be no loops for reduction domains + if (lower_utils::isReductionInitExpr(expr)) { + loop_domains.erase( + std::remove_if( + loop_domains.begin(), + loop_domains.end(), + [](IterDomain* id) -> bool { return id->isReduction(); }), + loop_domains.end()); + } + + for (auto& loop_id : loop_domains) { + loop_id = getLoopPromotion(loop_id, id_model); + } + + return loop_domains; +} + +bool isAllocated(IterDomain* id, const TensorView* tv) { + // If the extent is 1, it's effectively the same as broadcast. + return ir_utils::isShared(tv->getMemoryType(), id->getParallelType()) && + !id->isBroadcast() && !id->isReduction() && !id->extent()->isOneInt(); +} + +Val* getAllocationStride(TensorView* tv, int64_t alloc_dim) { + const auto& alloc_dom = tv->getMaybeAllocationDomain(); + int64_t stride_dim = -1; + for (const auto i : c10::irange(alloc_dim + 1)) { + if (alloc_dom.at(i)->isReduction()) { + continue; + } + ++stride_dim; + } + if (stride_dim == -1) { + return nullptr; + } + + return IrBuilder::getItemExpr( + IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "alloc_stride"), + stride_dim); +} + +// Currently it's only Shared or Local but Global can be the case +// too. +bool isAllocationBasedOnLeaf(TensorView* tv) { + return tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Local; +} + +// Get the allocation domains of a given tensor. 
Also returns its +// strides as well as +std::tuple, std::vector> getAllocationDomains( + TensorView* tv, + const IdModel& id_model) { + std::vector allocation_domains; + + auto inlining_pos = tv->getComputeAtPosition(); + + bool use_set_allocatin_domain = false; + + if (tv->hasAllocation()) { + if (tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Local) { + if (std::is_permutation( + tv->getLeafDomain().begin(), + tv->getLeafDomain().end(), + tv->getAllocationDomain().begin())) { + use_set_allocatin_domain = true; + } + } else { + use_set_allocatin_domain = true; + } + } + + // Ignore allocation of non-global tensors for now + if (use_set_allocatin_domain) { + allocation_domains = tv->getAllocationDomain(); + NVF_ERROR(!tv->isDoubleBuffered()); + } else { + // If allocation domain is not set, assume that: + // Local/Shared: leaf domains to the right of the CA position + // Global: rfactor domains + if (tv->getMemoryType() == MemoryType::Global) { + VERBOSE() << "Tv does not have allocation of " << tv->toString() << ", " + << toDelimitedString(tv->getMaybeAllocationDomain()) + << std::endl; + allocation_domains = tv->getRFactorDomain(); + NVF_ERROR(!tv->isDoubleBuffered()); + } else if (tv->getMemoryType() == MemoryType::Shared) { + for (const auto i : c10::irange(tv->nDims())) { + auto leaf_id = tv->axis(i); + VERBOSE() << "Smem leaf domain: " << leaf_id->toString() << " of " + << tv->toString() << std::endl; + auto pt = leaf_id->getParallelType(); + if (isParallelTypeDeviceDim(pt) || isParallelTypeBlockDim(pt)) { + continue; + } + if (i < inlining_pos && !isParallelTypeThreadDim(pt)) { + continue; + } + allocation_domains.push_back(leaf_id); + } + } else { + allocation_domains = { + tv->getLeafDomain().begin() + inlining_pos, + tv->getLeafDomain().end()}; + } + } + + // TODO: Fix alloation domains with vectorization + // This is an ugly workaround, but the allocation domain of a tensor + // with vectorized domains may not be the 
same as the leaf fomain + // since the vectorized domain must be at the innermost position in + // the allocation domain, but it's allowed to be located anywhwere + // in the leaf domain. + // This shouldn't be necessary for global memory tensors as their + // allocation domains are rfactor domains + { + if (tv->getMemoryType() != MemoryType::Global) { + IterDomain* id_to_move_back = nullptr; + // Vectorized load + if (tv->definition() != nullptr && tv->definition()->isA() && + tv->definition()->as()->opType() == + LoadStoreOpType::Set) { + auto vec_it = std::find_if( + allocation_domains.begin(), + allocation_domains.end(), + [](auto index_domain) -> bool { + return isParallelTypeVectorize(index_domain->getParallelType()); + }); + if (vec_it != allocation_domains.end() && + *vec_it != allocation_domains.back()) { + id_to_move_back = *vec_it; + } + } else { + for (const auto ls_use : + ir_utils::filterByType(tv->uses())) { + if (ls_use->opType() != LoadStoreOpType::Set) { + continue; + } + auto consumer_tv = ls_use->out()->as(); + auto vec_it = std::find_if( + consumer_tv->getLeafDomain().begin(), + consumer_tv->getLeafDomain().end(), + [](auto consumer_leaf_id) -> bool { + return isParallelTypeVectorize( + consumer_leaf_id->getParallelType()); + }); + if (vec_it == consumer_tv->getLeafDomain().end()) { + continue; + } + const auto& vec_group = + id_model.idGraph(IdMappingMode::EXACT).toGroup(*vec_it); + auto index_it = std::find_if( + allocation_domains.begin(), + allocation_domains.end(), + [&](auto index_id) -> bool { return vec_group->has(index_id); }); + if (index_it == allocation_domains.end() || + *index_it == allocation_domains.back()) { + continue; + } + + id_to_move_back = *index_it; + } + } + + if (id_to_move_back != nullptr) { + // reorder the vec domain to the end of the index domains + std::vector reordered_index_domains; + reordered_index_domains.reserve(allocation_domains.size()); + for (const auto id : allocation_domains) { + if (id != 
id_to_move_back) { + reordered_index_domains.push_back(id); + } + } + reordered_index_domains.push_back(id_to_move_back); + allocation_domains = reordered_index_domains; + } + } + } + + auto tv_for_promotion = tv; + + std::vector strides(allocation_domains.size(), nullptr); + for (const auto i : c10::irange(allocation_domains.size())) { + auto dim = allocation_domains.size() - i - 1; + auto index_domain = allocation_domains.at(dim); + + if (index_domain->isReduction() || index_domain->isBroadcast()) { + continue; + } + + strides[dim] = getAllocationStride(tv, (int64_t)dim); + } + + std::vector actual_index_domains; + std::vector actual_strides; + std::vector actual_contiguity; + for (const auto i : c10::irange(allocation_domains.size())) { + auto index_domain = allocation_domains.at(i); + if (!isAllocated(index_domain, tv)) { + continue; + } + + // If it's a leaf domain, the promoted domain is the true domain + // for allocation and indexing. + bool is_leaf = std::find( + tv_for_promotion->getLeafDomain().begin(), + tv_for_promotion->getLeafDomain().end(), + index_domain) != tv->getLeafDomain().end(); + auto actual_id = + is_leaf ? 
getLoopPromotion(index_domain, id_model) : index_domain; + VERBOSE() << "Index domain: " << index_domain->toString() + << ", actual domain (promotion domain): " << actual_id->toString() + << std::endl; + + actual_index_domains.push_back(actual_id); + actual_strides.push_back(strides.at(i)); + NVF_ERROR( + actual_strides.back() != nullptr, + "Stride unknown for ", + index_domain->toString(), + " (promoted to ", + actual_id->toString(), + ")"); + } + + NVF_ERROR(actual_index_domains.size() == actual_strides.size()); + + return {actual_index_domains, actual_strides}; +} + +ExprPath getIndexingTraversalPath( + const Expr* expr, + const std::vector& from_domains, + const std::vector& to_domains, + const ValGraph& traversal_graph) { + VERBOSE() << "getIndexingTraversalPath: " << toDelimitedString(from_domains) + << " -> " << toDelimitedString(to_domains) << std::endl; + + auto indexing_path = IndexingTraversal::getExprsBetween( + from_domains, to_domains, traversal_graph); + + VERBOSE() << "Indexing path:\n"; + for (const auto& [expr_group, direction] : indexing_path) { + Expr* expr = expr_group->front(); + VERBOSE() << direction << " " << expr->toString(); + } + VERBOSE() << "--- path done ---\n"; + + return indexing_path; +} + +class IdGraphIndexCompute : public OptOutDispatch { + public: + IdGraphIndexCompute( + const ValGraph& exact_graph, + const std::unordered_map& initial_index_map) + : traversal_graph_(exact_graph), index_map_(initial_index_map) {} + + using OptOutDispatch::handle; + + void handle(Split* split) override; + + void handle(Merge* merge) override; + + bool isForward(Expr* expr) const; + + bool hasIndex(IterDomain* id) const; + + Val* getIndex(IterDomain* id) const; + + void setIndex(IterDomain* id, Val* idx); + + std::unordered_map indexMap() const { + return index_map_; + } + + void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + + dispatch(expr_group->front()); + } + + private: + const ValGraph& 
traversal_graph_; + std::unordered_map index_map_; +}; + +bool IdGraphIndexCompute::hasIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return true; + } + const ValGroup& id_group = traversal_graph_.toGroup(id); + return index_map_.find(id_group) != index_map_.end(); +} + +Val* IdGraphIndexCompute::getIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return id->fusion()->zeroVal(); + } + const ValGroup& id_group = traversal_graph_.toGroup(id); + auto it = index_map_.find(id_group); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; +} + +void IdGraphIndexCompute::setIndex(IterDomain* id, Val* idx) { + VERBOSE() << "setIndex: " << id->name() << " -> " << idx->toInlineString() + << std::endl; + const ValGroup& id_group = traversal_graph_.toGroup(id); + index_map_.emplace(id_group, idx); +} + +bool IdGraphIndexCompute::isForward(Expr* expr) const { + bool ready = true; + for (const auto inp : ir_utils::filterByType(expr->inputs())) { + if (!hasIndex(inp)) { + VERBOSE() << "No index for input: " << inp->toString() << std::endl; + ready = false; + break; + } + } + if (ready) { + return true; + } + + // Can just return false here. Just make sure the outputs are + // already processed + for (const auto out : ir_utils::filterByType(expr->outputs())) { + NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); + } + + return false; +} + +void IdGraphIndexCompute::handle(Split* split) { + const bool is_forward = isForward(split); + + VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? 
"fwd" : "bwd") + << "): " << split->toString(); + + if (is_forward) { + auto in_idx = getIndex(split->in()); + auto inner_extent = split->inner()->extent(); + auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); + Val* inner_idx = nullptr; + inner_idx = SimplifyingIrBuilder::modExpr(in_idx, inner_extent); + setIndex(split->outer(), outer_idx); + setIndex(split->inner(), inner_idx); + } else { + auto outer_idx = getIndex(split->outer()); + auto inner_idx = getIndex(split->inner()); + auto inner_extent = split->inner()->extent(); + auto in_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_extent), inner_idx); + setIndex(split->in(), in_idx); + } +} + +void IdGraphIndexCompute::handle(Merge* merge) { + const bool is_forward = isForward(merge); + + VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") + << "): " << merge->toString(); + + // TODO: use getMaybeExpandedExtent? + auto inner_ext = merge->inner()->extent(); + + if (is_forward) { + auto outer_idx = getIndex(merge->outer()); + auto inner_idx = getIndex(merge->inner()); + auto out_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_ext), inner_idx); + setIndex(merge->out(), out_idx); + } else { + auto out_idx = getIndex(merge->out()); + auto outer_idx = SimplifyingIrBuilder::divExpr(out_idx, inner_ext); + setIndex(merge->outer(), outer_idx); + Val* inner_idx = SimplifyingIrBuilder::modExpr(out_idx, inner_ext); + setIndex(merge->inner(), inner_idx); + } +} + +ParallelType getParallelType(const ValGroup& loop_group) { + ParallelType common_pt = ParallelType::Serial; + for (const auto val : *loop_group) { + auto pt = val->as()->getParallelType(); + if (common_pt == pt || pt == ParallelType::Serial) { + continue; + } else if (common_pt == ParallelType::Serial) { + common_pt = pt; + } else { + // Inconsistent parallelization + NVF_ERROR( + false, + "Inconsistent parallelization detected. 
", + "Known type: ", + common_pt, + "New type: ", + pt); + } + } + + return common_pt; +} + +kir::ForLoop* getForLoop( + IterDomain* loop_id, + const std::vector& for_loops, + const ValGraph& loop_graph) { + auto it = std::find_if( + for_loops.begin(), for_loops.end(), [&](kir::ForLoop* for_loop) -> bool { + IterDomain* for_loop_id = for_loop->iter_domain(); + return loop_graph.disjointValSets().strictAreMapped( + loop_id, for_loop_id); + }); + if (it != for_loops.end()) { + return *it; + } else { + return nullptr; + } +} + +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +void TensorIndexer::buildLoopIndexMap() { + if (id_model_.empty()) { + return; + } + + Fusion* fusion = id_model_.fusion(); + FusionGuard fg(fusion); + + auto shouldUseZeroIndex = [&](const ValGroup& loop_group) -> bool { + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + return false; + } + + // The device paralle type is not included in "isThread". We don't + // allocate any index variable for device-parallel domains. + if (isParallelTypeDeviceDim(ptype)) { + return true; + } + + // All loops in this set are non-parallel, non-concretized broadcast + // iterdomains, their "index variable" should be zero. + if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { + return val->as()->isBroadcast(); + })) { + VERBOSE() << "All domains are broadcast: " + << nvfuser::toString(loop_group) << std::endl; + return true; + } + + // Trivial loop + // TODO: consider expanded extent? 
+ auto leaf_id = + getLoopPromotion(loop_group->front()->as(), id_model_); + if (!leaf_id->maybePartial() && + simplifyExpr(leaf_id->extent())->isOneInt()) { + return true; + } + + return false; + }; + + for (auto expr : fusion->exprs()) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + auto tv_output = ir_utils::getTvOutput(expr); + for (auto leaf_id : tv_output->getLeafDomain()) { + const ValGroup& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(leaf_id); + + if (loop_index_map_.find(loop_group) != loop_index_map_.end()) { + // Index already assigned + continue; + } + + // TODO: halo loop not considered + // TODO: double buffering not considered + + Val* loop_index = nullptr; + + // First allocate thread and grid parallel indices: + // The validation pass will check that the parallel bindings within the + // loop nodes are consistent so all the loops within this disjoint set + // will be realized implicitly using parallel index variables. + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + loop_index = NamedScalar::getParallelIndex(ptype); + } else if (shouldUseZeroIndex(loop_group)) { + VERBOSE() << "Use zero for " << nvfuser::toString(loop_group) + << std::endl; + loop_index = fusion->zeroVal(); + } else { + // Everything now should be serial concrete loops. For the mean + // time, just use the same index integer val generated for + // ComputeAtMap if available. 
+ if (GpuLower::hasCurrent()) { + const auto& ca_map = GpuLower::current()->caMap(); + for (const auto& id : + ir_utils::filterByType(loop_group->vector())) { + if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { + continue; + } + VERBOSE() << "Trying to find index val for " << id->toString() + << std::endl; + loop_index = ca_map->getIndexVariable(id); + break; + } + if (loop_index == nullptr) { + VERBOSE() << "No existing index found for " + << nvfuser::toString(loop_group) << std::endl; + } + } else { + loop_index = IrBuilder::create(DataType::Index); + } + } + loop_index_map_[loop_group] = loop_index; + VERBOSE() << "Loop index map: " << nvfuser::toString(loop_group) << " -> " + << loop_index->toInlineString() << std::endl; + } + } +} + +Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { + // loop_id must be a loop domain. + const auto& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(loop_id); + auto loop_index_map_it = loop_index_map_.find(loop_group); + NVF_ERROR( + loop_index_map_it != loop_index_map_.end(), + "No loop index found for ", + loop_id->toString()); + + Val* loop_index = loop_index_map_it->second; + return loop_index; +} + +std::unordered_map TensorIndexer::getInitialIndexMap( + const Expr* expr, + const std::vector& loop_domains) const { + // loop_index_map_ is a map on the loop graph. For index + // propagation, need a map for the exact graph + + std::unordered_map initial_index_map; + + for (IterDomain* loop_id : loop_domains) { + Val* loop_index = getLoopIndex(loop_id); + const auto& exact_group = traversalGraph().toGroup(loop_id); + VERBOSE() << "Setting initial index. " << loop_id->toString() << ", " + << nvfuser::toString(exact_group) << ", " + << loop_index->toInlineString() << std::endl; + + if (initial_index_map.find(exact_group) != initial_index_map.end()) { + // Initial index already set. This can happen as exact_group is + // actually an almost-exact group. It should be just size-1 + // domain. 
+ NVF_ERROR( + loop_index->isZeroInt(), + "Unexpected initial index: ", + loop_index->toInlineString()); + auto existing_index = initial_index_map.at(exact_group); + NVF_ERROR( + existing_index->isZeroInt(), + "Unexpected initial index: ", + existing_index->toInlineString()); + continue; + } + + NVF_ERROR( + initial_index_map.emplace(exact_group, loop_index).second, + "Initial index already set for ", + nvfuser::toString(exact_group), + ". Existing: ", + initial_index_map.at(exact_group)->toInlineString(), + ". New: ", + loop_index->toInlineString()); + } + + return initial_index_map; +} + +Val* TensorIndexer::getTensorIndex( + TensorView* tv, + const Expr* expr, + const std::optional>& for_loops) { + VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); + + const auto [allocation_domains, strides] = + getAllocationDomains(tv, id_model_); + + VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) + << std::endl; + + const auto& index_info = getIndex(expr, allocation_domains); + const auto& index_map = index_info.index_map; + + // Linearize the indices with strides. 
+ // TODO: Contiguous indexing + + Val* index = tv->fusion()->zeroVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + // Traverse from innermost to outermost + IterDomain* allocation_domain = + allocation_domains.at(allocation_domains.size() - 1 - i); + + Val* stride = strides.at(allocation_domains.size() - 1 - i); + + auto idx_it = index_map.find(traversalGraph().toGroup(allocation_domain)); + NVF_ERROR( + idx_it != index_map.end(), + "Index not found for ", + allocation_domain->toString()); + Val* idx = idx_it->second; + VERBOSE() << "Index of " << allocation_domain->toString() << ": " + << idx->toInlineString() << std::endl; + + index = SimplifyingIrBuilder::addExpr( + index, SimplifyingIrBuilder::mulExpr(idx, stride)); + } + + VERBOSE() << "Final index: " << index->toInlineString() << std::endl; + + return index; +} + +IndexingInfo TensorIndexer::getIndex( + const Expr* expr, + const std::vector& index_domains) const { + const auto loop_domains = getLoopDomains(expr, id_model_); + VERBOSE() << "Loop domains: " << toDelimitedString(loop_domains) << std::endl; + + VERBOSE() << "Index domains: " << toDelimitedString(index_domains) + << std::endl; + + const ExprPath traversal_path = getIndexingTraversalPath( + expr, loop_domains, index_domains, traversalGraph()); + + const std::unordered_map initial_index_map = + getInitialIndexMap(expr, loop_domains); + + IdGraphIndexCompute index_compute(traversalGraph(), initial_index_map); + + for (const auto& [expr_group, direction] : traversal_path) { + index_compute.propagate(expr_group, direction); + } + + IndexingInfo info{traversal_path, index_compute.indexMap()}; + return info; +} + +} // namespace nvfuser diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h new file mode 100644 index 00000000000..dfd7c847964 --- /dev/null +++ b/csrc/id_model/indexing.h @@ -0,0 +1,80 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. 
+ * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include +#include +#include +#include +#include + +// Just for RootPredicateInfo. Should be moved to its own header file +#include + +#include + +namespace nvfuser { + +struct IndexingInfo { + ExprPath traversal_path; + std::unordered_map index_map; +}; + +// The basic algorithm of indexing is: +// +// 1. Find the loop domains +// 2. Find the allocation domains +// 3. Find the path from the loop domains to the allocation domains +// 4. Set the initial index vals +// 5. Propagate the initial indices of the loop domains to the allocation +// domains +class TensorIndexer { + public: + TensorIndexer(const IdModel& id_model); + + // The actual ForLoop's are required to support double buffering as + // that affects indexing. If the loops parameter is empty, it's + // simply ignored. That may be useful if (preliminary) indeices are + // needed before the double buffering pass + Val* getTensorIndex( + TensorView* tv, + const Expr* expr, + const std::optional>& loops); + + private: + const ValGraph& traversalGraph() const { + return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); + } + + // Build the map of loop groups to their index Vals. + void buildLoopIndexMap(); + + // Get the index of a loop domain. + Val* getLoopIndex(IterDomain* loop_id) const; + + // + std::unordered_map getInitialIndexMap( + const Expr* expr, + const std::vector& loop_domains) const; + + IndexingInfo getIndex( + const Expr* expr, + const std::vector& index_domains) const; + + private: + const IdModel& id_model_; + + // Mappings from loop groups to their indices. Serial loops will + // be mapped a unique loop index Val. Parallel loops will be mapped + // to NamedScalar such as "threadIdx.x". This map needs to be built + // once and can be reused for different tensors. 
+ std::unordered_map loop_index_map_; +}; + +} // namespace nvfuser diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h new file mode 100644 index 00000000000..2d6327bf586 --- /dev/null +++ b/csrc/id_model/utils.h @@ -0,0 +1,55 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +#include +#include +#include + +#define VERBOSE() verbose(__LINE__) +#define WARN() warn(__LINE__) + +namespace nvfuser { + +// Temporary logging utility +class DebugStream { + public: + DebugStream() + : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} + + template + DebugStream& operator<<(const T& v) { + if (enabled_) { + out_ << v; + } + return *this; + } + + DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { + if (enabled_) { + out_ << endl; + } + return *this; + } + + private: + bool enabled_ = false; + std::ostream& out_; +}; + +inline DebugStream verbose(int line) { + return DebugStream() << "[DEBUG@" << line << "] "; +} + +inline DebugStream warn(int line) { + return DebugStream() << "[WARN@" << line << "] "; +} + +} // namespace nvfuser diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index a4b5454fa2c..4323cc50e33 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -654,4 +654,20 @@ std::optional> computePermutation( bool hasTrivialAllocationDomain(const TensorView* tv); +// Returns true if memory_type is shared in parallel_type +inline bool isShared(MemoryType memory_type, ParallelType parallel_type) { + switch (memory_type) { + case MemoryType::Local: + return !isParallelTypeThread(parallel_type) && + !isParallelTypeDeviceDim(parallel_type); + case MemoryType::Shared: + return !isParallelTypeBlockDim(parallel_type) && + !isParallelTypeDeviceDim(parallel_type); + case MemoryType::Global: + return !isParallelTypeDeviceDim(parallel_type); + default: + 
NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + } // namespace nvfuser::ir_utils diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp new file mode 100644 index 00000000000..be268dcfb85 --- /dev/null +++ b/tests/cpp/test_indexing.cpp @@ -0,0 +1,123 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +using IndexingTest = NVFuserTest; + +TEST_F(IndexingTest, Simple1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + fusion.addOutput(tv1); + + tv1->merge(0); + tv1->split(0, 4); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::cerr << indexer.getTensorIndex(tv1, tv1->definition(), std::nullopt) + ->toInlineString() + << std::endl; + std::cerr << indexer.getTensorIndex(tv0, tv1->definition(), std::nullopt) + ->toInlineString() + << std::endl; +} + +TEST_F(IndexingTest, Simple2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv0, IrBuilder::create(1.0)); + fusion.addOutput(tv1); + fusion.addOutput(tv2); + + tv1->split(0, 4); + tv2->split(0, 8); + + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv2->axis(1)->parallelize(ParallelType::TIDx); + + fusion.printKernel(); + + std::vector input_shape{17}; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(input_shape, options); + std::vector aten_inputs({t0}); + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto cg_outputs = fe.runFusion(aten_inputs); + + 
testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); +} + +TEST_F(IndexingTest, Reshape) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape1({100}); + const std::vector shape2({4, 25}); + const std::vector shape3({5, 2, 10}); + + // [i0] + auto tv0 = makeContigConcreteTensor(shape1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + + // [i2, i3] + auto tv2 = reshape(tv1, shape1, shape2); + + // [i2, i3] + auto tv3 = add(tv2, fusion.oneVal()); + + // [i4, i5, i6] + auto tv4 = reshape(tv3, shape2, shape3); + + // [i4, i5, i6] + auto tv5 = add(tv4, fusion.oneVal()); + + fusion.addOutput(tv5); + + TransformPropagator propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + + inlineMost(); + + fusion.print(); + + IdModel id_model(&fusion, true, false, false); +} + +} // namespace nvfuser From f333d42e92fe88333648923c09958067348bb9e4 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 4 Jun 2024 14:16:47 -0700 Subject: [PATCH 02/22] cleanup --- csrc/id_model/indexing.cpp | 378 ++++++++++-------------------------- csrc/id_model/indexing.h | 40 ++-- tests/cpp/test_indexing.cpp | 4 +- 3 files changed, 136 insertions(+), 286 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 9a912b4b4b1..0e06feb9e34 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -25,43 +25,6 @@ namespace nvfuser { namespace { -class IndexingTraversal : public ValGraphBFS { - public: - IndexingTraversal( - const ValGraph& graph, - std::vector from_groups, - std::vector to_groups) - : ValGraphBFS(graph, from_groups, to_groups) {} - - virtual ~IndexingTraversal() = default; - - static ExprPath getExprsBetween( - const std::vector& from_domains, - const std::vector& to_domains, - const ValGraph& graph) { - const ValGroups from_groups = graph.toGroups(from_domains); - const ValGroups to_groups = graph.toGroups(to_domains); - - IndexingTraversal traversal( - graph, - 
{from_groups.vector().begin(), from_groups.vector().end()}, - {to_groups.vector().begin(), to_groups.vector().end()}); - traversal.traverse(); - return traversal.getShortestExprPath(); - } - - using ValGraphBFS::isVisited; - - bool isDependencySatisfied(const GroupType& group) const override { - if (const ValGroup* vg = std::get_if(&group); - vg != nullptr && (*vg)->front()->as()->isBroadcast()) { - VERBOSE() << "Dependency satisfied as it's broadcast" << std::endl; - return true; - } - return ValGraphBFS::isDependencySatisfied(group); - } -}; - // Get the promotion domain of a given loop domain. IterDomain* getLoopPromotion(IterDomain* loop_id, const IdModel& id_model) { const auto& loop_graph = id_model.idGraph(IdMappingMode::LOOP); @@ -138,7 +101,7 @@ bool isAllocationBasedOnLeaf(TensorView* tv) { } // Get the allocation domains of a given tensor. Also returns its -// strides as well as +// strides. std::tuple, std::vector> getAllocationDomains( TensorView* tv, const IdModel& id_model) { @@ -197,78 +160,6 @@ std::tuple, std::vector> getAllocationDomains( } } - // TODO: Fix alloation domains with vectorization - // This is an ugly workaround, but the allocation domain of a tensor - // with vectorized domains may not be the same as the leaf fomain - // since the vectorized domain must be at the innermost position in - // the allocation domain, but it's allowed to be located anywhwere - // in the leaf domain. 
- // This shouldn't be necessary for global memory tensors as their - // allocation domains are rfactor domains - { - if (tv->getMemoryType() != MemoryType::Global) { - IterDomain* id_to_move_back = nullptr; - // Vectorized load - if (tv->definition() != nullptr && tv->definition()->isA() && - tv->definition()->as()->opType() == - LoadStoreOpType::Set) { - auto vec_it = std::find_if( - allocation_domains.begin(), - allocation_domains.end(), - [](auto index_domain) -> bool { - return isParallelTypeVectorize(index_domain->getParallelType()); - }); - if (vec_it != allocation_domains.end() && - *vec_it != allocation_domains.back()) { - id_to_move_back = *vec_it; - } - } else { - for (const auto ls_use : - ir_utils::filterByType(tv->uses())) { - if (ls_use->opType() != LoadStoreOpType::Set) { - continue; - } - auto consumer_tv = ls_use->out()->as(); - auto vec_it = std::find_if( - consumer_tv->getLeafDomain().begin(), - consumer_tv->getLeafDomain().end(), - [](auto consumer_leaf_id) -> bool { - return isParallelTypeVectorize( - consumer_leaf_id->getParallelType()); - }); - if (vec_it == consumer_tv->getLeafDomain().end()) { - continue; - } - const auto& vec_group = - id_model.idGraph(IdMappingMode::EXACT).toGroup(*vec_it); - auto index_it = std::find_if( - allocation_domains.begin(), - allocation_domains.end(), - [&](auto index_id) -> bool { return vec_group->has(index_id); }); - if (index_it == allocation_domains.end() || - *index_it == allocation_domains.back()) { - continue; - } - - id_to_move_back = *index_it; - } - } - - if (id_to_move_back != nullptr) { - // reorder the vec domain to the end of the index domains - std::vector reordered_index_domains; - reordered_index_domains.reserve(allocation_domains.size()); - for (const auto id : allocation_domains) { - if (id != id_to_move_back) { - reordered_index_domains.push_back(id); - } - } - reordered_index_domains.push_back(id_to_move_back); - allocation_domains = reordered_index_domains; - } - } - } - auto 
tv_for_promotion = tv; std::vector strides(allocation_domains.size(), nullptr); @@ -320,34 +211,26 @@ std::tuple, std::vector> getAllocationDomains( return {actual_index_domains, actual_strides}; } -ExprPath getIndexingTraversalPath( - const Expr* expr, - const std::vector& from_domains, - const std::vector& to_domains, - const ValGraph& traversal_graph) { - VERBOSE() << "getIndexingTraversalPath: " << toDelimitedString(from_domains) - << " -> " << toDelimitedString(to_domains) << std::endl; - - auto indexing_path = IndexingTraversal::getExprsBetween( - from_domains, to_domains, traversal_graph); - - VERBOSE() << "Indexing path:\n"; - for (const auto& [expr_group, direction] : indexing_path) { - Expr* expr = expr_group->front(); - VERBOSE() << direction << " " << expr->toString(); - } - VERBOSE() << "--- path done ---\n"; - - return indexing_path; -} - +// Similar to IndexCompute but adapted for the graph-based indexing class IdGraphIndexCompute : public OptOutDispatch { public: IdGraphIndexCompute( - const ValGraph& exact_graph, + const ValGraph& traversal_graph, const std::unordered_map& initial_index_map) - : traversal_graph_(exact_graph), index_map_(initial_index_map) {} + : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + + // Propagate the index map through a given expr of a specified + // direction. + void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + dispatch(expr_group->front()); + } + + const std::unordered_map indexMap() const { + return index_map_; + } + private: using OptOutDispatch::handle; void handle(Split* split) override; @@ -356,20 +239,30 @@ class IdGraphIndexCompute : public OptOutDispatch { bool isForward(Expr* expr) const; - bool hasIndex(IterDomain* id) const; - - Val* getIndex(IterDomain* id) const; - - void setIndex(IterDomain* id, Val* idx); + bool hasIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. 
+ if (id->isBroadcast()) { + return true; + } + return indexMap().find(toGroup(id)) != indexMap().end(); + } - std::unordered_map indexMap() const { - return index_map_; + Val* getIndex(IterDomain* id) const { + // If it's a broadcast, its index is always zero. + if (id->isBroadcast()) { + return id->fusion()->zeroVal(); + } + auto it = index_map_.find(toGroup(id)); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; } - void propagate(const ExprGroup& expr_group, Direction direction) { - NVF_ERROR(!expr_group->empty()); + void setIndex(IterDomain* id, Val* idx) { + index_map_.emplace(toGroup(id), idx); + } - dispatch(expr_group->front()); + const ValGroup& toGroup(IterDomain* id) const { + return traversal_graph_.toGroup(id); } private: @@ -377,33 +270,7 @@ class IdGraphIndexCompute : public OptOutDispatch { std::unordered_map index_map_; }; -bool IdGraphIndexCompute::hasIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return true; - } - const ValGroup& id_group = traversal_graph_.toGroup(id); - return index_map_.find(id_group) != index_map_.end(); -} - -Val* IdGraphIndexCompute::getIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. 
- if (id->isBroadcast()) { - return id->fusion()->zeroVal(); - } - const ValGroup& id_group = traversal_graph_.toGroup(id); - auto it = index_map_.find(id_group); - NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); - return it->second; -} - -void IdGraphIndexCompute::setIndex(IterDomain* id, Val* idx) { - VERBOSE() << "setIndex: " << id->name() << " -> " << idx->toInlineString() - << std::endl; - const ValGroup& id_group = traversal_graph_.toGroup(id); - index_map_.emplace(id_group, idx); -} - +// TODO: Should use the explicit direction bool IdGraphIndexCompute::isForward(Expr* expr) const { bool ready = true; for (const auto inp : ir_utils::filterByType(expr->inputs())) { @@ -456,7 +323,6 @@ void IdGraphIndexCompute::handle(Merge* merge) { VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") << "): " << merge->toString(); - // TODO: use getMaybeExpandedExtent? auto inner_ext = merge->inner()->extent(); if (is_forward) { @@ -474,6 +340,13 @@ void IdGraphIndexCompute::handle(Merge* merge) { } } +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +namespace { ParallelType getParallelType(const ValGroup& loop_group) { ParallelType common_pt = ParallelType::Serial; for (const auto val : *loop_group) { @@ -496,71 +369,14 @@ ParallelType getParallelType(const ValGroup& loop_group) { return common_pt; } - -kir::ForLoop* getForLoop( - IterDomain* loop_id, - const std::vector& for_loops, - const ValGraph& loop_graph) { - auto it = std::find_if( - for_loops.begin(), for_loops.end(), [&](kir::ForLoop* for_loop) -> bool { - IterDomain* for_loop_id = for_loop->iter_domain(); - return loop_graph.disjointValSets().strictAreMapped( - loop_id, for_loop_id); - }); - if (it != for_loops.end()) { - return *it; - } else { - return nullptr; - } -} - } // namespace -TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { - buildLoopIndexMap(); -} - 
void TensorIndexer::buildLoopIndexMap() { if (id_model_.empty()) { return; } Fusion* fusion = id_model_.fusion(); - FusionGuard fg(fusion); - - auto shouldUseZeroIndex = [&](const ValGroup& loop_group) -> bool { - ParallelType ptype = getParallelType(loop_group); - if (isParallelTypeThread(ptype)) { - return false; - } - - // The device paralle type is not included in "isThread". We don't - // allocate any index variable for device-parallel domains. - if (isParallelTypeDeviceDim(ptype)) { - return true; - } - - // All loops in this set are non-parallel, non-concretized broadcast - // iterdomains, their "index variable" should be zero. - if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { - return val->as()->isBroadcast(); - })) { - VERBOSE() << "All domains are broadcast: " - << nvfuser::toString(loop_group) << std::endl; - return true; - } - - // Trivial loop - // TODO: consider expanded extent? - auto leaf_id = - getLoopPromotion(loop_group->front()->as(), id_model_); - if (!leaf_id->maybePartial() && - simplifyExpr(leaf_id->extent())->isOneInt()) { - return true; - } - - return false; - }; for (auto expr : fusion->exprs()) { if (!ir_utils::isTvOp(expr)) { @@ -576,9 +392,6 @@ void TensorIndexer::buildLoopIndexMap() { continue; } - // TODO: halo loop not considered - // TODO: double buffering not considered - Val* loop_index = nullptr; // First allocate thread and grid parallel indices: @@ -589,8 +402,6 @@ void TensorIndexer::buildLoopIndexMap() { if (isParallelTypeThread(ptype)) { loop_index = NamedScalar::getParallelIndex(ptype); } else if (shouldUseZeroIndex(loop_group)) { - VERBOSE() << "Use zero for " << nvfuser::toString(loop_group) - << std::endl; loop_index = fusion->zeroVal(); } else { // Everything now should be serial concrete loops. 
For the mean @@ -603,26 +414,63 @@ void TensorIndexer::buildLoopIndexMap() { if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { continue; } - VERBOSE() << "Trying to find index val for " << id->toString() - << std::endl; loop_index = ca_map->getIndexVariable(id); break; } - if (loop_index == nullptr) { - VERBOSE() << "No existing index found for " - << nvfuser::toString(loop_group) << std::endl; - } + NVF_ERROR(loop_index != nullptr, + "No existing index found for ", + nvfuser::toString(loop_group)); } else { + // Not reusing the ComputeATMap index assignments loop_index = IrBuilder::create(DataType::Index); } } + + NVF_ERROR(loop_index != nullptr); loop_index_map_[loop_group] = loop_index; - VERBOSE() << "Loop index map: " << nvfuser::toString(loop_group) << " -> " - << loop_index->toInlineString() << std::endl; } } } +bool TensorIndexer::shouldUseZeroIndex(const ValGroup& loop_group) const { + // For parallelized domains that have index NamedScalar's such as + // threadIdx.x, just use the NamedScalar. It doesn't automatically + // mean such parallel indices are actually used in the final index + // expr. For example, TID-parallelized Local tensors won't have + // TID-parallelized iter domains as allocation domains, so threadIdx + // won't appear in the final index expr. + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + return false; + } + + // Note that the device paralle type is not included in + // "isThread". This is necessary because we don't have a NamedScalar + // for DID. Since it's always partitioned in any memory space + // currently supported, it's guaranteed to be zero. + if (isParallelTypeDeviceDim(ptype)) { + return true; + } + + // All loops in this set are non-parallel, non-concretized broadcast + // iterdomains, their "index variable" should be zero. 
+ if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { + return val->as()->isBroadcast(); + })) { + return true; + } + + // Trivial loop + auto leaf_id = + getLoopPromotion(loop_group->front()->as(), id_model_); + if (!leaf_id->maybePartial() && + simplifyExpr(leaf_id->extent())->isOneInt()) { + return true; + } + + return false; +} + Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { // loop_id must be a loop domain. const auto& loop_group = @@ -638,29 +486,23 @@ Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { } std::unordered_map TensorIndexer::getInitialIndexMap( - const Expr* expr, const std::vector& loop_domains) const { - // loop_index_map_ is a map on the loop graph. For index - // propagation, need a map for the exact graph - std::unordered_map initial_index_map; + // For a given list of the loop domains, assign its corresponding + // index Val. for (IterDomain* loop_id : loop_domains) { Val* loop_index = getLoopIndex(loop_id); - const auto& exact_group = traversalGraph().toGroup(loop_id); - VERBOSE() << "Setting initial index. " << loop_id->toString() << ", " - << nvfuser::toString(exact_group) << ", " - << loop_index->toInlineString() << std::endl; - - if (initial_index_map.find(exact_group) != initial_index_map.end()) { - // Initial index already set. This can happen as exact_group is - // actually an almost-exact group. It should be just size-1 - // domain. + const auto& almost_exact_group = traversalGraph().toGroup(loop_id); + + if (initial_index_map.find(almost_exact_group) != initial_index_map.end()) { + // Initial index already set. This can happen as this is an + // almost exact group. It should be just size-1 domain. 
NVF_ERROR( loop_index->isZeroInt(), "Unexpected initial index: ", loop_index->toInlineString()); - auto existing_index = initial_index_map.at(exact_group); + auto existing_index = initial_index_map.at(almost_exact_group); NVF_ERROR( existing_index->isZeroInt(), "Unexpected initial index: ", @@ -668,23 +510,15 @@ std::unordered_map TensorIndexer::getInitialIndexMap( continue; } - NVF_ERROR( - initial_index_map.emplace(exact_group, loop_index).second, - "Initial index already set for ", - nvfuser::toString(exact_group), - ". Existing: ", - initial_index_map.at(exact_group)->toInlineString(), - ". New: ", - loop_index->toInlineString()); + initial_index_map.emplace(almost_exact_group, loop_index); } return initial_index_map; } -Val* TensorIndexer::getTensorIndex( +Val* TensorIndexer::getLinearIndex( TensorView* tv, - const Expr* expr, - const std::optional>& for_loops) { + const Expr* expr) { VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); const auto [allocation_domains, strides] = @@ -693,7 +527,7 @@ Val* TensorIndexer::getTensorIndex( VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) << std::endl; - const auto& index_info = getIndex(expr, allocation_domains); + const auto& index_info = computeIndex(expr, allocation_domains); const auto& index_map = index_info.index_map; // Linearize the indices with strides. 
@@ -725,7 +559,7 @@ Val* TensorIndexer::getTensorIndex( return index; } -IndexingInfo TensorIndexer::getIndex( +IndexingInfo TensorIndexer::computeIndex( const Expr* expr, const std::vector& index_domains) const { const auto loop_domains = getLoopDomains(expr, id_model_); @@ -734,11 +568,13 @@ IndexingInfo TensorIndexer::getIndex( VERBOSE() << "Index domains: " << toDelimitedString(index_domains) << std::endl; - const ExprPath traversal_path = getIndexingTraversalPath( - expr, loop_domains, index_domains, traversalGraph()); + const ValGroups loop_groups = traversalGraph().toGroups(loop_domains); + const ValGroups index_groups = traversalGraph().toGroups(index_domains); + const ExprPath traversal_path = ValGraphBFS::getExprsBetween( + traversalGraph(), loop_groups, index_groups); const std::unordered_map initial_index_map = - getInitialIndexMap(expr, loop_domains); + getInitialIndexMap(loop_domains); IdGraphIndexCompute index_compute(traversalGraph(), initial_index_map); diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h index dfd7c847964..180a289fa78 100644 --- a/csrc/id_model/indexing.h +++ b/csrc/id_model/indexing.h @@ -22,7 +22,9 @@ namespace nvfuser { struct IndexingInfo { + // Indexing traversal path from loop domains ExprPath traversal_path; + // Index mappings of ID groups along the traversal path std::unordered_map index_map; }; @@ -31,42 +33,54 @@ struct IndexingInfo { // 1. Find the loop domains // 2. Find the allocation domains // 3. Find the path from the loop domains to the allocation domains -// 4. Set the initial index vals +// 4. Set the initial index vals for the loop domains // 5. Propagate the initial indices of the loop domains to the allocation // domains +// +// The indexing traversal is done on the AlmostExact graph augmented +// with the loop promotion map since both the loop and allocations +// domains may be promoted. 
class TensorIndexer { public: TensorIndexer(const IdModel& id_model); - // The actual ForLoop's are required to support double buffering as - // that affects indexing. If the loops parameter is empty, it's - // simply ignored. That may be useful if (preliminary) indeices are - // needed before the double buffering pass - Val* getTensorIndex( - TensorView* tv, - const Expr* expr, - const std::optional>& loops); + // Get a linear index of a given tensor appearing in a given expr, either + // as a consumer or a producer. The predicate indexing will have a + // separate interface. + Val* getLinearIndex(TensorView* tv, const Expr* expr); private: + // The AlmostExact graph is used since size-1 splits and merges + // should not affect actual index exprs. const ValGraph& traversalGraph() const { return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); } - // Build the map of loop groups to their index Vals. + // Build a map of loop groups to their index Vals. See the comment + // on loop_index_map_. void buildLoopIndexMap(); // Get the index of a loop domain. Val* getLoopIndex(IterDomain* loop_id) const; - // + // Propagate the loop indices of a given list of loop domains to the + // traversal graph (i.e., the AlmostExact graph). Uses the loop + // index map, which is built for the Loop graph. std::unordered_map getInitialIndexMap( - const Expr* expr, const std::vector& loop_domains) const; - IndexingInfo getIndex( + // Returns the index map as well as its traversal path of given + // index domains appearing in a given expr. Used by + // getLinearIndex. + IndexingInfo computeIndex( const Expr* expr, const std::vector& index_domains) const; + // Check if the loop index of a a loop group should be always + // just zero. For example, a loop group with an extent of one, i.e., + // a broadcast-only loop group, should just use zero. 
+ bool shouldUseZeroIndex(const ValGroup& loop_group) const; + private: const IdModel& id_model_; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index be268dcfb85..c0f02a04d41 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -41,10 +41,10 @@ TEST_F(IndexingTest, Simple1) { IdModel id_model(&fusion); TensorIndexer indexer(id_model); - std::cerr << indexer.getTensorIndex(tv1, tv1->definition(), std::nullopt) + std::cerr << indexer.getLinearIndex(tv1, tv1->definition()) ->toInlineString() << std::endl; - std::cerr << indexer.getTensorIndex(tv0, tv1->definition(), std::nullopt) + std::cerr << indexer.getLinearIndex(tv0, tv1->definition()) ->toInlineString() << std::endl; } From 7cc7acf42d9d2b2f12ba76a4c573d2b70b594db8 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Tue, 4 Jun 2024 14:17:54 -0700 Subject: [PATCH 03/22] cleanup --- csrc/id_model/indexing.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 0e06feb9e34..a3524b75dbf 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -240,18 +240,10 @@ class IdGraphIndexCompute : public OptOutDispatch { bool isForward(Expr* expr) const; bool hasIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return true; - } return indexMap().find(toGroup(id)) != indexMap().end(); } Val* getIndex(IterDomain* id) const { - // If it's a broadcast, its index is always zero. - if (id->isBroadcast()) { - return id->fusion()->zeroVal(); - } auto it = index_map_.find(toGroup(id)); NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); return it->second; From 39ad06a0916071a15e13d082f9f5501793f7df6e Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 30 May 2024 16:50:39 -0700 Subject: [PATCH 04/22] Initial PR of IdModel-based indexing Only includes minimum logic for basic indexing. 
Notably, no support for broadcast in this PR. --- CMakeLists.txt | 2 + csrc/device_lower/lower2device.cpp | 2 +- csrc/device_lower/utils.cpp | 22 ++ csrc/device_lower/utils.h | 2 + csrc/id_model/id_model.h | 9 + csrc/id_model/indexing.cpp | 443 +++++++++++++++++++++++++++++ csrc/id_model/indexing.h | 100 +++++++ csrc/id_model/utils.h | 55 ++++ csrc/ir/utils.h | 36 +++ tests/cpp/test_indexing.cpp | 384 +++++++++++++++++++++++++ 10 files changed, 1054 insertions(+), 1 deletion(-) create mode 100644 csrc/id_model/indexing.cpp create mode 100644 csrc/id_model/indexing.h create mode 100644 csrc/id_model/utils.h create mode 100644 tests/cpp/test_indexing.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ed87fd7bbbe..6be651ca58c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp + ${NVFUSER_SRCS_DIR}/id_model/indexing.cpp ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp ${NVFUSER_SRCS_DIR}/id_model/schedule.cpp ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp @@ -506,6 +507,7 @@ list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/tests/cpp/test_gpu_transpose.cpp ${NVFUSER_ROOT}/tests/cpp/test_gpu_utils.cpp ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp + ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp ${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index c50f160eb51..1a464417631 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (isOptionEnabled(EnableOption::IdModel)) { + if (true || isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 955c1513dd3..bbf0526e461 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,6 +920,28 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } +// Returns true if expr is an expression that initializes a reduction +// buffer. +bool isReductionInitExpr(const Expr* expr) { + // False if its output isn't a TensorView + if (!ir_utils::isTvOp(expr)) { + return false; + } + // False if it doesn't have any reduction axis + const auto out_tv = ir_utils::getTvOutput(expr); + if (!out_tv->domain()->hasReduction()) { + return false; + } + // False if it has have TensorView inputs as initialization should + // never use TensorViews + const auto tv_filter_inp_view = + ir_utils::filterByType(expr->inputs()); + if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { + return false; + } + return true; +} + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index 60f3459814a..cad0ae5b996 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,6 +322,8 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); +bool isReductionInitExpr(const Expr* expr); + } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 5b60ff474c6..b912978ca48 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -153,6 +153,15 @@ class IdModel : public PolymorphicBase { std::string toString() const; + bool empty() const { + return tvs_.empty(); + } + + Fusion* fusion() const { + NVF_ERROR(!tvs_.empty()); + return tvs_.at(0)->fusion(); + } + // Build all graphs, i.e., Exact, AlmostExact, Permissive and // LOOP. This is by default called from the constructor void buildAllGraphs(); diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp new file mode 100644 index 00000000000..c80b6c3548d --- /dev/null +++ b/csrc/id_model/indexing.cpp @@ -0,0 +1,443 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nvfuser { + +namespace { + +// True if a given domain is a loop doamin of a given tensor and its +// loop is partitioned with respect to the memory type of the tensor +bool isPartitionedLoop(TensorView* tv, IterDomain* id) { + // False if id is not a loop ID + if (std::find(tv->getLeafDomain().begin(), tv->getLeafDomain().end(), id) == + tv->getLeafDomain().end()) { + return false; + } + + // If the memory of this domain is partitioned with respect to the + // parallel type of the domain, there's no allocation for the domain + return ir_utils::isPartitionedMemory( + tv->getMemoryType(), id->getParallelType()); +} + +bool isSizeOneDomain(IterDomain* id) { + return id->isBroadcast() || id->isReduction() || id->extent()->isOneInt(); +} + +// True if a given domain of a tensor *may* require allocation +bool mayRequireAllocation(TensorView* tv, IterDomain* id) { + return !isPartitionedLoop(tv, id) && !isSizeOneDomain(id); +} + +// Get the allocation stride of a given allocation domain +Val* getStrideOfGlobalMemoryTensor(TensorView* tv, int64_t alloc_dim) { + NVF_ERROR(tv->getMemoryType() == MemoryType::Global); + + // Allocation domains can include reduction domains, but + // alloc_stride arraies do not. + const auto& alloc_dom = tv->getMaybeAllocationDomain(); + int64_t stride_dim = -1; + for (const auto i : c10::irange(alloc_dim + 1)) { + if (alloc_dom.at(i)->isReduction()) { + continue; + } + ++stride_dim; + } + + NVF_ERROR(stride_dim != -1); + + return IrBuilder::getItemExpr( + IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "alloc_stride"), + stride_dim); +} + +// Get the allocation domains of a given tensor. Also returns its +// strides. 
+// +// TODO: Ideally, all tensors should have their correct allocation +// domains, but that isn't always the case at this moment. The logic +// here is duplicated in multiple locations and should be cleaned up. +std::tuple, std::vector> getAllocationDomains( + TensorView* tv, + const IdModel& id_model) { + std::vector allocation_domains; + std::vector> contiguity; + + // Use the allocation domain if set for the tensor + if (tv->hasAllocation()) { + allocation_domains = tv->getAllocationDomain(); + contiguity = tv->domain()->contiguity(); + } else { + // If allocation domain is not set, assume that: + // - Global: logical domains + // - Local/Shared: loop domains to the right of the CA position + const auto inlining_pos = tv->getComputeAtPosition(); + if (tv->getMemoryType() == MemoryType::Global) { + allocation_domains = tv->getRFactorDomain(); + contiguity = tv->domain()->contiguity(); + } else { + for (const auto i : c10::irange(tv->nDims())) { + auto loop_id = tv->getLeafDomain().at(i); + auto pt = loop_id->getParallelType(); + if (!mayRequireAllocation(tv, loop_id)) { + continue; + } + + // If the position is left of the inlinig position, no need to + // alloate the domain unless it's shared. For example, if this + // is a Shared tensor and the domain is parallelized with TID, + // even if it's outside of the CA position, since the domain + // is shared, it must be allocated. 
+ if (i < inlining_pos && + !ir_utils::isSharedMemory(tv->getMemoryType(), pt)) { + continue; + } + + allocation_domains.push_back(loop_id); + } + // Assume Local and Shared are always fully contiguous + contiguity = + std::vector>(allocation_domains.size(), true); + } + } + + // Compute the strides from innermost to outermost domains + std::vector strides(allocation_domains.size(), nullptr); + Val* cur_contig_stride = tv->fusion()->oneVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + auto dim = allocation_domains.size() - i - 1; + auto allocation_domain = allocation_domains.at(dim); + + if (!mayRequireAllocation(tv, allocation_domain)) { + continue; + } + + const std::optional contig_flag = contiguity.at(dim); + // Broadcast doesn't have contig flag but it must have been + // already filtered out + NVF_ERROR(contig_flag.has_value()); + + if (contig_flag.value()) { + strides[dim] = cur_contig_stride; + cur_contig_stride = SimplifyingIrBuilder::mulExpr( + allocation_domains.at(dim)->extent(), cur_contig_stride); + } else { + // Assume that the tensor should always be a Global memory + // tensor if it has non-contig allocation domains + NVF_ERROR(tv->getMemoryType() == MemoryType::Global); + strides[dim] = getStrideOfGlobalMemoryTensor(tv, (int64_t)dim); + cur_contig_stride = strides[dim]; + } + } + + // Filter out non-allocated domains. This is already done for Local + // and Shared tensors with no set allocation domains, but not for + // the other cases. 
For example, a reduction output tensor that is + // also a fusion output may still have reduction domains in their + // allocation domains, which aren't relevant for indexing + std::vector actual_allocation_domains; + std::vector actual_strides; + for (const auto i : c10::irange(allocation_domains.size())) { + auto allocation_domain = allocation_domains.at(i); + if (!mayRequireAllocation(tv, allocation_domain)) { + continue; + } + auto stride = strides.at(i); + NVF_ERROR(stride != nullptr); + actual_allocation_domains.push_back(allocation_domain); + actual_strides.push_back(stride); + } + + return {actual_allocation_domains, actual_strides}; +} + +// Similar to IndexCompute but adapted for the graph-based indexing +class IdGraphIndexCompute : public OptOutDispatch { + public: + IdGraphIndexCompute( + const ValGraph& traversal_graph, + const std::unordered_map& initial_index_map) + : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + + // Propagate the index map through a given expr of a specified + // direction. 
+ void propagate(const ExprGroup& expr_group, Direction direction) { + NVF_ERROR(!expr_group->empty()); + dispatch(expr_group->front()); + } + + const std::unordered_map indexMap() const { + return index_map_; + } + + private: + using OptOutDispatch::handle; + + void handle(Split* split) override; + + void handle(Merge* merge) override; + + bool isForward(Expr* expr) const; + + bool hasIndex(IterDomain* id) const { + return indexMap().find(toGroup(id)) != indexMap().end(); + } + + Val* getIndex(IterDomain* id) const { + auto it = index_map_.find(toGroup(id)); + NVF_ERROR(it != index_map_.end(), "Index not found: ", id->toString()); + return it->second; + } + + void setIndex(IterDomain* id, Val* idx) { + index_map_.emplace(toGroup(id), idx); + } + + const ValGroup& toGroup(IterDomain* id) const { + return traversal_graph_.toGroup(id); + } + + private: + const ValGraph& traversal_graph_; + std::unordered_map index_map_; +}; + +// TODO: Should use the explicit direction +bool IdGraphIndexCompute::isForward(Expr* expr) const { + bool ready = true; + for (const auto inp : ir_utils::filterByType(expr->inputs())) { + if (!hasIndex(inp)) { + ready = false; + break; + } + } + if (ready) { + return true; + } + + // Can just return false here. 
Just make sure the outputs are + // already processed + for (const auto out : ir_utils::filterByType(expr->outputs())) { + NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); + } + + return false; +} + +void IdGraphIndexCompute::handle(Split* split) { + const bool is_forward = isForward(split); + + if (is_forward) { + auto in_idx = getIndex(split->in()); + auto inner_extent = split->inner()->extent(); + auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); + Val* inner_idx = nullptr; + inner_idx = SimplifyingIrBuilder::modExpr(in_idx, inner_extent); + setIndex(split->outer(), outer_idx); + setIndex(split->inner(), inner_idx); + } else { + auto outer_idx = getIndex(split->outer()); + auto inner_idx = getIndex(split->inner()); + auto inner_extent = split->inner()->extent(); + auto in_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_extent), inner_idx); + setIndex(split->in(), in_idx); + } +} + +void IdGraphIndexCompute::handle(Merge* merge) { + const bool is_forward = isForward(merge); + + auto inner_ext = merge->inner()->extent(); + + if (is_forward) { + auto outer_idx = getIndex(merge->outer()); + auto inner_idx = getIndex(merge->inner()); + auto out_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_idx, inner_ext), inner_idx); + setIndex(merge->out(), out_idx); + } else { + auto out_idx = getIndex(merge->out()); + auto outer_idx = SimplifyingIrBuilder::divExpr(out_idx, inner_ext); + setIndex(merge->outer(), outer_idx); + Val* inner_idx = SimplifyingIrBuilder::modExpr(out_idx, inner_ext); + setIndex(merge->inner(), inner_idx); + } +} + +} // namespace + +TensorIndexer::TensorIndexer(const IdModel& id_model) : id_model_(id_model) { + buildLoopIndexMap(); +} + +namespace { +ParallelType getParallelType(const ValGroup& loop_group) { + ParallelType common_pt = ParallelType::Serial; + for (const auto val : *loop_group) { + auto pt = val->as()->getParallelType(); + if 
(common_pt == pt || pt == ParallelType::Serial) { + continue; + } else if (common_pt == ParallelType::Serial) { + common_pt = pt; + } else { + // Inconsistent parallelization + NVF_ERROR( + false, + "Inconsistent parallelization detected. ", + "Known type: ", + common_pt, + "New type: ", + pt); + } + } + + return common_pt; +} +} // namespace + +void TensorIndexer::buildLoopIndexMap() { + if (id_model_.empty()) { + return; + } + + Fusion* fusion = id_model_.fusion(); + + for (auto expr : fusion->exprs()) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + auto tv_output = ir_utils::getTvOutput(expr); + for (auto leaf_id : tv_output->getLeafDomain()) { + const ValGroup& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(leaf_id); + + if (loop_index_map_.find(loop_group) != loop_index_map_.end()) { + // Index already assigned + continue; + } + + Val* loop_index = nullptr; + + ParallelType ptype = getParallelType(loop_group); + if (isParallelTypeThread(ptype)) { + loop_index = NamedScalar::getParallelIndex(ptype); + } else { + loop_index = IrBuilder::create(DataType::Index); + } + + loop_index_map_[loop_group] = loop_index; + } + } +} + +Val* TensorIndexer::getLoopIndex(IterDomain* loop_id) const { + // loop_id must be a loop domain. + const auto& loop_group = + id_model_.idGraph(IdMappingMode::LOOP).toGroup(loop_id); + auto loop_index_map_it = loop_index_map_.find(loop_group); + NVF_ERROR( + loop_index_map_it != loop_index_map_.end(), + "No loop index found for ", + loop_id->toString()); + + Val* loop_index = loop_index_map_it->second; + return loop_index; +} + +std::unordered_map TensorIndexer::getInitialIndexMap( + const std::vector& loop_domains) const { + std::unordered_map initial_index_map; + + // For a given list of the loop domains, assign its corresponding + // index Val. 
+ for (IterDomain* loop_id : loop_domains) { + Val* loop_index = getLoopIndex(loop_id); + const auto& almost_exact_group = traversalGraph().toGroup(loop_id); + + initial_index_map.emplace(almost_exact_group, loop_index); + } + + return initial_index_map; +} + +Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { + const auto [allocation_domains, strides] = + getAllocationDomains(tv, id_model_); + + const auto& index_info = computeIndex(expr, allocation_domains); + const auto& index_map = index_info.index_map; + + // Linearize the indices with strides. + // TODO: Contiguous indexing + Val* index = tv->fusion()->zeroVal(); + for (const auto i : c10::irange(allocation_domains.size())) { + // Traverse from innermost to outermost + IterDomain* allocation_domain = + allocation_domains.at(allocation_domains.size() - 1 - i); + + Val* stride = strides.at(allocation_domains.size() - 1 - i); + + auto idx_it = index_map.find(traversalGraph().toGroup(allocation_domain)); + NVF_ERROR( + idx_it != index_map.end(), + "Index not found for ", + allocation_domain->toString()); + Val* idx = idx_it->second; + index = SimplifyingIrBuilder::addExpr( + index, SimplifyingIrBuilder::mulExpr(idx, stride)); + } + + return index; +} + +std::vector TensorIndexer::getLoopDomains(const Expr* expr) const { + // Assume consumer-based indexing. 
Needs to be revisited for ops like
Propagate the initial indices of the loop domains to the allocation +// domains +// +// The indexing traversal is done on the AlmostExact graph augmented +// with the loop promotion map since both the loop and allocations +// domains may be promoted. +class TensorIndexer { + public: + TensorIndexer(const IdModel& id_model); + + // Get a linear index of a given tensor appearing in a given expr, either + // as a consumer or a producer. The predicate indexing will have a + // separate interface. + Val* getLinearIndex(TensorView* tv, const Expr* expr); + + private: + // The AlmostExact graph is used since size-1 splits and merges + // should not affect actual index exprs. + const ValGraph& traversalGraph() const { + return id_model_.idGraph(IdMappingMode::ALMOSTEXACT); + } + + // Build a map of loop groups to their index Vals. See the comment + // on loop_index_map_. + void buildLoopIndexMap(); + + // Get the index of a loop domain. + Val* getLoopIndex(IterDomain* loop_id) const; + + // Propagate the loop indices of a given list of loop domains to the + // traversal graph (i.e., the AlmostExact graph). Uses the loop + // index map, which is built for the Loop graph. + std::unordered_map getInitialIndexMap( + const std::vector& loop_domains) const; + + // Get the loop domains of a given expr. Currently, they're always + // the loop domains of a consumer tensor, but in the future this + // function may return the loop domains of a producer for + // producer-based indexing. + std::vector getLoopDomains(const Expr* expr) const; + + // Returns the index map as well as its traversal path of given + // index domains appearing in a given expr. Used by + // getLinearIndex. + IndexingInfo computeIndex( + const Expr* expr, + const std::vector& index_domains) const; + + // Check if the loop index of a a loop group should be always + // just zero. For example, a loop group with an extent of one, i.e., + // a broadcast-only loop group, should just use zero. 
+ bool shouldUseZeroIndex(const ValGroup& loop_group) const; + + private: + const IdModel& id_model_; + + // Mappings from loop groups to their indices. Serial loops will + // be mapped a unique loop index Val. Parallel loops will be mapped + // to NamedScalar such as "threadIdx.x". This map needs to be built + // once and can be reused for different tensors. + std::unordered_map loop_index_map_; +}; + +} // namespace nvfuser diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h new file mode 100644 index 00000000000..2d6327bf586 --- /dev/null +++ b/csrc/id_model/utils.h @@ -0,0 +1,55 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +#include +#include +#include + +#define VERBOSE() verbose(__LINE__) +#define WARN() warn(__LINE__) + +namespace nvfuser { + +// Temporary logging utility +class DebugStream { + public: + DebugStream() + : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} + + template + DebugStream& operator<<(const T& v) { + if (enabled_) { + out_ << v; + } + return *this; + } + + DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { + if (enabled_) { + out_ << endl; + } + return *this; + } + + private: + bool enabled_ = false; + std::ostream& out_; +}; + +inline DebugStream verbose(int line) { + return DebugStream() << "[DEBUG@" << line << "] "; +} + +inline DebugStream warn(int line) { + return DebugStream() << "[WARN@" << line << "] "; +} + +} // namespace nvfuser diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 14357bd4207..0fa65960edc 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -654,4 +654,40 @@ std::optional> computePermutation( bool hasTrivialAllocationDomain(const TensorView* tv); +// Returns true if memory_type is partitioned in parallel_type +inline bool isPartitionedMemory( + MemoryType memory_type, + ParallelType 
parallel_type) { + switch (memory_type) { + case MemoryType::Local: + return isParallelTypeThread(parallel_type) || + isParallelTypeDeviceDim(parallel_type); + case MemoryType::Shared: + return isParallelTypeBlockDim(parallel_type) || + isParallelTypeDeviceDim(parallel_type); + case MemoryType::Global: + return isParallelTypeDeviceDim(parallel_type); + default: + NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + +// Returns true if memory_type is shared in parallel_type +inline bool isSharedMemory(MemoryType memory_type, ParallelType parallel_type) { + switch (memory_type) { + case MemoryType::Local: + // Nothing is shared if it's Local + return false; + case MemoryType::Shared: + // Only TID parallelized domains are shared if it's Shared + return isParallelTypeThreadDim(parallel_type); + case MemoryType::Global: + // Only TID and BID parallelized domains are shared if it's Global + return isParallelTypeThreadDim(parallel_type) || + isParallelTypeBlockDim(parallel_type); + default: + NVF_ERROR(false, "Unknown MemoryType: ", memory_type); + } +} + } // namespace nvfuser::ir_utils diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp new file mode 100644 index 00000000000..3750338d511 --- /dev/null +++ b/tests/cpp/test_indexing.cpp @@ -0,0 +1,384 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +using IndexingTest = NVFuserTest; + +namespace { + +std::vector getLoopIndices(TensorView* tv, const TensorIndexer& indexer) { + std::vector loop_indices; + for (const auto& loop_id : tv->getLeafDomain()) { + loop_indices.push_back(indexer.getLoopIndex(loop_id)); + } + return loop_indices; +} + +} // namespace + +// Simple pointwise test with no parallelization +TEST_F(IndexingTest, SimplePointwise1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv2->flatten(); + tv2->split(0, 4); + + TransformPropagator propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + + tv1->inlineAt(1); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)), + tv1->getRFactorDomain().at(1)->extent()), + IrBuilder::getItemExpr( + IrBuilder::getAttrExpr( + IrBuilder::metadataExpr(tv0), "alloc_stride"), + (int64_t)1)), + SimplifyingIrBuilder::mulExpr( + 
SimplifyingIrBuilder::divExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)), + tv1->getRFactorDomain().at(1)->extent()), + IrBuilder::getItemExpr( + IrBuilder::getAttrExpr( + IrBuilder::metadataExpr(tv0), "alloc_stride"), + (int64_t)0))); + + auto tv1_consumer_index_ref = tv1_loop_indices.at(1); + auto tv1_producer_index_ref = tv2_loop_indices.at(1); + + auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)), + tv2->getRFactorDomain().at(1)->extent()), + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::divExpr( + SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)), + tv2->getRFactorDomain().at(1)->extent()), + tv2->getRFactorDomain().at(1)->extent())); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)) + << "Ref: " << tv2_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv2_consumer_index->toInlineString(); +} + +// Almost same fusion as SimplePointwiseSerial but TID and BID +// parallelizaiton with no inlining +TEST_F(IndexingTest, SimplePointwise2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + auto tv3 = add(tv2, IrBuilder::create(1.0)); + fusion.addOutput(tv3); + + tv3->flatten(); + tv3->split(0, 4); + + TransformPropagator propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + // Test shared memory indexing + tv2->setMemoryType(MemoryType::Shared); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // tv0 and tv3 are global tensors and should have the same index: + // "(blockIdx.x * 4 + threadIdx.x) % tv0->axis(1)->extent() + + // (blockIdx.x * 4 + threadIdx.x) / tv0->axis(1)->extent() * + // tv0->axis(1)->extent() + // + // tv1 is a Local tensor. Since it's fully parallelized, its index + // should be always zero + // + // tv2 is a Shared tensor. 
Only the TIDx parallelized domain should + // contribute to the index + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv3->definition()); + auto tv3_consumer_index = indexer.getLinearIndex(tv3, tv3->definition()); + + auto contig_idx = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + NamedScalar::getParallelIndex(ParallelType::BIDx), + tv2->axis(1)->extent()), + NamedScalar::getParallelIndex(ParallelType::TIDx)); + + auto global_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::modExpr( + contig_idx, tv0->getRFactorDomain().at(1)->extent()), + SimplifyingIrBuilder::mulExpr( + SimplifyingIrBuilder::divExpr( + contig_idx, tv0->getRFactorDomain().at(1)->extent()), + tv0->getRFactorDomain().at(1)->extent())); + + auto shared_ref = NamedScalar::getParallelIndex(ParallelType::TIDx); + + EXPECT_TRUE(tv0_producer_index->sameAs(global_ref)) + << "Ref: " << global_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->isZeroInt()) + << "Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->isZeroInt()) + << "Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_producer_index->sameAs(shared_ref)) + << "Ref: " << shared_ref->toInlineString() + << ". Actual: " << tv2_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(shared_ref)) + << "Ref: " << shared_ref->toInlineString() + << ". Actual: " << tv2_consumer_index->toInlineString(); + + EXPECT_TRUE(tv3_consumer_index->sameAs(global_ref)) + << "Ref: " << global_ref->toInlineString() + << ". 
Actual: " << tv3_consumer_index->toInlineString(); +} + +// Simple reduction with no parallelization +TEST_F(IndexingTest, SimpleReduction) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = sum(tv0, {1}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv0->getRFactorDomain().at(1)->extent())); + + auto tv1_consumer_index_ref = tv1_loop_indices.at(0); + auto tv1_producer_index_ref = tv2_loop_indices.at(0); + auto tv2_consumer_index_ref = tv2_loop_indices.at(0); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)); + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)); + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)); + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)); +} + +// Fusion copied from AllocationDomainTest.TransposedIntermediate +TEST_F(IndexingTest, AllocationDomain) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigConcreteTensor({32, 32}); + fusion.addInput(tv0); + auto tv1 = set(tv0); + auto tv2 = set(tv1); + fusion.addOutput(tv2); + tv1->setMemoryType(MemoryType::Shared); + + tv1->axis(0)->parallelize(ParallelType::TIDx); + tv2->axis(0)->parallelize(ParallelType::TIDx); + + std::vector tv1_transposed = {tv1->axis(1), tv1->axis(0)}; + 
tv1->setAllocationDomain(tv1_transposed, true); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + // Note that the allocation domain is permuted + auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + + auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv2_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". 
Actual: " << tv1_producer_index->toInlineString(); +} + +TEST_F(IndexingTest, Reshape) { + Fusion fusion; + FusionGuard fg(&fusion); + + const std::vector shape1({100}); + const std::vector shape2({4, 25}); + const std::vector shape3({5, 2, 10}); + + // [i0] + auto tv0 = makeContigConcreteTensor(shape1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + + // [i2, i3] + auto tv2 = reshape(tv1, shape1, shape2); + + // [i2, i3] + auto tv3 = add(tv2, fusion.oneVal()); + + // [i4, i5, i6] + auto tv4 = reshape(tv3, shape2, shape3); + + // [i4, i5, i6] + auto tv5 = add(tv4, fusion.oneVal()); + + fusion.addOutput(tv5); + + TransformPropagator propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + + inlineMost(); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // Validate tv0 indexing + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + + // It isn't straightforward to do structual checking as the other + // tests since there's no particular rule about which domain is used + // to provide the extent of the group. However, since everything + // should be deterministic, string match should also work. + std::string tv0_producer_index_ref = + "( ( ( ( ( i78 * ( ceilDiv(( 4 * 25 ), 5) ) ) + ( ( i79 * ( ceilDiv(( ceilDiv(( 4 * 25 ), 5) ), 2) ) ) + i80 ) ) / 25 ) * ( ceilDiv(100, 4) ) ) + ( ( ( i78 * ( ceilDiv(( 4 * 25 ), 5) ) ) + ( ( i79 * ( ceilDiv(( ceilDiv(( 4 * 25 ), 5) ), 2) ) ) + i80 ) ) % 25 ) )"; + + EXPECT_EQ(tv0_producer_index->toInlineString(), tv0_producer_index_ref); + + // All intermediate tensors should be fully inlined, so their + // indices should be just zero. 
+ EXPECT_TRUE(indexer.getLinearIndex(tv1, tv1->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv1, tv2->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv2, tv2->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv2, tv3->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv3, tv3->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv3, tv4->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv4, tv4->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv4, tv5->definition())->isZeroInt()); + + // tv5 has no transformation and is fully contiguous + std::vector tv5_loop_indices = getLoopIndices(tv5, indexer); + auto tv5_consumer_index = indexer.getLinearIndex(tv5, tv5->definition()); + + auto tv5_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::addExpr( + tv5_loop_indices.at(2), + SimplifyingIrBuilder::mulExpr( + tv5_loop_indices.at(1), tv5->getRFactorDomain().at(2)->extent())), + SimplifyingIrBuilder::mulExpr( + tv5_loop_indices.at(0), + SimplifyingIrBuilder::mulExpr( + tv5->getRFactorDomain().at(1)->extent(), + tv5->getRFactorDomain().at(2)->extent()))); + + EXPECT_TRUE(tv5_consumer_index->sameAs(tv5_consumer_index_ref)) + << "Ref: " << tv5_consumer_index_ref->toInlineString() + << ". Actual: " << tv5_consumer_index->toInlineString(); +} + +} // namespace nvfuser From 096f63878003ca459dae61e00bbc8ac356e8ba54 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:21:05 -0700 Subject: [PATCH 05/22] Disable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 1a464417631..c50f160eb51 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } From 82d747f84ed0b88407c41a7611ef15bd1c6f0299 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:32:33 -0700 Subject: [PATCH 06/22] fix --- csrc/device_lower/utils.cpp | 22 ---------------------- csrc/device_lower/utils.h | 2 -- csrc/id_model/indexing.cpp | 2 +- csrc/id_model/indexing.h | 11 +++-------- tests/cpp/test_indexing.cpp | 28 ++++++++++++++-------------- 5 files changed, 18 insertions(+), 47 deletions(-) diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index bbf0526e461..955c1513dd3 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,28 +920,6 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } -// Returns true if expr is an expression that initializes a reduction -// buffer. -bool isReductionInitExpr(const Expr* expr) { - // False if its output isn't a TensorView - if (!ir_utils::isTvOp(expr)) { - return false; - } - // False if it doesn't have any reduction axis - const auto out_tv = ir_utils::getTvOutput(expr); - if (!out_tv->domain()->hasReduction()) { - return false; - } - // False if it has have TensorView inputs as initialization should - // never use TensorViews - const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); - if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { - return false; - } - return true; -} - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index cad0ae5b996..60f3459814a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,8 +322,6 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. 
std::array getMmaLayout(const MmaOp* expr); -bool isReductionInitExpr(const Expr* expr); - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index c80b6c3548d..b5282d3f034 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -93,7 +93,7 @@ std::tuple, std::vector> getAllocationDomains( // - Local/Shared: loop domains to the right of the CA position const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { - allocation_domains = tv->getRFactorDomain(); + allocation_domains = tv->getLogicalDomain(); contiguity = tv->domain()->contiguity(); } else { for (const auto i : c10::irange(tv->nDims())) { diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h index 1dae562c9c7..3e392e8e521 100644 --- a/csrc/id_model/indexing.h +++ b/csrc/id_model/indexing.h @@ -49,6 +49,9 @@ class TensorIndexer { // separate interface. Val* getLinearIndex(TensorView* tv, const Expr* expr); + // Get the index of a loop domain. Intended to be used only for testing. + Val* getLoopIndex(IterDomain* loop_id) const; + private: // The AlmostExact graph is used since size-1 splits and merges // should not affect actual index exprs. @@ -60,9 +63,6 @@ class TensorIndexer { // on loop_index_map_. void buildLoopIndexMap(); - // Get the index of a loop domain. - Val* getLoopIndex(IterDomain* loop_id) const; - // Propagate the loop indices of a given list of loop domains to the // traversal graph (i.e., the AlmostExact graph). Uses the loop // index map, which is built for the Loop graph. @@ -82,11 +82,6 @@ class TensorIndexer { const Expr* expr, const std::vector& index_domains) const; - // Check if the loop index of a a loop group should be always - // just zero. For example, a loop group with an extent of one, i.e., - // a broadcast-only loop group, should just use zero. 
- bool shouldUseZeroIndex(const ValGroup& loop_group) const; - private: const IdModel& id_model_; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 3750338d511..45344c06886 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -76,7 +76,7 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)), - tv1->getRFactorDomain().at(1)->extent()), + tv1->getLogicalDomain().at(1)->extent()), IrBuilder::getItemExpr( IrBuilder::getAttrExpr( IrBuilder::metadataExpr(tv0), "alloc_stride"), @@ -87,7 +87,7 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)), - tv1->getRFactorDomain().at(1)->extent()), + tv1->getLogicalDomain().at(1)->extent()), IrBuilder::getItemExpr( IrBuilder::getAttrExpr( IrBuilder::metadataExpr(tv0), "alloc_stride"), @@ -102,15 +102,15 @@ TEST_F(IndexingTest, SimplePointwise1) { SimplifyingIrBuilder::mulExpr( tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)), - tv2->getRFactorDomain().at(1)->extent()), + tv2->getLogicalDomain().at(1)->extent()), SimplifyingIrBuilder::mulExpr( SimplifyingIrBuilder::divExpr( SimplifyingIrBuilder::addExpr( SimplifyingIrBuilder::mulExpr( tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)), - tv2->getRFactorDomain().at(1)->extent()), - tv2->getRFactorDomain().at(1)->extent())); + tv2->getLogicalDomain().at(1)->extent()), + tv2->getLogicalDomain().at(1)->extent())); EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) << "Ref: " << tv0_producer_index_ref->toInlineString() @@ -186,11 +186,11 @@ TEST_F(IndexingTest, SimplePointwise2) { auto global_ref = SimplifyingIrBuilder::addExpr( SimplifyingIrBuilder::modExpr( - contig_idx, tv0->getRFactorDomain().at(1)->extent()), + contig_idx, tv0->getLogicalDomain().at(1)->extent()), 
SimplifyingIrBuilder::mulExpr( SimplifyingIrBuilder::divExpr( - contig_idx, tv0->getRFactorDomain().at(1)->extent()), - tv0->getRFactorDomain().at(1)->extent())); + contig_idx, tv0->getLogicalDomain().at(1)->extent()), + tv0->getLogicalDomain().at(1)->extent())); auto shared_ref = NamedScalar::getParallelIndex(ParallelType::TIDx); @@ -243,7 +243,7 @@ TEST_F(IndexingTest, SimpleReduction) { auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( tv1_loop_indices.at(1), SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv0->getRFactorDomain().at(1)->extent())); + tv1_loop_indices.at(0), tv0->getLogicalDomain().at(1)->extent())); auto tv1_consumer_index_ref = tv1_loop_indices.at(0); auto tv1_producer_index_ref = tv2_loop_indices.at(0); @@ -286,12 +286,12 @@ TEST_F(IndexingTest, AllocationDomain) { auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( tv1_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + tv1_loop_indices.at(1), tv1->getLogicalDomain().at(0)->extent())); auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( tv2_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(1), tv1->getRFactorDomain().at(0)->extent())); + tv2_loop_indices.at(1), tv1->getLogicalDomain().at(0)->extent())); EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) << "Ref: " << tv1_consumer_index_ref->toInlineString() @@ -369,12 +369,12 @@ TEST_F(IndexingTest, Reshape) { SimplifyingIrBuilder::addExpr( tv5_loop_indices.at(2), SimplifyingIrBuilder::mulExpr( - tv5_loop_indices.at(1), tv5->getRFactorDomain().at(2)->extent())), + tv5_loop_indices.at(1), tv5->getLogicalDomain().at(2)->extent())), SimplifyingIrBuilder::mulExpr( tv5_loop_indices.at(0), SimplifyingIrBuilder::mulExpr( - tv5->getRFactorDomain().at(1)->extent(), - tv5->getRFactorDomain().at(2)->extent()))); + tv5->getLogicalDomain().at(1)->extent(), + tv5->getLogicalDomain().at(2)->extent()))); 
EXPECT_TRUE(tv5_consumer_index->sameAs(tv5_consumer_index_ref)) << "Ref: " << tv5_consumer_index_ref->toInlineString() From c8dfdf7dd82067ad45ab43e09a473bbb549cb1b1 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:35:59 -0700 Subject: [PATCH 07/22] cleanup --- csrc/id_model/utils.h | 55 ------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 csrc/id_model/utils.h diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h deleted file mode 100644 index 2d6327bf586..00000000000 --- a/csrc/id_model/utils.h +++ /dev/null @@ -1,55 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -#include -#include -#include - -#define VERBOSE() verbose(__LINE__) -#define WARN() warn(__LINE__) - -namespace nvfuser { - -// Temporary logging utility -class DebugStream { - public: - DebugStream() - : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} - - template - DebugStream& operator<<(const T& v) { - if (enabled_) { - out_ << v; - } - return *this; - } - - DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { - if (enabled_) { - out_ << endl; - } - return *this; - } - - private: - bool enabled_ = false; - std::ostream& out_; -}; - -inline DebugStream verbose(int line) { - return DebugStream() << "[DEBUG@" << line << "] "; -} - -inline DebugStream warn(int line) { - return DebugStream() << "[WARN@" << line << "] "; -} - -} // namespace nvfuser From efa09bd5db6a88021a56264965902f8d47463571 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:45:07 -0700 Subject: [PATCH 08/22] fix --- csrc/id_model/indexing.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index b5282d3f034..d42091cf2c2 100644 --- a/csrc/id_model/indexing.cpp +++ 
b/csrc/id_model/indexing.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 471dcd7d97d78399b166945b2fdc08080e8a5729 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 00:55:45 -0700 Subject: [PATCH 09/22] clang-tidy --- csrc/id_model/indexing.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index d42091cf2c2..7c7ec65a7f8 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -175,8 +175,9 @@ class IdGraphIndexCompute : public OptOutDispatch { public: IdGraphIndexCompute( const ValGraph& traversal_graph, - const std::unordered_map& initial_index_map) - : traversal_graph_(traversal_graph), index_map_(initial_index_map) {} + std::unordered_map initial_index_map) + : traversal_graph_(traversal_graph), + index_map_(std::move(initial_index_map)) {} // Propagate the index map through a given expr of a specified // direction. From 81937bbfcf3b83ee83cf84d2a9b0572973ae01ab Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 09:46:14 -0700 Subject: [PATCH 10/22] cleanup --- csrc/id_model/indexing.cpp | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 7c7ec65a7f8..ed6a765c8cb 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -183,7 +183,11 @@ class IdGraphIndexCompute : public OptOutDispatch { // direction. 
void propagate(const ExprGroup& expr_group, Direction direction) { NVF_ERROR(!expr_group->empty()); + // This looks a little ugly but the dispatch interface doesn't + // have a way to pass arguments + current_direction_ = direction; dispatch(expr_group->front()); + current_direction_ = Direction::Undefined; } const std::unordered_map indexMap() const { @@ -220,28 +224,11 @@ class IdGraphIndexCompute : public OptOutDispatch { private: const ValGraph& traversal_graph_; std::unordered_map index_map_; + Direction current_direction_ = Direction::Undefined; }; -// TODO: Should use the explicit direction bool IdGraphIndexCompute::isForward(Expr* expr) const { - bool ready = true; - for (const auto inp : ir_utils::filterByType(expr->inputs())) { - if (!hasIndex(inp)) { - ready = false; - break; - } - } - if (ready) { - return true; - } - - // Can just return false here. Just make sure the outputs are - // already processed - for (const auto out : ir_utils::filterByType(expr->outputs())) { - NVF_ERROR(hasIndex(out), "Output index not found: ", out->toString()); - } - - return false; + return current_direction_ == Direction::Forward; } void IdGraphIndexCompute::handle(Split* split) { From eb2bc2b34da23a92552c08f79a3a177690954a6a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 16:55:49 -0700 Subject: [PATCH 11/22] disable idmodel --- csrc/device_lower/lower2device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 1a464417631..c50f160eb51 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -391,7 +391,7 @@ void GpuLower::analysis(Fusion* fusion) { // functionality should be affected. 
New IterDomains may be created, // so it is expected that generated code may use diffrent variable // names - if (true || isOptionEnabled(EnableOption::IdModel)) { + if (isOptionEnabled(EnableOption::IdModel)) { IdModel id_model(fusion_); } From 1097cd596054eba83eddc632407bd921fe88faf2 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 18:06:22 -0700 Subject: [PATCH 12/22] Add broadcast tests --- tests/cpp/test_indexing.cpp | 113 ++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 33e3d561f52..6ffefa6a004 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -381,4 +381,117 @@ TEST_F(IndexingTest, Reshape) { << ". Actual: " << tv5_consumer_index->toInlineString(); } +// Simple non-concretized broadcast +TEST_F(IndexingTest, SimpleBroadcast1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = broadcast(tv0, {false, true}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + EXPECT_TRUE(tv1_loop_indices.at(1)->isZeroInt()); + + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + EXPECT_TRUE(tv2_loop_indices.at(1)->isZeroInt()); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + EXPECT_EQ(tv0_producer_index, tv1_loop_indices.at(0)); + EXPECT_EQ(tv1_consumer_index, tv1_loop_indices.at(0)); + EXPECT_EQ(tv1_producer_index, tv2_loop_indices.at(0)); + EXPECT_EQ(tv2_consumer_index, tv2_loop_indices.at(0)); +} + +// SimpleBroadcast1 + scheduling 
+TEST_F(IndexingTest, SimpleBroadcast2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = broadcast(tv0, {false, true}); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv2->flatten(); + tv2->split(0, 4); + + TransformPropagator propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + // The first merge of the logical domains should be a trivial merge, + // i.e., a merge with a extent-one domain. Thus, the indexing + // travesal should return "x + y * 4", where x and y are the loop + // indices, respecitvely. + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv2->definition()); + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + + // tv0 is a global memory tensor, so the indexing is done with its + // allocation domain, which is mapped with the merge of the two + // logical domains of tv1 on the AlmostExact graph. Traverse back to + // the merge output from the loop domains. + auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent()), + tv1_loop_indices.at(1)); + + // tv1 is a Local tensor, so its allocation domains are just their + // loop domains. This index is mathematically equivalent to the tv0 + // index, but the order of linearizing the two loop domains is + // different from the order of computing the merge input index. 
+ auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + tv1_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv1_loop_indices.at(0), tv1->axis(1)->extent())); + + auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + tv2_loop_indices.at(1), + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent())); + + auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + tv2_loop_indices.at(0), tv2->axis(1)->extent()), + tv2_loop_indices.at(1)); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); + + EXPECT_TRUE(tv1_producer_index->sameAs(tv1_producer_index_ref)) + << "Ref: " << tv1_producer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); + + EXPECT_TRUE(tv2_consumer_index->sameAs(tv2_consumer_index_ref)) + << "Ref: " << tv2_consumer_index_ref->toInlineString() + << ". Actual: " << tv2_consumer_index->toInlineString(); +} + } // namespace nvfuser From d57487a5c7691ec3c61dfeaa161d979b11b1a9bd Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 19:11:14 -0700 Subject: [PATCH 13/22] tests --- tests/cpp/test_indexing.cpp | 120 ++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 6ffefa6a004..cdac61db5de 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -21,6 +21,8 @@ #include #include +#include + namespace nvfuser { using IndexingTest = NVFuserTest; @@ -35,6 +37,26 @@ std::vector getLoopIndices(TensorView* tv, const TensorIndexer& indexer) { return loop_indices; } +template +Val* addExpr(Args&&... 
args) { + return SimplifyingIrBuilder::addExpr(std::forward(args)...); +} + +template +Val* mulExpr(Args&&... args) { + return SimplifyingIrBuilder::mulExpr(std::forward(args)...); +} + +template +Val* divExpr(Args&&... args) { + return SimplifyingIrBuilder::divExpr(std::forward(args)...); +} + +template +Val* modExpr(Args&&... args) { + return SimplifyingIrBuilder::modExpr(std::forward(args)...); +} + } // namespace // Simple pointwise test with no parallelization @@ -494,4 +516,102 @@ TEST_F(IndexingTest, SimpleBroadcast2) { << ". Actual: " << tv2_consumer_index->toInlineString(); } +// Concretized broadcast +TEST_F(IndexingTest, SimpleBroadcast3) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv1); + + auto tv2 = broadcast(tv0, {false, true}); + auto tv3 = add(tv2, tv1); + fusion.addOutput(tv3); + + tv3->flatten(); + + TransformPropagator propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + + inlineMost(); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv3_loop_indices = getLoopIndices(tv3, indexer); + + // Start with tv3 index as it's most straightforward + auto tv3_consumer_index = indexer.getLinearIndex(tv3, tv3->definition()); + auto tv3_consumer_index_ref = addExpr( + modExpr(tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()), + mulExpr( + divExpr( + tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()), + tv3->getLogicalDomain().at(1)->extent())); + + EXPECT_TRUE(tv3_consumer_index->sameAs(tv3_consumer_index_ref)) + << "Ref: " << tv3_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv3_consumer_index->toInlineString(); + + // Since tv2 is fully inlined, its index should be just zero + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv3->definition()); + + EXPECT_TRUE(tv2_consumer_index->isZeroInt()); + EXPECT_TRUE(tv2_producer_index->isZeroInt()); + + // tv0 is a 1D pre-broadcast input tensor, so it only needs the + // index that corresponds to the outer dimension of the tv3 (or tv2) + // logical domains + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv2->definition()); + auto tv0_producer_index_ref = + divExpr(tv3_loop_indices.at(0), tv3->getLogicalDomain().at(1)->extent()); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + // tv1 should have the same index as tv3 + auto tv1_producer_index = indexer.getLinearIndex(tv1, tv3->definition()); + EXPECT_TRUE(tv1_producer_index->sameAs(tv3_consumer_index_ref)) + << "Ref: " << tv3_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_producer_index->toInlineString(); +} + +// Concretized broadcast with partial inlining. Loop promotion is +// required. Same fusion as IdModelTest.LoopPromotion4. See also +// Example 1 of the Loop Promotion doc. 
+TEST_F(IndexingTest, SimpleBroadcast4) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigConcreteTensor({1, 4}); + fusion.addInput(tv0); + auto tv1 = makeContigConcreteTensor({3, 4}); + fusion.addInput(tv1); + + auto tv2 = set(tv0); + auto tv3 = set(tv1); + auto tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // [i0, i1] + tv4->merge(0); + // [i0*i1] + tv4->split(0, 4, false); // outer split + // [4, i0*i1/4] + + TransformPropagator propagator(tv4); + MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); + + for (auto tv : ir_utils::allTvs(&fusion)) { + tv->inlineAt(-2); + } + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); +} + } // namespace nvfuser From 98e5fd26989a9c335255994c9cbb51c5f7964006 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 19:11:34 -0700 Subject: [PATCH 14/22] error check --- csrc/id_model/indexing.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index fc105f1db3c..b49d239b3f9 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -498,6 +498,18 @@ std::unordered_map TensorIndexer::getInitialIndexMap( } Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { + NVF_ERROR(tv != nullptr); + NVF_ERROR(expr != nullptr); + NVF_ERROR( + (std::find(expr->inputs().begin(), expr->inputs().end(), tv) != + expr->inputs().end()) || + (std::find(expr->outputs().begin(), expr->outputs().end(), tv) != + expr->outputs().end()), + "Inconsistent tensor and expr. 
Tensor, ", + tv->toString(), + " not found in ", + expr->toString()); + VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); const auto [allocation_domains, strides] = From 19c509e0ffbb86541862d26a7f152175f3f32bee Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:39:31 -0700 Subject: [PATCH 15/22] loop promotion --- csrc/id_model/indexing.cpp | 14 ++++++++++++++ tests/cpp/test_indexing.cpp | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index b49d239b3f9..54a8990011b 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -167,6 +167,20 @@ std::tuple, std::vector> getAllocationDomains( } } + // Loop promotion may affect allocations. Promotions of intermediate + // domains may not be defined correctly. Only consider loop domains + // for now. + for (auto& allocation_domain : allocation_domains) { + bool is_loop = std::find( + tv->getLeafDomain().begin(), + tv->getLeafDomain().end(), + allocation_domain) != tv->getLeafDomain().end(); + if (!is_loop) { + continue; + } + allocation_domain = getLoopPromotion(allocation_domain, id_model); + } + // Compute the strides from innermost to outermost domains std::vector strides(allocation_domains.size(), nullptr); Val* cur_contig_stride = tv->fusion()->oneVal(); diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index cdac61db5de..e50bdf9bd13 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -612,6 +612,22 @@ TEST_F(IndexingTest, SimpleBroadcast4) { IdModel id_model(&fusion); TensorIndexer indexer(id_model); + + // As discussed in the doc, the inner domain of tv2 is promoted to + // a domain with the same extent as the inner domain of tv4. Since + // tv2 is a Local tensor, its allocation domain is also promoted to + // the same domain. 
Thus, its consumer index is just the loop index + // of the inner loop of the tv2 loop domains, and its producer index + // is also just the inner loop index of the loop domains of tv4. + + std::vector tv2_loop_indices = getLoopIndices(tv2, indexer); + std::vector tv4_loop_indices = getLoopIndices(tv4, indexer); + + auto tv2_consumer_index = indexer.getLinearIndex(tv2, tv2->definition()); + auto tv2_producer_index = indexer.getLinearIndex(tv2, tv4->definition()); + + EXPECT_EQ(tv2_consumer_index, tv2_loop_indices.at(1)); + EXPECT_EQ(tv2_producer_index, tv4_loop_indices.at(1)); } } // namespace nvfuser From b9f9108ae458c138c246b9df8ddd6d35ae5cff5d Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:58:07 -0700 Subject: [PATCH 16/22] cleanup --- csrc/device_lower/utils.cpp | 22 ---------- csrc/device_lower/utils.h | 2 - csrc/id_model/indexing.cpp | 88 ++----------------------------------- 3 files changed, 4 insertions(+), 108 deletions(-) diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index bbf0526e461..955c1513dd3 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -920,28 +920,6 @@ std::array getMmaLayout(const MmaOp* expr) { return layout; } -// Returns true if expr is an expression that initializes a reduction -// buffer. 
-bool isReductionInitExpr(const Expr* expr) { - // False if its output isn't a TensorView - if (!ir_utils::isTvOp(expr)) { - return false; - } - // False if it doesn't have any reduction axis - const auto out_tv = ir_utils::getTvOutput(expr); - if (!out_tv->domain()->hasReduction()) { - return false; - } - // False if it has have TensorView inputs as initialization should - // never use TensorViews - const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); - if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { - return false; - } - return true; -} - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/device_lower/utils.h b/csrc/device_lower/utils.h index cad0ae5b996..60f3459814a 100644 --- a/csrc/device_lower/utils.h +++ b/csrc/device_lower/utils.h @@ -322,8 +322,6 @@ Val* getNumThreadsInTensorView(TensorView* tv); //! Get the unit dimensions of A and B for the given MmaOp. std::array getMmaLayout(const MmaOp* expr); -bool isReductionInitExpr(const Expr* expr); - } // namespace lower_utils } // namespace nvfuser diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 54a8990011b..df1964e8810 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -88,13 +88,6 @@ Val* getStrideOfGlobalMemoryTensor(TensorView* tv, int64_t alloc_dim) { stride_dim); } -// Currently it's only Shared or Local but Global can be the case -// too. -bool isAllocationBasedOnLeaf(TensorView* tv) { - return tv->getMemoryType() == MemoryType::Shared || - tv->getMemoryType() == MemoryType::Local; -} - // Get the allocation domains of a given tensor. Also returns its // strides. 
// @@ -107,40 +100,17 @@ std::tuple, std::vector> getAllocationDomains( std::vector allocation_domains; std::vector> contiguity; - auto inlining_pos = tv->getComputeAtPosition(); - - bool use_set_allocatin_domain = false; - if (tv->hasAllocation()) { - if (tv->getMemoryType() == MemoryType::Shared || - tv->getMemoryType() == MemoryType::Local) { - if (std::is_permutation( - tv->getLeafDomain().begin(), - tv->getLeafDomain().end(), - tv->getAllocationDomain().begin())) { - use_set_allocatin_domain = true; - } - } else { - use_set_allocatin_domain = true; - } - } - - // Ignore allocation of non-global tensors for now - if (use_set_allocatin_domain) { allocation_domains = tv->getAllocationDomain(); - NVF_ERROR(!tv->isDoubleBuffered()); contiguity = tv->domain()->contiguity(); } else { // If allocation domain is not set, assume that: - // Local/Shared: leaf domains to the right of the CA position - // Global: rfactor domains + // Global: logical domains + // Local/Shared: loop domains to the right of the CA position + const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { - VERBOSE() << "Tv does not have allocation of " << tv->toString() << ", " - << toDelimitedString(tv->getMaybeAllocationDomain()) - << std::endl; allocation_domains = tv->getLogicalDomain(); contiguity = tv->domain()->contiguity(); - NVF_ERROR(!tv->isDoubleBuffered()); } else { for (const auto i : c10::irange(tv->nDims())) { auto loop_id = tv->getLeafDomain().at(i); @@ -295,9 +265,6 @@ bool IdGraphIndexCompute::isForward(Expr* expr) const { void IdGraphIndexCompute::handle(Split* split) { const bool is_forward = isForward(split); - VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? 
"fwd" : "bwd") - << "): " << split->toString(); - if (is_forward) { auto in_idx = getIndex(split->in()); auto inner_extent = split->inner()->extent(); @@ -319,9 +286,6 @@ void IdGraphIndexCompute::handle(Split* split) { void IdGraphIndexCompute::handle(Merge* merge) { const bool is_forward = isForward(merge); - VERBOSE() << "IdGraphIndexCompute handle (" << (is_forward ? "fwd" : "bwd") - << "): " << merge->toString(); - auto inner_ext = merge->inner()->extent(); if (is_forward) { @@ -399,30 +363,9 @@ void TensorIndexer::buildLoopIndexMap() { } else if (shouldUseZeroIndex(loop_group)) { loop_index = fusion->zeroVal(); } else { - // Everything now should be serial concrete loops. For the mean - // time, just use the same index integer val generated for - // ComputeAtMap if available. - if (GpuLower::hasCurrent()) { - const auto& ca_map = GpuLower::current()->caMap(); - for (const auto& id : - ir_utils::filterByType(loop_group->vector())) { - if (!ca_map->getIdSets(IdMappingMode::LOOP).mappingExists(id)) { - continue; - } - loop_index = ca_map->getIndexVariable(id); - break; - } - NVF_ERROR( - loop_index != nullptr, - "No existing index found for ", - nvfuser::toString(loop_group)); - } else { - // Not reusing the ComputeATMap index assignments - loop_index = IrBuilder::create(DataType::Index); - } + loop_index = IrBuilder::create(DataType::Index); } - NVF_ERROR(loop_index != nullptr); loop_index_map_[loop_group] = loop_index; } } @@ -524,14 +467,9 @@ Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { " not found in ", expr->toString()); - VERBOSE() << "getIndex of " << tv->toString() << " in " << expr->toString(); - const auto [allocation_domains, strides] = getAllocationDomains(tv, id_model_); - VERBOSE() << "Allocation domains: " << toDelimitedString(allocation_domains) - << std::endl; - const auto& index_info = computeIndex(expr, allocation_domains); const auto& index_map = index_info.index_map; @@ -551,9 +489,6 @@ Val* 
TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { "Index not found for ", allocation_domain->toString()); Val* idx = idx_it->second; - VERBOSE() << "Index of " << allocation_domain->toString() << ": " - << idx->toInlineString() << std::endl; - index = SimplifyingIrBuilder::addExpr( index, SimplifyingIrBuilder::mulExpr(idx, stride)); } @@ -570,17 +505,6 @@ std::vector TensorIndexer::getLoopDomains(const Expr* expr) const { // scatter auto loop_domains = ir_utils::getTvOutput(expr)->getLeafDomain(); - // If this is an expr initializing a buffer for a reduction, there - // should be no loops for reduction domains - if (lower_utils::isReductionInitExpr(expr)) { - loop_domains.erase( - std::remove_if( - loop_domains.begin(), - loop_domains.end(), - [](IterDomain* id) -> bool { return id->isReduction(); }), - loop_domains.end()); - } - for (auto& loop_id : loop_domains) { loop_id = getLoopPromotion(loop_id, id_model_); } @@ -592,10 +516,6 @@ IndexingInfo TensorIndexer::computeIndex( const Expr* expr, const std::vector& index_domains) const { const auto loop_domains = getLoopDomains(expr); - VERBOSE() << "Loop domains: " << toDelimitedString(loop_domains) << std::endl; - - VERBOSE() << "Index domains: " << toDelimitedString(index_domains) - << std::endl; const ValGroups loop_groups = traversalGraph().toGroups(loop_domains); const ValGroups index_groups = traversalGraph().toGroups(index_domains); From 1ae83b3d7ff58f7cb054887756e38f0311ac8f6a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:58:37 -0700 Subject: [PATCH 17/22] cleanup --- csrc/id_model/utils.h | 55 ------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 csrc/id_model/utils.h diff --git a/csrc/id_model/utils.h b/csrc/id_model/utils.h deleted file mode 100644 index 2d6327bf586..00000000000 --- a/csrc/id_model/utils.h +++ /dev/null @@ -1,55 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA 
CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -#include -#include -#include - -#define VERBOSE() verbose(__LINE__) -#define WARN() warn(__LINE__) - -namespace nvfuser { - -// Temporary logging utility -class DebugStream { - public: - DebugStream() - : enabled_(getNvFuserEnv("ID_MODEL_VERBOSE")), out_(std::cerr) {} - - template - DebugStream& operator<<(const T& v) { - if (enabled_) { - out_ << v; - } - return *this; - } - - DebugStream& operator<<(std::ostream& (*endl)(std::ostream&)) { - if (enabled_) { - out_ << endl; - } - return *this; - } - - private: - bool enabled_ = false; - std::ostream& out_; -}; - -inline DebugStream verbose(int line) { - return DebugStream() << "[DEBUG@" << line << "] "; -} - -inline DebugStream warn(int line) { - return DebugStream() << "[WARN@" << line << "] "; -} - -} // namespace nvfuser From 1d9a67047ee79286bc4a36a5a2dd16419f7e0a4f Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 20:59:05 -0700 Subject: [PATCH 18/22] cleanup --- csrc/id_model/indexing.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index df1964e8810..e6b3faee853 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 358e54bd7363fd5f618ea702a131e63c327e0906 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 21:06:07 -0700 Subject: [PATCH 19/22] cleanup --- csrc/id_model/indexing.cpp | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index e6b3faee853..9bf70f84e49 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -104,8 +104,8 @@ std::tuple, std::vector> getAllocationDomains( contiguity = tv->domain()->contiguity(); } else { // If allocation 
domain is not set, assume that: - // Global: logical domains - // Local/Shared: loop domains to the right of the CA position + // - Global: logical domains + // - Local/Shared: loop domains to the right of the CA position const auto inlining_pos = tv->getComputeAtPosition(); if (tv->getMemoryType() == MemoryType::Global) { allocation_domains = tv->getLogicalDomain(); @@ -371,27 +371,10 @@ void TensorIndexer::buildLoopIndexMap() { } bool TensorIndexer::shouldUseZeroIndex(const ValGroup& loop_group) const { - // For parallelized domains that have index NamedScalar's such as - // threadIdx.x, just use the NamedScalar. It doesn't automatically - // mean such parallel indices are actually used in the final index - // expr. For example, TID-parallelized Local tensors won't have - // TID-parallelized iter domains as allocation domains, so threadIdx - // won't appear in the final index expr. - ParallelType ptype = getParallelType(loop_group); - if (isParallelTypeThread(ptype)) { - return false; - } - - // Note that the device paralle type is not included in - // "isThread". This is necessary because we don't have a NamedScalar - // for DID. Since it's always partitioned in any memory space - // currently supported, it's guaranteed to be zero. - if (isParallelTypeDeviceDim(ptype)) { - return true; - } - // All loops in this set are non-parallel, non-concretized broadcast - // iterdomains, their "index variable" should be zero. + // iterdomains, their "index variable" should be zero. This + // condition should be included in the next triviality check, but + // just checking isBroadcast should be more efficient. 
if (std::all_of(loop_group->begin(), loop_group->end(), [](Val* val) { return val->as()->isBroadcast(); })) { @@ -492,8 +475,6 @@ Val* TensorIndexer::getLinearIndex(TensorView* tv, const Expr* expr) { index, SimplifyingIrBuilder::mulExpr(idx, stride)); } - VERBOSE() << "Final index: " << index->toInlineString() << std::endl; - return index; } From 699cd53021c136e5cf21e24bb11408f09ff662f2 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 5 Jun 2024 21:07:29 -0700 Subject: [PATCH 20/22] cleanup --- tests/cpp/test_indexing.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index e50bdf9bd13..a1d2fe68219 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -475,28 +475,24 @@ TEST_F(IndexingTest, SimpleBroadcast2) { // allocation domain, which is mapped with the merge of the two // logical domains of tv1 on the AlmostExact graph. Traverse back to // the merge output from the loop domains. - auto tv0_producer_index_ref = SimplifyingIrBuilder::addExpr( - SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv1->axis(1)->extent()), + auto tv0_producer_index_ref = addExpr( + mulExpr(tv1_loop_indices.at(0), tv1->axis(1)->extent()), tv1_loop_indices.at(1)); // tv1 is a Local tensor, so its allocation domains are just their // loop domains. This index is mathematically equivalent to the tv0 // index, but the order of linearizing the two loop domains is // different from the order of computing the merge input index. 
- auto tv1_consumer_index_ref = SimplifyingIrBuilder::addExpr( + auto tv1_consumer_index_ref = addExpr( tv1_loop_indices.at(1), - SimplifyingIrBuilder::mulExpr( - tv1_loop_indices.at(0), tv1->axis(1)->extent())); + mulExpr(tv1_loop_indices.at(0), tv1->axis(1)->extent())); - auto tv1_producer_index_ref = SimplifyingIrBuilder::addExpr( + auto tv1_producer_index_ref = addExpr( tv2_loop_indices.at(1), - SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(0), tv2->axis(1)->extent())); + mulExpr(tv2_loop_indices.at(0), tv2->axis(1)->extent())); - auto tv2_consumer_index_ref = SimplifyingIrBuilder::addExpr( - SimplifyingIrBuilder::mulExpr( - tv2_loop_indices.at(0), tv2->axis(1)->extent()), + auto tv2_consumer_index_ref = addExpr( + mulExpr(tv2_loop_indices.at(0), tv2->axis(1)->extent()), tv2_loop_indices.at(1)); EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) From de055b3811a3c9a1af86b798fb216da5445fa1dc Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 6 Jun 2024 13:48:42 -0700 Subject: [PATCH 21/22] cleanup --- csrc/id_model/indexing.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index beff8c41918..ad58cfe9aad 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -266,7 +266,7 @@ void IdGraphIndexCompute::handle(Split* split) { const bool is_forward = isForward(split); auto inner_extent = split->inner()->extent(); - + if (is_forward) { auto in_idx = getIndex(split->in()); auto outer_idx = SimplifyingIrBuilder::divExpr(in_idx, inner_extent); @@ -344,7 +344,6 @@ void TensorIndexer::buildLoopIndexMap() { if (!ir_utils::isTvOp(expr)) { continue; } - // It's assumed that all sibling outputs share the same for-loops, // thus only one of the outputs is considered. 
auto tv_output = ir_utils::getTvOutput(expr); From 1c6ad9e107666819b07f3bbde7dc046e752e2b7a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 6 Jun 2024 17:36:43 -0700 Subject: [PATCH 22/22] Support indexing of DIDx parallelized tensors --- csrc/id_model/indexing.cpp | 7 +- tests/cpp/test_indexing.cpp | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 1 deletion(-) diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index ad58cfe9aad..ebfc12b5d5d 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -361,7 +361,12 @@ void TensorIndexer::buildLoopIndexMap() { ParallelType ptype = getParallelType(loop_group); if (isParallelTypeThread(ptype)) { loop_index = NamedScalar::getParallelIndex(ptype); - } else if (shouldUseZeroIndex(loop_group)) { + } else if ( + // TODO: Cleanup needed. ir_utils::isMemoryPartitionedAcross + // should be used, but that means we would need to consider + // multiple outputs with different memory types, though it + // should be uncommon in practice. + shouldUseZeroIndex(loop_group) || isParallelTypeDeviceDim(ptype)) { loop_index = fusion->zeroVal(); } else { loop_index = IrBuilder::create(DataType::Index); diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index a1d2fe68219..372daf6a677 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -626,4 +626,207 @@ TEST_F(IndexingTest, SimpleBroadcast4) { EXPECT_EQ(tv2_producer_index, tv4_loop_indices.at(1)); } +// Trivial example. 1D shared tensor. Each device only has one +// element, so the index should be always just zero. 
+TEST_F(IndexingTest, MultiDevice1DNoSplitMerge) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + EXPECT_TRUE(indexer.getLinearIndex(tv0, tv1->definition())->isZeroInt()); + EXPECT_TRUE(indexer.getLinearIndex(tv1, tv1->definition())->isZeroInt()); +} + +// Same fusion as MultiDevice1DNoSplitMerge but with split. +TEST_F(IndexingTest, MultiDevice1DSplit) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv0->split(0, num_devices, false); + tv1->split(0, num_devices, false); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto tv0_producer_index_ref = tv1_loop_indices.at(1); + auto tv1_consumer_index_ref = tv1_loop_indices.at(1); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +TEST_F(IndexingTest, MultiDevice2D) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv1->flatten(); + tv1->split(0, num_devices, false); + + TransformPropagator propagator(tv1); + MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto inner_dim = tv1->getLogicalDomain().at(1)->extent(); + + // Note that the allocation domain is the logical domain. See the + // next test for a leaf allocation example + auto tv0_producer_index_ref = addExpr( + modExpr(tv1_loop_indices.at(1), inner_dim), + mulExpr(divExpr(tv1_loop_indices.at(1), inner_dim), inner_dim)); + + // Should use the same index + auto tv1_consumer_index_ref = tv0_producer_index_ref; + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +// Same fusion as MultiDevice2D but with leaf allocation +TEST_F(IndexingTest, MultiDevice2DLeafAllocation) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv1->flatten(); + tv1->split(0, num_devices, false); + + TransformPropagator propagator(tv1); + MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + tv0->setAllocationDomain(tv0->getLeafDomain(), true); + tv1->setAllocationDomain(tv1->getLeafDomain(), true); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + // Since the leaf domain is the allocation domain, the index should + // be just the non-parallelized loop index + auto tv0_producer_index_ref = tv1_loop_indices.at(1); + + // Should use the same index + auto tv1_consumer_index_ref = tv0_producer_index_ref; + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". 
Actual: " << tv1_consumer_index->toInlineString(); +} + +TEST_F(IndexingTest, MultiDevice2DTranspose) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + + auto tv1 = transpose(tv0); + fusion.addOutput(tv1); + + Val* num_devices = IrBuilder::create(DataType::Index); + + tv0->split(0, num_devices, false); + tv1->split(0, num_devices, false); + + tv0->axis(0)->parallelize(ParallelType::DIDx); + tv1->axis(0)->parallelize(ParallelType::DIDx); + + IdModel id_model(&fusion); + TensorIndexer indexer(id_model); + + std::vector tv1_loop_indices = getLoopIndices(tv1, indexer); + + auto tv0_producer_index = indexer.getLinearIndex(tv0, tv1->definition()); + auto tv1_consumer_index = indexer.getLinearIndex(tv1, tv1->definition()); + + auto tv0_producer_index_ref = addExpr( + tv1_loop_indices.at(1), + mulExpr(tv1_loop_indices.at(2), tv0->getLogicalDomain().at(1)->extent())); + + // Should use the same index + auto tv1_consumer_index_ref = addExpr( + tv1_loop_indices.at(2), + mulExpr(tv1_loop_indices.at(1), tv1->getLogicalDomain().at(1)->extent())); + + EXPECT_TRUE(tv0_producer_index->sameAs(tv0_producer_index_ref)) + << "Ref: " << tv0_producer_index_ref->toInlineString() + << ". Actual: " << tv0_producer_index->toInlineString(); + + EXPECT_TRUE(tv1_consumer_index->sameAs(tv1_consumer_index_ref)) + << "Ref: " << tv1_consumer_index_ref->toInlineString() + << ". Actual: " << tv1_consumer_index->toInlineString(); +} + } // namespace nvfuser