diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index df0ad505649..dd379e6c293 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -402,9 +402,35 @@ void DynamicTransformConcretizer::concretize() { concretizeResize(); // Finally, propagate concretized domains - auto all_stmts = StmtSort::getStmts(info_.fusion(), true); + + // We need to concretize all immediate outputs of all intermediate + // expressions; even those leading to dead code branches. To do this, we + // insert all outputs from all intermediate expressions to "leaves". If we + // don't insert anything new, we are done. Otherwise, we traverse again using + // these as outputs as well, which ensures the output is sorted. + auto leaves = info_.fusion()->getTerminatingOutputs(); + auto leaves_set = + std::unordered_set(leaves.begin(), leaves.end()); + std::vector all_stmts; + bool inserted = true; + while (inserted) { + all_stmts = StmtSort::getStmts(info_.fusion(), leaves, true); + inserted = false; + for (auto stmt : all_stmts) { + if (stmt->isExpr()) { + for (auto o : stmt->as()->outputs()) { + if (leaves_set.find(o) == leaves_set.end()) { + leaves.push_back(o); + leaves_set.insert(o); + inserted = true; + } + } + } + } + } + // Concretize all vals in the final vector for (auto stmt : all_stmts) { - if (stmt->isA()) { + if (stmt->isVal()) { mutate(stmt); } } @@ -641,6 +667,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( // corresponding producer IDs std::optional id_type; + Val* extent = nullptr; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -663,6 +690,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( } else { id_type = input_id->getIterType(); } + + // Set extent expression based on producer, overwriting that of consumer + if (!extent) { + extent = input_id->extent(); + } } TORCH_INTERNAL_ASSERT( @@ -680,7 +712,7 @@ bool 
DynamicTransformConcretizer::propagateFromProducerToConsumer( consumer->toString()); auto concretized_id = - IterDomainBuilder(root_id).iter_type(*id_type).build(); + IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); registerConcretization(root_id, concretized_id); is_concretized = true; diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 5f054da2715..d6cbe3c90e9 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -618,7 +618,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( kernel_runtime->updateHeuristicsLaunchParams(new_heuristics.get()); } else { // cache miss, need to re-build an optimized graph for this case - + if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { + std::cout << "Fusion Before Concretization:" << std::endl; + fusion()->printMath(); + } // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp index 49ed3b9912d..99f3071a4da 100644 --- a/csrc/ops/alias.cpp +++ b/csrc/ops/alias.cpp @@ -116,7 +116,7 @@ TensorView* reshape(TensorView* inp_tv, const std::vector& new_sizes) { return static_reshape_output; } - auto root_domain = ops::newOutputDomain({inp_tv}, inp_tv->dtype()); + auto root_domain = ops::newOutputDomain({inp_tv}); // Create placeholder rfactor domain. Note it's not connected with the root // domain. @@ -632,7 +632,20 @@ TensorView* cat(const std::vector& inputs, int64_t cat_dim) { } // Now all of resized_inputs have the same shape as the out tensor - auto out = ops::newOutputTV(resized_inputs, dtype); + // NOTE: ops::newOutputTV would not necessarily be able to infer that the + // padded dimensions are all of the same size. 
However, we know that they are + // constructed that way, so we use ops::newOutputDomain and fix the extent. + auto out_domain = ops::newOutputDomain(resized_inputs); + // Override the concatenated dimension and insert an IterDomain with the true + // extent, if needed + if (!out_domain.at(cat_dim)->extent()->sameAs(concat_ext)) { + out_domain[cat_dim] = + IterDomainBuilder(out_domain.at(cat_dim)).extent(concat_ext).build(); + } + auto out = IrBuilder::create( + IrBuilder::create( + out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), + dtype); IrBuilder::create(out, resized_inputs, cat_dim); diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 43b7ba18206..effa5d617a7 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -198,9 +198,7 @@ IterType promoteIterType(IterType type1, IterType type2) { } } -std::vector newOutputDomain( - const std::vector& vals, - DataType dtype) { +std::vector newOutputDomain(const std::vector& vals) { std::vector tvs; for (auto val : vals) { if (val->getValType() == ValType::TensorView) { @@ -223,9 +221,10 @@ std::vector newOutputDomain( std::vector start_offsets(out_domain.size(), 0); std::vector stop_offsets(out_domain.size(), 0); std::vector extent_vals(out_domain.size(), nullptr); + std::vector mismatched_symbolic_extents(out_domain.size(), false); std::vector expanded_extent_vals(out_domain.size(), nullptr); - std::vector> iter_types( - out_domain.size(), c10::nullopt); + std::vector> iter_types( + out_domain.size(), std::nullopt); for (auto tv : tvs) { auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); @@ -236,6 +235,53 @@ std::vector newOutputDomain( " dimensions but expected ", out_domain.size()); for (const auto i : c10::irange(dom.size())) { + auto iter_type = dom[i]->getIterType(); + auto prev_iter_type = iter_types[i]; + if (prev_iter_type.has_value()) { + // Clang-tidy complains about unchecked access to optional value here + if (iter_type == IterType::Iteration && + prev_iter_type.value() ==
IterType::Symbolic) { + // Prefer the Iteration extent, since Symbolic could be broadcast + extent_vals[i] = nullptr; + } else if (iter_type == IterType::Symbolic) { + switch (prev_iter_type.value()) { + case IterType::Iteration: + // Previously found Iteration domain, so ignore all Symbolic + // domains + continue; + case IterType::Symbolic: + if (extent_vals[i]->sameAs(dom[i]->extent())) { + // matching symbolic extent + continue; + } else { + // Mismatched symbolic input extents. Any one of the symbolic + // inputs could be a Broadcast or Iteration domain. Until + // concretization, we will not know which one holds the true + // extent (or whether they all are Broadcast, so that the output + // is also Broadcast). We record that these symbolic extents + // mismatched so that we can introduce a new symbolic extent + // later. + mismatched_symbolic_extents[i] = true; + } + break; + case IterType::Broadcast: + // Previously found only broadcast, so this will either also + // broadcast or resolve those broadcasts. If the expanded + // extent of any of the broadcasts is not 1, then it will need to + // match that of the dom[i]. In either case, prefer dom[i]'s + // extent, so clear iter_types[i] and extent_vals[i] so that the + // rest of this iteration will mark output as Symbolic. 
+ iter_types[i] = std::nullopt; + extent_vals[i] = nullptr; + break; + default: + TORCH_CHECK( + false, + "Encountered unexpected IterType when creating new output domain: ", + prev_iter_type.value()); + } + } + } if (dom[i]->isBroadcast()) { if (dom[i]->hasExpandedExtent()) { expanded_extent_vals[i] = @@ -244,9 +290,9 @@ std::vector newOutputDomain( continue; } extent_vals[i] = promoteSize(extent_vals[i], dom[i]->extent()); - if (iter_types[i].has_value()) { + if (prev_iter_type.has_value()) { iter_types[i] = - promoteIterType(iter_types[i].value(), dom[i]->getIterType()); + promoteIterType(prev_iter_type.value(), dom[i]->getIterType()); } else { iter_types[i] = dom[i]->getIterType(); } @@ -268,15 +314,21 @@ std::vector newOutputDomain( } } for (const auto dim_i : c10::irange(out_domain.size())) { + auto iter_type = iter_types[dim_i]; + if (iter_type == IterType::Symbolic && mismatched_symbolic_extents[dim_i]) { + // if we have a symbolic output but the input symbolic extents did not + // match, create a new extent + extent_vals[dim_i] = IrBuilder::create(); + } if (extent_vals[dim_i] != nullptr) { TORCH_INTERNAL_ASSERT( - iter_types[dim_i].has_value(), + iter_type.has_value(), "Could not deduce iter type for new tensor view."); out_domain[dim_i] = IterDomainBuilder( IrBuilder::create(start_offsets[dim_i]), extent_vals[dim_i]) .stop_offset(IrBuilder::create(stop_offsets[dim_i])) - .iter_type(iter_types[dim_i].value()) + .iter_type(iter_type.value()) .build(); } else { out_domain[dim_i] = IterDomainBuilder( @@ -292,7 +344,7 @@ std::vector newOutputDomain( } TensorView* newOutputTV(const std::vector& vals, DataType dtype) { - auto out_domain = newOutputDomain(vals, dtype); + auto out_domain = newOutputDomain(vals); return IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index e9dc563d4fe..7af3a606986 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ 
-31,9 +31,7 @@ Val* newScalar(ValType vtype, DataType dtype); IterType promoteIterType(IterType type1, IterType type2); -std::vector newOutputDomain( - const std::vector& vals, - DataType dtype); +std::vector newOutputDomain(const std::vector& vals); TensorView* newOutputTV(const std::vector& vals, DataType dtype); diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 78e23e41040..0d5096bf156 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1001,6 +1001,155 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { reductionDynamicPadAddFusion(invocations); } +// Repro of https://github.com/NVIDIA/Fuser/issues/418 +TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03}); + auto vm = variance_mean(tv1, {2, 3, 4}, 0, true); + fusion->addOutput(vm.mean); + fusion->addOutput(vm.var); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + { // trivial reshape + auto t0 = at::randn({256, 128, 28, 28}, options); + std::vector inputs = {t0, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 4, 32, 28, 28}); + auto mu = t0_resh.mean({2, 3, 4}, true); + auto v = t0_resh.var({2, 3, 4}, true, true); + testValidate( + executor_cache.fusion(), + cg_outputs, + inputs, + {mu, v}, + __LINE__, + __FILE__); + } +} + +// 
Repro of https://github.com/NVIDIA/Fuser/issues/418 (full GroupNorm example) +TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto weight = makeSymbolicTensor({1, -1, 1, 1}); + fusion->addInput(weight); + auto bias = makeSymbolicTensor({1, -1, 1, 1}); + fusion->addInput(bias); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03}); + auto vm = variance_mean(tv1, {2, 3, 4}, 0, true); + auto eps = IrBuilder::create(1e-5); + auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps))); + auto tv3 = reshape(tv2, {v00, v01, v02, v03}); + auto tv4 = add(mul(tv3, weight), bias); + fusion->addOutput(tv4); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({256, 128, 28, 28}, options); + auto w = at::randn({1, 128, 1, 1}, options); + auto b = at::randn({1, 128, 1, 1}, options); + std::vector inputs = {t0, w, b, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 4, 32, 28, 28}); + auto mu = t0_resh.mean({2, 3, 4}, true); + auto v = t0_resh.var({2, 3, 4}, true, true); + auto gn = + ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 128, 28, 28}) * w + b; + testValidate( + executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__); +} + +// Repro of https://github.com/NVIDIA/Fuser/issues/418 (channels-last) +TEST_F(NVFuserTest, DynamicTransformIssue418FullNHWC_CUDA) { + auto 
fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto weight = makeSymbolicTensor({1, 1, 1, -1}); + fusion->addInput(weight); + auto bias = makeSymbolicTensor({1, 1, 1, -1}); + fusion->addInput(bias); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, v01, v02, div(v03, s0), s0}); + auto vm = variance_mean(tv1, {1, 2, 4}, 0, true); + auto eps = IrBuilder::create(1e-5); + auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps))); + auto tv3 = reshape(tv2, {v00, v01, v02, v03}); + auto tv4 = add(mul(tv3, weight), bias); + fusion->addOutput(tv4); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({256, 28, 28, 128}, options); + auto w = at::randn({1, 1, 1, 128}, options); + auto b = at::randn({1, 1, 1, 128}, options); + std::vector inputs = {t0, w, b, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 28, 28, 4, 32}); + auto mu = t0_resh.mean({1, 2, 4}, true); + auto v = t0_resh.var({1, 2, 4}, true, true); + auto gn = + ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 28, 28, 128}) * w + b; + testValidate( + executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__); +} + // Test that a Symbolic root/Broadcast rfactor is not concretized to // Iteration/Iteration TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { @@ -1032,4 +1181,65 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at2}, 
__LINE__, __FILE__); } +// Test dynamic pad followed by broadcast resolution +TEST_F(NVFuserTest, DynamicPadBroadcast_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + TensorView* tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeSymbolicTensor(2); + fusion.addInput(tv1); + + // 2d axis order here is YX + auto ypad = IrBuilder::create(); + fusion.addInput(ypad); + auto xpad = IrBuilder::create(); + fusion.addInput(xpad); + + // two-way resizes to cut square tv down to broadcastable size in each axis + auto tv0_pad = pad(tv0, {fusion.zeroVal(), xpad, fusion.zeroVal(), ypad}); + + // This will potentially resolve the y or x broadcast + auto p = mul(tv0_pad, tv1); + fusion.addOutput(p); + fusion.addOutput(tv0_pad); + + fusion.printMath(); + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn({5, 5}, options); + at::Tensor at_y = at::randn({5, 5}, options); + + // trivial resize + std::vector aten_inputs({at_x, at_y, 0, 0}); + std::vector outputs; + + /* + aten_inputs[2] = 0; + aten_inputs[3] = 0; + outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + testValidate(fusion_executor_cache.fusion(), outputs, aten_inputs, {at_x * + at_y}, __LINE__, __FILE__); + */ + + // shrink first axis + aten_inputs[2] = -4; + aten_inputs[3] = 0; + outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + std::cout << outputs << std::endl; + std::cout << at_x.slice(0, 0, 1) * at_y << std::endl; + std::cout << at_x.slice(0, 0, 1) << std::endl; + testValidate( + fusion_executor_cache.fusion(), + outputs, + aten_inputs, + {at_x.slice(0, 0, 1) * at_y, at_x.slice(0, 0, 1)}, + __LINE__, + __FILE__); +} + } // namespace nvfuser