From 375066c4068c09ccf3c3c88147ab3fe5fc09d30e Mon Sep 17 00:00:00 2001 From: Addison Date: Thu, 20 Nov 2025 11:28:05 -0600 Subject: [PATCH 1/6] add compute in its current form --- loopy/target/c/compyte | 2 +- loopy/transform/compute.py | 204 +++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 loopy/transform/compute.py diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 2b168ca39..955160ac2 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 2b168ca396aec2259da408f441f5e38ac9f95cb6 +Subproject commit 955160ac2f504dabcd8641471a56146fa1afe35d diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py new file mode 100644 index 000000000..4b325a7af --- /dev/null +++ b/loopy/transform/compute.py @@ -0,0 +1,204 @@ +# DomainChanger +# iname nesting order <=> tree +# loop transformations +# - traverse syntax tree +# - affine map inames +# +# index views for warp tiling + +from pymbolic.mapper.substitutor import make_subst_func +from loopy.kernel import LoopKernel +import islpy as isl + +import loopy as lp +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.instruction import MultiAssignmentBase +from loopy.kernel.tools import DomainChanger +from loopy.match import parse_stack_match +from loopy.symbolic import RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, aff_from_expr, aff_to_expr, pw_aff_to_expr +from loopy.transform.precompute import RuleInvocationGatherer, RuleInvocationReplacer, contains_a_subst_rule_invocation +from loopy.translation_unit import TranslationUnit + +import pymbolic.primitives as prim +from pymbolic import var + +from pytools.tag import Tag + + +def compute( + t_unit: TranslationUnit, +substitution: str, + *args, + **kwargs + ) -> TranslationUnit: + """ + Entrypoint for performing a compute transformation on all kernels in a + translation unit. See :func:`_compute_inner` for more details. + """ + + assert isinstance(t_unit, TranslationUnit) + new_callables = {} + + for id, callable in t_unit.callables_table.items(): + if isinstance(callable, CallableKernel): + kernel = _compute_inner( + callable.subkernel, + substitution, + *args, **kwargs + ) + + callable = callable.copy(subkernel=kernel) + elif isinstance(callable, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[id] = callable + + return t_unit + +def _compute_inner( + kernel: LoopKernel, + substitution: str, + transform_map: isl.Map, + compute_map: isl.Map, + storage_inames: list[str], + default_tag: Tag | str | None = None, + temporary_address_space: AddressSpace | None = None + ) -> LoopKernel: + """ + Inserts an instruction to compute an expression given by :arg:`substitution` + and replaces all invocations of :arg:`substitution` with the result of the + compute instruction. + + :arg substitution: The substitution rule for which the compute + transform should be applied. + + :arg transform_map: An :class:`isl.Map` representing the affine + transformation from the original iname domain to the transformed iname + domain. + + :arg compute_map: An :class:`isl.Map` representing a relation between + substitution rule indices and tuples `(a, l)`, where `a` is a vector of + storage indices and `l` is a vector of "timestamps". This map describes + """ + + if not temporary_address_space: + temporary_address_space = AddressSpace.GLOBAL + + # {{{ normalize names + + iname_to_storage_map = { + iname : (iname + "_store" if iname in kernel.all_inames() else iname) + for iname in storage_inames + } + + new_storage_axes = list(iname_to_storage_map.values()) + + for dim in range(compute_map.dim(isl.dim_type.out)): + for iname, storage_ax in iname_to_storage_map.items(): + if compute_map.get_dim_name(isl.dim_type.out, dim) == iname: + compute_map = compute_map.set_dim_name( + isl.dim_type.out, dim, storage_ax + ) + + # }}} + + # {{{ update kernel domain to contain storage inames + + storage_domain = compute_map.range().project_out_except( + new_storage_axes, [isl.dim_type.set] + ) + + # FIXME: likely need to do some more digging to find proper domain to update + new_domain = kernel.domains[0] + for ax in new_storage_axes: + new_domain = new_domain.add_dims(isl.dim_type.set, 1) + + new_domain = new_domain.set_dim_name( + isl.dim_type.set, + new_domain.dim(isl.dim_type.set) - 1, + ax + ) + + new_domain, storage_domain = isl.align_two(new_domain, storage_domain) + new_domain = new_domain & storage_domain + kernel = kernel.copy(domains=[new_domain]) + + # }}} + + # {{{ express substitution inputs as pw affs of (storage, time) names + + compute_pw_aff = compute_map.reverse().as_pw_multi_aff() + subst_inp_names = [ + compute_map.get_dim_name(isl.dim_type.in_, i) + for i in range(compute_map.dim(isl.dim_type.in_)) + ] + storage_ax_to_global_expr = dict.fromkeys(subst_inp_names) + for dim in range(compute_pw_aff.dim(isl.dim_type.out)): + subst_inp = compute_map.get_dim_name(isl.dim_type.in_, dim) + storage_ax_to_global_expr[subst_inp] = \ + pw_aff_to_expr(compute_pw_aff.get_at(dim)) + + # }}} + + # {{{ generate instruction from compute map + + rule_mapping_ctx = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + expr_subst_map = RuleAwareSubstitutionMapper( + rule_mapping_ctx, + make_subst_func(storage_ax_to_global_expr), + within=parse_stack_match(None) + ) + + subst_expr = kernel.substitutions[substitution].expression + compute_expression = expr_subst_map(subst_expr, kernel, None) + + temporary_name = substitution + "_temp" + assignee = var(temporary_name)[tuple( + var(iname) for iname in new_storage_axes + )] + + compute_insn_id = substitution + "_compute" + compute_insn = lp.Assignment( + id=compute_insn_id, + assignee=assignee, + expression=compute_expression, + ) + + compute_dep_id = compute_insn_id + new_insns = [compute_insn] + + # add global sync if we are storing in global memory + if temporary_address_space == lp.AddressSpace.GLOBAL: + gbarrier_id = kernel.make_unique_instruction_id( + based_on=substitution + "_barrier" + ) + + from loopy.kernel.instruction import BarrierInstruction + barrier_insn = BarrierInstruction( + id=gbarrier_id, + depends_on=frozenset([compute_insn_id]), + synchronization_kind="global", + mem_kind="global" + ) + + compute_dep_id = gbarrier_id + + # }}} + + # {{{ replace substitution rule with newly created instruction + + # FIXME: get these properly (see `precompute`) + subst_name = substitution + subst_tag = None + within = None # do we need this? + + + + # }}} + + return kernel From 745f841ca174d2cf97f1bcf0335330436e991b8f Mon Sep 17 00:00:00 2001 From: Addison Date: Thu, 20 Nov 2025 11:41:05 -0600 Subject: [PATCH 2/6] align compyte with inducer/main --- loopy/target/c/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 955160ac2..2b168ca39 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 955160ac2f504dabcd8641471a56146fa1afe35d +Subproject commit 2b168ca396aec2259da408f441f5e38ac9f95cb6 From 88966446dc586b0a0d2ef44b9f410bc330774f21 Mon Sep 17 00:00:00 2001 From: Addison Date: Thu, 20 Nov 2025 11:49:42 -0600 Subject: [PATCH 3/6] clean up comments and typos --- loopy/transform/compute.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py index 4b325a7af..e1e54fa2d 100644 --- a/loopy/transform/compute.py +++ b/loopy/transform/compute.py @@ -1,27 +1,19 @@ -# DomainChanger -# iname nesting order <=> tree -# loop transformations -# - traverse syntax tree -# - affine map inames -# -# index views for warp tiling - -from pymbolic.mapper.substitutor import make_subst_func -from loopy.kernel import LoopKernel import islpy as isl import loopy as lp +from loopy.kernel import LoopKernel from loopy.kernel.data import AddressSpace from loopy.kernel.function_interface import CallableKernel, ScalarCallable -from loopy.kernel.instruction import MultiAssignmentBase -from loopy.kernel.tools import DomainChanger from loopy.match import parse_stack_match -from loopy.symbolic import RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, aff_from_expr, aff_to_expr, pw_aff_to_expr -from loopy.transform.precompute import RuleInvocationGatherer, RuleInvocationReplacer, contains_a_subst_rule_invocation +from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext, + pw_aff_to_expr +) from loopy.translation_unit import TranslationUnit -import pymbolic.primitives as prim from pymbolic import var +from pymbolic.mapper.substitutor import make_subst_func from pytools.tag import Tag @@ -81,7 +73,7 @@ def _compute_inner( :arg compute_map: An :class:`isl.Map` representing a relation between substitution rule indices and tuples `(a, l)`, where `a` is a vector of - storage indices and `l` is a vector of "timestamps". This map describes + storage indices and `l` is a vector of "timestamps". """ if not temporary_address_space: From 6230e11ff78e97bd074a19910c9094f38cbf0a69 Mon Sep 17 00:00:00 2001 From: Addison Date: Wed, 10 Dec 2025 00:46:09 -0600 Subject: [PATCH 4/6] switch to alpha namedisl usage --- loopy/transform/compute.py | 119 +++++++------------------------------ 1 file changed, 22 insertions(+), 97 deletions(-) diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py index e1e54fa2d..cb01401db 100644 --- a/loopy/transform/compute.py +++ b/loopy/transform/compute.py @@ -1,16 +1,18 @@ import islpy as isl +import namedisl as nisl import loopy as lp from loopy.kernel import LoopKernel from loopy.kernel.data import AddressSpace -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.instruction import MultiAssignmentBase from loopy.match import parse_stack_match from loopy.symbolic import ( RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, pw_aff_to_expr ) -from loopy.translation_unit import TranslationUnit +from loopy.transform.precompute import contains_a_subst_rule_invocation +from loopy.translation_unit import for_each_kernel from pymbolic import var from pymbolic.mapper.substitutor import make_subst_func @@ -18,43 +20,11 @@ from pytools.tag import Tag +@for_each_kernel def compute( - t_unit: TranslationUnit, -substitution: str, - *args, - **kwargs - ) -> TranslationUnit: - """ - Entrypoint for performing a compute transformation on all kernels in a - translation unit. See :func:`_compute_inner` for more details. - """ - - assert isinstance(t_unit, TranslationUnit) - new_callables = {} - - for id, callable in t_unit.callables_table.items(): - if isinstance(callable, CallableKernel): - kernel = _compute_inner( - callable.subkernel, - substitution, - *args, **kwargs - ) - - callable = callable.copy(subkernel=kernel) - elif isinstance(callable, ScalarCallable): - pass - else: - raise NotImplementedError() - - new_callables[id] = callable - - return t_unit - -def _compute_inner( kernel: LoopKernel, substitution: str, - transform_map: isl.Map, - compute_map: isl.Map, + compute_map: isl.Map | nisl.Map, storage_inames: list[str], default_tag: Tag | str | None = None, temporary_address_space: AddressSpace | None = None @@ -67,14 +37,12 @@ def _compute_inner( :arg substitution: The substitution rule for which the compute transform should be applied. - :arg transform_map: An :class:`isl.Map` representing the affine - transformation from the original iname domain to the transformed iname - domain. - :arg compute_map: An :class:`isl.Map` representing a relation between substitution rule indices and tuples `(a, l)`, where `a` is a vector of storage indices and `l` is a vector of "timestamps". """ + if isinstance(compute_map, isl.Map): + compute_map = nisl.make_map(compute_map) if not temporary_address_space: temporary_address_space = AddressSpace.GLOBAL @@ -86,52 +54,29 @@ def _compute_inner( for iname in storage_inames } - new_storage_axes = list(iname_to_storage_map.values()) - - for dim in range(compute_map.dim(isl.dim_type.out)): - for iname, storage_ax in iname_to_storage_map.items(): - if compute_map.get_dim_name(isl.dim_type.out, dim) == iname: - compute_map = compute_map.set_dim_name( - isl.dim_type.out, dim, storage_ax - ) + compute_map = compute_map.rename_dims(iname_to_storage_map) # }}} # {{{ update kernel domain to contain storage inames - storage_domain = compute_map.range().project_out_except( - new_storage_axes, [isl.dim_type.set] - ) + new_storage_axes = list(iname_to_storage_map.values()) - # FIXME: likely need to do some more digging to find proper domain to update + # FIXME: use DomainChanger to add domain to kernel + storage_domain = compute_map.range().project_out_except(new_storage_axes) new_domain = kernel.domains[0] - for ax in new_storage_axes: - new_domain = new_domain.add_dims(isl.dim_type.set, 1) - - new_domain = new_domain.set_dim_name( - isl.dim_type.set, - new_domain.dim(isl.dim_type.set) - 1, - ax - ) - - new_domain, storage_domain = isl.align_two(new_domain, storage_domain) - new_domain = new_domain & storage_domain - kernel = kernel.copy(domains=[new_domain]) # }}} # {{{ express substitution inputs as pw affs of (storage, time) names compute_pw_aff = compute_map.reverse().as_pw_multi_aff() - subst_inp_names = [ - compute_map.get_dim_name(isl.dim_type.in_, i) - for i in range(compute_map.dim(isl.dim_type.in_)) - ] - storage_ax_to_global_expr = dict.fromkeys(subst_inp_names) - for dim in range(compute_pw_aff.dim(isl.dim_type.out)): - subst_inp = compute_map.get_dim_name(isl.dim_type.in_, dim) - storage_ax_to_global_expr[subst_inp] = \ - pw_aff_to_expr(compute_pw_aff.get_at(dim)) + + # FIXME: remove PwAff._obj usage when ready + storage_ax_to_global_expr = { + dim_name : pw_aff_to_expr(compute_pw_aff.get_at(dim_name)._obj) + for dim_name in compute_map.dim_type_names(isl.dim_type.in_) + } # }}} @@ -161,34 +106,14 @@ def _compute_inner( expression=compute_expression, ) - compute_dep_id = compute_insn_id - new_insns = [compute_insn] - - # add global sync if we are storing in global memory - if temporary_address_space == lp.AddressSpace.GLOBAL: - gbarrier_id = kernel.make_unique_instruction_id( - based_on=substitution + "_barrier" - ) - - from loopy.kernel.instruction import BarrierInstruction - barrier_insn = BarrierInstruction( - id=gbarrier_id, - depends_on=frozenset([compute_insn_id]), - synchronization_kind="global", - mem_kind="global" - ) - - compute_dep_id = gbarrier_id - # }}} # {{{ replace substitution rule with newly created instruction - # FIXME: get these properly (see `precompute`) - subst_name = substitution - subst_tag = None - within = None # do we need this? - + for insn in kernel.instructions: + if contains_a_subst_rule_invocation(kernel, insn) \ + and isinstance(insn, MultiAssignmentBase): + print(insn) # }}} From 80839bad239b9833b7804ab2f750a7bd03fb38a8 Mon Sep 17 00:00:00 2001 From: Addison Date: Wed, 10 Dec 2025 12:40:49 -0600 Subject: [PATCH 5/6] start using namedisl in places other than compute --- loopy/symbolic.py | 37 ++++++++++++++++++++++++++----------- loopy/transform/compute.py | 2 +- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ba6d71a80..442eb8572 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -48,6 +48,7 @@ from constantdict import constantdict from typing_extensions import Self, override +import namedisl as nisl import islpy as isl import pymbolic.primitives as p import pytools.lex @@ -2044,23 +2045,37 @@ def map_subscript(self, expr: p.Subscript) -> Set[p.Subscript]: # {{{ (pw)aff to expr conversion -def aff_to_expr(aff: isl.Aff) -> ArithmeticExpression: +def aff_to_expr(aff: isl.Aff | nisl.Aff) -> ArithmeticExpression: from pymbolic import var denom = aff.get_denominator_val().to_python() - result = (aff.get_constant_val()*denom).to_python() - for dt in [isl.dim_type.in_, isl.dim_type.param]: - for i in range(aff.dim(dt)): - coeff = (aff.get_coefficient_val(dt, i)*denom).to_python() + if isinstance(aff, isl.Aff): + for dt in [isl.dim_type.in_, isl.dim_type.param]: + for i in range(aff.dim(dt)): + coeff = (aff.get_coefficient_val(dt, i)*denom).to_python() + if coeff: + dim_name = not_none(aff.get_dim_name(dt, i)) + result += coeff*var(dim_name) + + for i in range(aff.dim(isl.dim_type.div)): + coeff = (aff.get_coefficient_val(isl.dim_type.div, i)*denom).to_python() + if coeff: + result += coeff*aff_to_expr(aff.get_div(i)) + + else: + in_names = set(aff.dim_type_names(isl.dim_type.in_)) + param_names = set(aff.dim_type_names(isl.dim_type.param)) + + for name in in_names | param_names: + coeff = (aff.get_coefficient_val(name) * denom).to_python() if coeff: - dim_name = not_none(aff.get_dim_name(dt, i)) - result += coeff*var(dim_name) + result = coeff * var(name) - for i in range(aff.dim(isl.dim_type.div)): - coeff = (aff.get_coefficient_val(isl.dim_type.div, i)*denom).to_python() - if coeff: - result += coeff*aff_to_expr(aff.get_div(i)) + for name in aff.dim_type_names(isl.dim_type.div): + coeff = (aff.get_coefficient_val(name) * denom).to_python() + if coeff: + result += coeff * aff_to_expr(aff.get_div(name)) assert not isinstance(result, complex) return flatten(result // denom) diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py index cb01401db..b3e06f2e9 100644 --- a/loopy/transform/compute.py +++ b/loopy/transform/compute.py @@ -74,7 +74,7 @@ def compute( # FIXME: remove PwAff._obj usage when ready storage_ax_to_global_expr = { - dim_name : pw_aff_to_expr(compute_pw_aff.get_at(dim_name)._obj) + dim_name : pw_aff_to_expr(compute_pw_aff.get_at(dim_name)) for dim_name in compute_map.dim_type_names(isl.dim_type.in_) } From c1ba35bb2d20f093f86df785378b63b1092ba0dd Mon Sep 17 00:00:00 2001 From: Addison Date: Wed, 10 Dec 2025 12:43:44 -0600 Subject: [PATCH 6/6] add namedisl objects to a type signature --- loopy/symbolic.py | 3 ++- loopy/transform/compute.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 442eb8572..f47e32f9d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2048,6 +2048,7 @@ def map_subscript(self, expr: p.Subscript) -> Set[p.Subscript]: def aff_to_expr(aff: isl.Aff | nisl.Aff) -> ArithmeticExpression: from pymbolic import var + # FIXME: remove this once namedisl is the standard in loopy denom = aff.get_denominator_val().to_python() result = (aff.get_constant_val()*denom).to_python() if isinstance(aff, isl.Aff): @@ -2082,7 +2083,7 @@ def aff_to_expr(aff: isl.Aff | nisl.Aff) -> ArithmeticExpression: def pw_aff_to_expr( - pw_aff: int | isl.PwAff | isl.Aff, + pw_aff: int | isl.PwAff | isl.Aff | nisl.PwAff | nisl.Aff, int_ok: bool = False ) -> ArithmeticExpression: if isinstance(pw_aff, int): diff --git a/loopy/transform/compute.py b/loopy/transform/compute.py index b3e06f2e9..59ddf8a2e 100644 --- a/loopy/transform/compute.py +++ b/loopy/transform/compute.py @@ -72,7 +72,6 @@ def compute( compute_pw_aff = compute_map.reverse().as_pw_multi_aff() - # FIXME: remove PwAff._obj usage when ready storage_ax_to_global_expr = { dim_name : pw_aff_to_expr(compute_pw_aff.get_at(dim_name)) for dim_name in compute_map.dim_type_names(isl.dim_type.in_)