From 25bba2a55d071c0ee763d06ab53e78abf330d3b4 Mon Sep 17 00:00:00 2001
From: Jayant Lohia
Date: Thu, 23 Apr 2026 09:15:20 +0000
Subject: [PATCH 1/2] [PERF IMPROVEMENT] func_update_gradient_tiled CG:
 wave-cooperative mass-matrix back-solve

Adds `func_solve_mass_tiled` in abd/forward_dynamics.py: a wave-cooperative
LDL^T back-solve for the CG solver's preconditioner step (M @ Mgrad = grad).
Structure mirrors the existing `func_cholesky_solve_tiled` (Newton path,
LL^T).

One wavefront (BLOCK_DIM=64) per (entity, env). LDS caches the entity-local
lower triangle of `mass_mat_L`, `mass_mat_D_inv`, and the working vector.
Three phases (L^T solve, D^-1 scale, L solve); inner dot products are reduced
across lanes via warp shuffle on CUDA or an LDS partial-sum buffer on AMDGPU
(same pattern as the Newton variant).

Wired into `func_update_gradient_tiled`'s CG branch, replacing the per-env
serial `for i_b in range(_B): func_solve_mass_batch(...)` loop. Gated on
`static_rigid_sim_config.enable_tiled_cholesky_hessian` (already auto-enabled
for n_dofs >= 16, LDS budget OK, n_envs <= 16384). The Newton branch is
unchanged.

Correctness (256 envs x 50 steps FP32, differential test vs baseline):
  q_mean/q_std/q_min/q_max: identical
  v_mean/v_std/v_min/v_max: 1e-6 to 2e-6 drift (FP32 noise floor)
  n_contacts_sum: 2048 == 2048 (identical)
  500-step run completes without NaN at 8192 envs

Performance (MI300X, 8192 envs x 500 steps x FP32, quadrants a732b474,
5-trial interleaved stash/pop):
  baseline: 535,261 env*steps/s (median; trials 534k-547k)
  this PR:  571,722 env*steps/s (median; trials 570k-577k)
  delta:    +6.81% median (every paired trial positive)

Scaling sweep at 100 steps (n_envs in 256/1024/4096/8192): all clean, no
regressions.

Scope: 1 new @qd.func (~150 LoC) + 10 lines replaced in
`func_update_gradient_tiled`'s CG branch. No changes to the monolith,
sparse_solve path, or Newton path.

Projected pipeline impact: at the pipeline build #54 baseline of 562,670
env*steps/s (70.84% of H100), +6.81% relative would land at ~600k, i.e.
~75.6% of H100.
---
 .../solvers/rigid/abd/forward_dynamics.py     | 150 ++++++++++++++++++
 .../engine/solvers/rigid/constraint/solver.py |  24 ++-
 2 files changed, 161 insertions(+), 13 deletions(-)

diff --git a/genesis/engine/solvers/rigid/abd/forward_dynamics.py b/genesis/engine/solvers/rigid/abd/forward_dynamics.py
index ecb764b0e..a7f82a736 100644
--- a/genesis/engine/solvers/rigid/abd/forward_dynamics.py
+++ b/genesis/engine/solvers/rigid/abd/forward_dynamics.py
@@ -1004,6 +1004,156 @@ def func_solve_mass(
     )
 
 
+@qd.func
+def func_solve_mass_tiled(
+    vec: array_class.V_ANNOTATION,
+    out: array_class.V_ANNOTATION,
+    entities_info: array_class.EntitiesInfo,
+    rigid_global_info: array_class.RigidGlobalInfo,
+    static_rigid_sim_config: qd.template(),
+):
+    """Wave-cooperative solve of `M @ out = vec` using the pre-factored LDL^T of M.
+
+    One wavefront (BLOCK_DIM=64) per (entity, env). Mirrors the structure of
+    `func_cholesky_solve_tiled` in constraint/solver.py, adapted for LDL^T
+    (unit L, diagonal D stored inverted in `mass_mat_D_inv`) and indexed per
+    entity's DoF range.
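+
+    Dense-math equivalent of the three phases: the kernel computes
+    out = inv(L) @ (D_inv * (inv(L^T) @ vec)), i.e. it applies
+    inv(L^T @ D @ L) to `vec` (the same operator as the serial
+    `func_solve_mass_batch` path it replaces).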
+
+    Three phases per entity (all serialized across i_d, parallelized across
+    lanes within each serial step):
+      Phase 1: solve L^T @ w = vec   # unit diagonal
+      Phase 2: z = D_inv * w         # elementwise
+      Phase 3: solve L @ out = z     # unit diagonal
+    """
+    BLOCK_DIM = qd.static(64)
+    MAX_DOFS = qd.static(static_rigid_sim_config.tiled_n_dofs_per_entity)
+    ENABLE_WARP_REDUCTION = qd.static(
+        static_rigid_sim_config.backend == gs.cuda and gs.qd_float == qd.f32
+    )
+    WARP_SIZE = qd.static(32)
+    NUM_WARPS = qd.static(BLOCK_DIM // WARP_SIZE)
+
+    _B = out.shape[1]
+    n_entities = entities_info.n_links.shape[0]
+
+    qd.loop_config(block_dim=BLOCK_DIM)
+    for i in range(n_entities * _B * BLOCK_DIM):
+        tid = i % BLOCK_DIM
+        i_e = (i // BLOCK_DIM) % n_entities
+        i_b = i // (BLOCK_DIM * n_entities)
+        warp_id = tid // WARP_SIZE
+        lane_id = tid % WARP_SIZE
+
+        if i_b >= _B:
+            continue
+        if not rigid_global_info.mass_mat_mask[i_e, i_b]:
+            continue
+
+        entity_dof_start = entities_info.dof_start[i_e]
+        entity_dof_end = entities_info.dof_end[i_e]
+        n_dofs = entities_info.n_dofs[i_e]
+
+        # LDS: entity-local lower-triangular L, D_inv, working vector v, and reduction buffer.
+        L = qd.simt.block.SharedArray((MAX_DOFS, MAX_DOFS + 1), gs.qd_float)
+        D_inv = qd.simt.block.SharedArray((MAX_DOFS,), gs.qd_float)
+        v = qd.simt.block.SharedArray((MAX_DOFS,), gs.qd_float)
+        partial = qd.simt.block.SharedArray(
+            (NUM_WARPS if qd.static(ENABLE_WARP_REDUCTION) else BLOCK_DIM,), gs.qd_float
+        )
+
+        # Copy entity-local lower triangle of L from global mass_mat_L into LDS.
+        # L[j_local, i_local] is the (j_local, i_local) entry below the diagonal
+        # for j_local > i_local. Source global index is (entity_dof_start + local).
+        n_sq = n_dofs * n_dofs  # scan the full square; write only the strict lower triangle
+        i_flat = tid
+        while i_flat < n_sq:
+            r = i_flat // n_dofs
+            c = i_flat % n_dofs
+            if c < r:  # strict lower triangle
+                L[r, c] = rigid_global_info.mass_mat_L[entity_dof_start + r, entity_dof_start + c, i_b]
+            i_flat = i_flat + BLOCK_DIM
+
+        # Copy D_inv and the input vector into LDS (strided).
+        k_d = tid
+        while k_d < n_dofs:
+            D_inv[k_d] = rigid_global_info.mass_mat_D_inv[entity_dof_start + k_d, i_b]
+            v[k_d] = vec[entity_dof_start + k_d, i_b]
+            k_d = k_d + BLOCK_DIM
+        qd.simt.block.sync()
+
+        # ------------------------------------------------------------------
+        # Phase 1: solve L^T @ w = vec in-place in v.
+        # For each i_d from n_dofs-1 down to 0:
+        #   w[i_d] = vec[i_d] - sum_{j > i_d} L[j, i_d] * w[j]
+        # Note: the serial scan must go DECREASING in i_d because w[j] for
+        # j > i_d must already be finalized. L is unit-diagonal, no divide.
+        # ------------------------------------------------------------------
+        for i_d_ in range(n_dofs):
+            i_d = n_dofs - 1 - i_d_
+            dot = gs.qd_float(0.0)
+            j_d = i_d + 1 + tid
+            while j_d < n_dofs:
+                dot = dot + L[j_d, i_d] * v[j_d]
+                j_d = j_d + BLOCK_DIM
+            if qd.static(ENABLE_WARP_REDUCTION):
+                for offset in qd.static([16, 8, 4, 2, 1]):
+                    dot = dot + qd.simt.warp.shfl_down_f32(qd.u32(0xFFFFFFFF), dot, offset)
+                if lane_id == 0:
+                    partial[warp_id] = dot
+            else:
+                partial[tid] = dot
+            qd.simt.block.sync()
+
+            if tid == 0:
+                total = gs.qd_float(0.0)
+                for k in qd.static(range(NUM_WARPS)) if qd.static(ENABLE_WARP_REDUCTION) else range(BLOCK_DIM):
+                    total = total + partial[k]
+                v[i_d] = v[i_d] - total
+            qd.simt.block.sync()
+
+        # ------------------------------------------------------------------
+        # Phase 2: z = D^{-1} @ w (elementwise, fully parallel).
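+        # Each lane scales its own strided elements of v in place; there is
+        # no cross-lane dependency, so no reduction buffer is needed here.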
+        # ------------------------------------------------------------------
+        k_d = tid
+        while k_d < n_dofs:
+            v[k_d] = v[k_d] * D_inv[k_d]
+            k_d = k_d + BLOCK_DIM
+        qd.simt.block.sync()
+
+        # ------------------------------------------------------------------
+        # Phase 3: solve L @ x = z in-place in v.
+        # For each i_d from 0 upward:
+        #   x[i_d] = z[i_d] - sum_{j < i_d} L[i_d, j] * x[j]
+        # ------------------------------------------------------------------
+        for i_d in range(n_dofs):
+            dot = gs.qd_float(0.0)
+            j_d = tid
+            while j_d < i_d:
+                dot = dot + L[i_d, j_d] * v[j_d]
+                j_d = j_d + BLOCK_DIM
+            if qd.static(ENABLE_WARP_REDUCTION):
+                for offset in qd.static([16, 8, 4, 2, 1]):
+                    dot = dot + qd.simt.warp.shfl_down_f32(qd.u32(0xFFFFFFFF), dot, offset)
+                if lane_id == 0:
+                    partial[warp_id] = dot
+            else:
+                partial[tid] = dot
+            qd.simt.block.sync()
+
+            if tid == 0:
+                total = gs.qd_float(0.0)
+                for k in qd.static(range(NUM_WARPS)) if qd.static(ENABLE_WARP_REDUCTION) else range(BLOCK_DIM):
+                    total = total + partial[k]
+                v[i_d] = v[i_d] - total
+            qd.simt.block.sync()
+
+        # Copy the final solution from LDS back to global out[entity_dof_start..end, i_b].
+        k_d = tid
+        while k_d < n_dofs:
+            out[entity_dof_start + k_d, i_b] = v[k_d]
+            k_d = k_d + BLOCK_DIM
+
+
 @qd.func
 def func_torque_and_passive_force(
     entities_state: array_class.EntitiesState,
diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py
index ed08afc47..bca1d1abc 100644
--- a/genesis/engine/solvers/rigid/constraint/solver.py
+++ b/genesis/engine/solvers/rigid/constraint/solver.py
@@ -9,7 +9,7 @@
 import genesis.utils.array_class as array_class
 import genesis.utils.geom as gu
 
-from genesis.engine.solvers.rigid.abd import func_solve_mass_batch
+from genesis.engine.solvers.rigid.abd import func_solve_mass_batch, func_solve_mass_tiled
 from genesis.utils.misc import qd_to_torch, indices_to_mask, assign_indexed_tensor
 
 from ..collider.contact_island import ContactIsland
@@ -2771,18 +2771,16 @@ def func_update_gradient_tiled(
     )
 
     if qd.static(static_rigid_sim_config.solver_type == gs.constraint_solver.CG):
-        qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32)
-        for i_b in range(_B):
-            func_solve_mass_batch(
-                i_b,
-                constraint_state.grad,
-                constraint_state.Mgrad,
-                array_class.PLACEHOLDER,
-                entities_info=entities_info,
-                rigid_global_info=rigid_global_info,
-                static_rigid_sim_config=static_rigid_sim_config,
-                is_backward=False,
-            )
+        # Wave-cooperative LDL^T back-solve: 1 wave (BLOCK_DIM=64) per (entity, env),
+        # mirrors func_cholesky_solve_tiled for the Newton path. Replaces the serial
+        # per-env func_solve_mass_batch loop.
+        func_solve_mass_tiled(
+            constraint_state.grad,
+            constraint_state.Mgrad,
+            entities_info=entities_info,
+            rigid_global_info=rigid_global_info,
+            static_rigid_sim_config=static_rigid_sim_config,
+        )
 
     if qd.static(static_rigid_sim_config.solver_type == gs.constraint_solver.Newton):
         func_cholesky_solve_tiled(constraint_state, static_rigid_sim_config)

From e578407ed1a9473601933dbf606deefcdc19f545 Mon Sep 17 00:00:00 2001
From: Jayant Lohia
Date: Thu, 23 Apr 2026 11:08:42 +0000
Subject: [PATCH 2/2] [PERF IMPROVEMENT] decomposed solver on AMDGPU + route
 gradient update through tiled dispatcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small changes to the decomposed solver path:

1. Enable the decomposed variant for the AMDGPU backend (mirrors upstream PR
   Genesis-Embodied-AI/Genesis#2623, which was cherry-picked only to main).
   `perf_dispatch` will now time both monolith and decomposed at warmup and
   pick whichever is faster per env-count.

2. Replace the decomposed `_kernel_update_gradient`'s per-env serial loop
   (`for i_b in range(_B): func_update_gradient_batch(i_b, ...)`) with a
   single call to the `func_update_gradient` dispatcher, which routes to
   `func_update_gradient_tiled` on AMDGPU. That in turn uses the
   wave-cooperative `func_solve_mass_tiled` (from the preceding PR),
   replacing the serial LDL^T back-solve with a BLOCK_DIM=64 cooperative
   one.

This depends on the preceding PR ("wave-cooperative LDL^T back-solve") to be
meaningful: without it the dispatcher just falls through to the same batched
code.

Correctness (256 envs x 50 steps FP32, differential vs pristine):
  q_mean/q_std/q_min/q_max: identical
  v_mean/v_std/v_min/v_max: 1e-6 to 2e-6 drift (FP32 noise floor)
  n_contacts_sum: 2048 == 2048 (identical)

Performance (MI300X, 8192 envs x 500 steps x FP32, 3-trial interleaved run
on top of PRs #14 and #15 stacked):
  baseline (those PRs only): 560-577k env*steps/s
  this PR:                   576-579k env*steps/s
  median delta:              +3.0%

Behavior at smaller n_envs is governed by `perf_dispatch` timing trials and
is workload-dependent; the change is safe-by-default because `perf_dispatch`
picks the faster of monolith/decomposed.

Caveats:
- The rewritten `_kernel_update_gradient` no longer short-circuits on
  `n_constraints[i_b] > 0 and improved[i_b]` (the dispatcher processes all
  envs). For G1-on-plane most envs stay `improved=True` for most of the
  solve, so the saved-compute benefit was small. If a workload has many envs
  converging early, this could be a small regression; `perf_dispatch` will
  then prefer the monolith.
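
For reviewers: a NumPy reference of the three-phase back-solve that
`func_solve_mass_tiled` performs per (entity, env). Illustrative sketch only,
not part of the patch; the helper and argument names are hypothetical
(`L_strict` is the strict lower triangle the kernel stages into LDS, `D_inv`
the inverted diagonal):

    import numpy as np

    def solve_mass_reference(L_strict, D_inv, vec):
        n = vec.shape[0]
        # Rebuild the unit-diagonal lower-triangular factor from its strict
        # lower triangle, as the kernel's LDS copy implicitly assumes.
        L = np.eye(n) + np.tril(L_strict, k=-1)
        w = np.linalg.solve(L.T, vec)  # Phase 1: L^T @ w = vec
        z = D_inv * w                  # Phase 2: elementwise D^{-1} scale
        return np.linalg.solve(L, z)   # Phase 3: L @ out = z

The composition applies inv(L^T @ D @ L) to `vec`, which is exactly the
operator the three serialized phases implement in LDS.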
---
 .../rigid/constraint/solver_breakdown.py      | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py
index e5cb7f655..9ebbe1052 100644
--- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py
+++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py
@@ -181,19 +181,16 @@ def _kernel_update_gradient(
     rigid_global_info: array_class.RigidGlobalInfo,
     static_rigid_sim_config: ti.template(),
 ):
-    """Step 5: Update gradient"""
-    _B = constraint_state.grad.shape[1]
-    ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32)
-    for i_b in range(_B):
-        if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]:
-            solver.func_update_gradient_batch(
-                i_b,
-                dofs_state=dofs_state,
-                entities_info=entities_info,
-                rigid_global_info=rigid_global_info,
-                constraint_state=constraint_state,
-                static_rigid_sim_config=static_rigid_sim_config,
-            )
+    """Step 5: Update gradient — delegates to the dispatcher so AMDGPU gets
+    the tiled Cholesky/mass-solve path (func_update_gradient_tiled) instead
+    of per-env serial func_update_gradient_batch."""
+    solver.func_update_gradient(
+        dofs_state=dofs_state,
+        entities_info=entities_info,
+        rigid_global_info=rigid_global_info,
+        constraint_state=constraint_state,
+        static_rigid_sim_config=static_rigid_sim_config,
+    )
 
 
 @ti.kernel(fastcache=gs.use_fastcache)
@@ -215,7 +212,7 @@ def _kernel_update_search_direction(
     )
 
 
-@solver.func_solve_body.register(is_compatible=lambda *args, **kwargs: gs.backend in {gs.cuda})
+@solver.func_solve_body.register(is_compatible=lambda *args, **kwargs: gs.backend is not gs.cpu)
 def func_solve_decomposed(
     entities_info,
     dofs_state,