From 616227060959975780ed745a247ffa4b4219d551 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 30 Mar 2026 16:35:14 -0400 Subject: [PATCH 1/7] [MISC] Speedup tiled hessian kernel by using direct lower-triangle indexing. (#2618) --- .../engine/solvers/rigid/constraint/solver.py | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index ed08afc47..4f669c97b 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -1559,28 +1559,41 @@ def func_hessian_direct_tiled( qd.simt.block.sync() # Compute `H += J.T @ D @ J` for a single Hessian block - pid = tid - numel = n_dofs_tile_row * n_dofs_tile_col - while pid < numel: - i_d1_ = pid // n_dofs_tile_col - i_d2_ = pid % n_dofs_tile_col - i_d1 = i_d1_ + i_d1_start - i_d2 = i_d2_ + i_d2_start - if i_d1 >= i_d2: + if is_diag_tile: + n_lower_tri_tile = n_dofs_tile_row * (n_dofs_tile_row + 1) // 2 + pid = tid + while pid < n_lower_tri_tile: + i_d1_, i_d2_ = linear_to_lower_tri(pid) + i_d1 = i_d1_ + i_d1_start + i_d2 = i_d2_ + i_d2_start coef = gs.qd_float(0.0) if i_c_start == 0: coef = rigid_global_info.mass_mat[i_d1, i_d2, i_b] - if is_diag_tile: - for j_c_ in range(n_conts_tile): - coef = coef + jac_row[j_c_, i_d1_] * jac_row[j_c_, i_d2_] * efc_D[j_c_] + for j_c_ in range(n_conts_tile): + coef = coef + jac_row[j_c_, i_d1_] * jac_row[j_c_, i_d2_] * efc_D[j_c_] + if i_c_start == 0: + constraint_state.nt_H[i_b, i_d1, i_d2] = coef else: - for j_c_ in range(n_conts_tile): - coef = coef + jac_row[j_c_, i_d1_] * jac_col[j_c_, i_d2_] * efc_D[j_c_] + constraint_state.nt_H[i_b, i_d1, i_d2] = constraint_state.nt_H[i_b, i_d1, i_d2] + coef + pid = pid + BLOCK_DIM + else: + numel = n_dofs_tile_row * n_dofs_tile_col + pid = tid + while pid < numel: + i_d1_ = pid // n_dofs_tile_col + i_d2_ = pid % n_dofs_tile_col + i_d1 = i_d1_ + i_d1_start + i_d2 = i_d2_ + i_d2_start + coef = gs.qd_float(0.0) + if i_c_start == 0: + coef = rigid_global_info.mass_mat[i_d1, i_d2, i_b] + for j_c_ in range(n_conts_tile): + coef = coef + jac_row[j_c_, i_d1_] * jac_col[j_c_, i_d2_] * efc_D[j_c_] if i_c_start == 0: constraint_state.nt_H[i_b, i_d1, i_d2] = coef else: constraint_state.nt_H[i_b, i_d1, i_d2] = constraint_state.nt_H[i_b, i_d1, i_d2] + coef - pid = pid + BLOCK_DIM + pid = pid + BLOCK_DIM qd.simt.block.sync() i_d2_start = i_d2_start + MAX_DOFS_PER_BLOCK From ad9323e88f7110fbbccd0a0a7723da11d3a70b10 Mon Sep 17 00:00:00 2001 From: Alexis DUBURCQ Date: Sat, 4 Apr 2026 10:44:10 +0200 Subject: [PATCH 2/7] [MISC] Add support of opt-in shared memory for tiled hessian to improve performance. 
(#2629) --- tests/test_rigid_physics.py | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/test_rigid_physics.py b/tests/test_rigid_physics.py index 77e04a1f0..1bc13c5f2 100644 --- a/tests/test_rigid_physics.py +++ b/tests/test_rigid_physics.py @@ -3627,6 +3627,7 @@ def rigid_solver_build(self): assert scene.rigid_solver._static_rigid_sim_config.enable_tiled_cholesky_hessian == enable_tiled_cholesky scene.step() + assert not scene.rigid_solver.get_error_envs_mask().any() assert (scene.rigid_solver.constraint_solver.constraint_state.n_constraints.to_numpy() > 0).all() nt_H = scene.rigid_solver.constraint_solver.constraint_state.nt_H.to_numpy() @@ -3636,6 +3637,50 @@ def rigid_solver_build(self): assert_allclose(*values, tol=tol) +@pytest.mark.precision("32") +@pytest.mark.parametrize("backend", [gs.cuda]) +def test_cholesky_tiling_large_shared_memory(show_viewer): + if gs.device.type != "cuda": + pytest.skip("Requires CUDA device") + + from cuda.bindings import runtime # Transitive dependency of torch CUDA + + _, max_shared_mem = runtime.cudaDeviceGetAttribute( + runtime.cudaDeviceAttr.cudaDevAttrMaxSharedMemoryPerBlockOptin, gs.device.index + ) + if max_shared_mem <= 49152: + pytest.skip("GPU does not support opt-in shared memory beyond the default 48kB") + + # Stack 17 free boxes (6 DOFs each = 102 total) to exceed the default 48kB tiling limit of 96 DOFs for f32 + scene = gs.Scene( + viewer_options=gs.options.ViewerOptions( + camera_pos=(1.5, 1.0, 2.5), + camera_lookat=(0.0, 0.0, 1.2), + ), + rigid_options=gs.options.RigidOptions( + constraint_solver=gs.constraint_solver.Newton, + sparse_solve=False, + ), + show_viewer=show_viewer, + show_FPS=False, + ) + scene.add_entity(gs.morphs.Plane()) + for i in range(17): + scene.add_entity( + gs.morphs.Box( + size=(0.1, 0.1, 0.1), + pos=(0, 0, 0.5 + i * 0.15), + ) + ) + scene.build(n_envs=2) + + assert scene.rigid_solver.n_dofs == 102 + assert scene.rigid_solver._static_rigid_sim_config.enable_tiled_cholesky_hessian + + scene.step() + assert not scene.rigid_solver.get_error_envs_mask().any() + + @pytest.mark.slow # ~100s @pytest.mark.parametrize( "n_envs, batched, backend", From 5bfb20939ff9742d7e0092a0440123e682633afd Mon Sep 17 00:00:00 2001 From: Mingrui Zhang <33411325+erizmr@users.noreply.github.com> Date: Thu, 9 Apr 2026 06:08:19 +0100 Subject: [PATCH 3/7] [MISC] Speed up rigid constraint solver init. 
(#2521) --- .../engine/solvers/rigid/constraint/solver.py | 67 ++++++++++++++++--- genesis/engine/solvers/rigid/rigid_solver.py | 24 +++++++ genesis/utils/array_class.py | 1 + 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 4f669c97b..8b755043a 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -2899,6 +2899,46 @@ def initialize_Jaref( qacc: array_class.V_ANNOTATION, constraint_state: array_class.ConstraintState, static_rigid_sim_config: qd.template(), +): + if qd.static(static_rigid_sim_config.parallel_init): + _initialize_Jaref_parallel( + qacc=qacc, + constraint_state=constraint_state, + static_rigid_sim_config=static_rigid_sim_config, + ) + else: + _initialize_Jaref_per_env( + qacc=qacc, + constraint_state=constraint_state, + static_rigid_sim_config=static_rigid_sim_config, + ) + + +@qd.func +def _initialize_Jaref_body( + i_c, + i_b, + n_dofs, + qacc: array_class.V_ANNOTATION, + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + Jaref = -constraint_state.aref[i_c, i_b] + if qd.static(static_rigid_sim_config.sparse_solve): + for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): + i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] + Jaref = Jaref + constraint_state.jac[i_c, i_d, i_b] * qacc[i_d, i_b] + else: + for i_d in range(n_dofs): + Jaref = Jaref + constraint_state.jac[i_c, i_d, i_b] * qacc[i_d, i_b] + constraint_state.Jaref[i_c, i_b] = Jaref + + +@qd.func +def _initialize_Jaref_per_env( + qacc: array_class.V_ANNOTATION, + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), ): _B = constraint_state.jac.shape[2] n_dofs = constraint_state.jac.shape[1] @@ -2906,15 +2946,24 @@ def initialize_Jaref( qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) for i_b in range(_B): for i_c in range(constraint_state.n_constraints[i_b]): - Jaref = -constraint_state.aref[i_c, i_b] - if qd.static(static_rigid_sim_config.sparse_solve): - for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): - i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] - Jaref = Jaref + constraint_state.jac[i_c, i_d, i_b] * qacc[i_d, i_b] - else: - for i_d in range(n_dofs): - Jaref = Jaref + constraint_state.jac[i_c, i_d, i_b] * qacc[i_d, i_b] - constraint_state.Jaref[i_c, i_b] = Jaref + _initialize_Jaref_body(i_c, i_b, n_dofs, qacc, constraint_state, static_rigid_sim_config) + + +@qd.func +def _initialize_Jaref_parallel( + qacc: array_class.V_ANNOTATION, + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + """Parallelizes over (constraints, envs) — better when GPU is not saturated by envs alone.""" + _B = constraint_state.jac.shape[2] + n_dofs = constraint_state.jac.shape[1] + len_constraints = constraint_state.Jaref.shape[0] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_c, i_b in qd.ndrange(len_constraints, _B): + if i_c < constraint_state.n_constraints[i_b]: + _initialize_Jaref_body(i_c, i_b, n_dofs, qacc, constraint_state, static_rigid_sim_config) @qd.func diff --git a/genesis/engine/solvers/rigid/rigid_solver.py b/genesis/engine/solvers/rigid/rigid_solver.py index 76f981812..7c06308e9 100644 --- a/genesis/engine/solvers/rigid/rigid_solver.py +++ b/genesis/engine/solvers/rigid/rigid_solver.py @@ 
-348,6 +348,29 @@ def build(self): self._func_vel_at_point = func_vel_at_point self._func_apply_coupling_force = func_apply_coupling_force + def _should_use_parallel_init(self): + """Use parallel init (ndrange over constraints+envs) when envs alone don't saturate the GPU. + + Uses hardware-derived GPU core count to determine saturation threshold, following the same + multi-backend pattern as collider.py (line 219). + """ + if gs.backend == gs.cpu or self.sim.options.requires_grad: + return False + import torch + + if torch.cuda.is_available(): + gpu_props = torch.cuda.get_device_properties(torch.cuda.current_device()) + # NVIDIA: 128 CUDA cores per SM. AMD/ROCm: 64 stream processors per CU. + cores_per_unit = 64 if torch.version.hip else 128 + gpu_cores = gpu_props.multi_processor_count * cores_per_unit + elif gs.backend == gs.metal: + # Upper-bound estimate for Apple Silicon: 40 GPU cores × 128 ALUs + gpu_cores = 5120 + else: + # Fallback for other GPU backends (e.g. Vulkan) + gpu_cores = 16384 + return self.n_envs <= gpu_cores + def _build_static_config(self): static_rigid_sim_config = dict( backend=gs.backend, @@ -366,6 +389,7 @@ def _build_static_config(self): sparse_solve=self._options.sparse_solve, integrator=self._integrator, solver_type=self._options.constraint_solver, + parallel_init=self._should_use_parallel_init(), ) if self.is_active: diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py index 237df21bd..66d14cd0d 100644 --- a/genesis/utils/array_class.py +++ b/genesis/utils/array_class.py @@ -2022,6 +2022,7 @@ class StructRigidSimStaticConfig(metaclass=AutoInitMeta): integrator: int solver_type: int requires_grad: bool + parallel_init: bool = False # parallelize init over (constraints, envs) when GPU is not saturated by envs alone enable_tiled_cholesky_mass_matrix: bool = False enable_tiled_cholesky_hessian: bool = False tiled_n_dofs_per_entity: int = -1 From 952b8c9b242305501512a4cd680f221b2e48a5f9 Mon Sep 17 00:00:00 2001 From: Anik Chaudhuri Date: Fri, 24 Apr 2026 19:01:21 +0530 Subject: [PATCH 4/7] perf improvement --- .../solvers/rigid/abd/forward_dynamics.py | 30 +++++++----- .../solvers/rigid/abd/forward_kinematics.py | 48 +++++++++++-------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/genesis/engine/solvers/rigid/abd/forward_dynamics.py b/genesis/engine/solvers/rigid/abd/forward_dynamics.py index bb0afaf64..0b31f7d71 100644 --- a/genesis/engine/solvers/rigid/abd/forward_dynamics.py +++ b/genesis/engine/solvers/rigid/abd/forward_dynamics.py @@ -1429,18 +1429,19 @@ def func_update_acc( if qd.static(static_rigid_sim_config.use_hibernation) else i_0 ) - + link_start = entities_info.link_start[i_e] + link_end = entities_info.link_end[i_e] for i_l_ in ( - range(entities_info.link_start[i_e], entities_info.link_end[i_e]) + range(link_start, link_end) if qd.static(not BW) else qd.static(range(static_rigid_sim_config.max_n_links_per_entity)) ): - i_l = i_l_ if qd.static(not BW) else (i_l_ + entities_info.link_start[i_e]) + i_l = i_l_ if qd.static(not BW) else (i_l_ + link_start) if func_check_index_range( i_l, - entities_info.link_start[i_e], - entities_info.link_end[i_e], + link_start, + link_end, BW, ): I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l @@ -1460,18 +1461,21 @@ def func_update_acc( if qd.static(update_cacc): links_state.cacc_lin[i_l, i_b] = links_state.cacc_lin[i_p, i_b] links_state.cacc_ang[i_l, i_b] = links_state.cacc_ang[i_p, i_b] - + dof_start = links_info.dof_start[I_l] + dof_end = 
links_info.dof_end[I_l] for i_d_ in ( - range(links_info.dof_start[I_l], links_info.dof_end[I_l]) + range(dof_start, dof_end) if qd.static(not BW) else qd.static(range(static_rigid_sim_config.max_n_dofs_per_link)) ): - i_d = i_d_ if qd.static(not BW) else (i_d_ + links_info.dof_start[I_l]) + i_d = i_d_ if qd.static(not BW) else (i_d_ + dof_start) - if func_check_index_range(i_d, links_info.dof_start[I_l], links_info.dof_end[I_l], BW): + if func_check_index_range(i_d, dof_start, dof_end, BW): # cacc = cacc_parent + cdofdot * qvel + cdof * qacc - local_cdd_vel = dofs_state.cdofd_vel[i_d, i_b] * dofs_state.vel[i_d, i_b] - local_cdd_ang = dofs_state.cdofd_ang[i_d, i_b] * dofs_state.vel[i_d, i_b] + vel = dofs_state.vel[i_d, i_b] + acc = dofs_state.acc[i_d, i_b] + local_cdd_vel = dofs_state.cdofd_vel[i_d, i_b] * vel + local_cdd_ang = dofs_state.cdofd_ang[i_d, i_b] * vel func_add_safe_backward(links_state.cdd_vel, [i_l, i_b], local_cdd_vel, BW) func_add_safe_backward(links_state.cdd_ang, [i_l, i_b], local_cdd_ang, BW) @@ -1479,13 +1483,13 @@ def func_update_acc( func_add_safe_backward( links_state.cacc_lin, [i_l, i_b], - local_cdd_vel + dofs_state.cdof_vel[i_d, i_b] * dofs_state.acc[i_d, i_b], + local_cdd_vel + dofs_state.cdof_vel[i_d, i_b] * acc, BW, ) func_add_safe_backward( links_state.cacc_ang, [i_l, i_b], - local_cdd_ang + dofs_state.cdof_ang[i_d, i_b] * dofs_state.acc[i_d, i_b], + local_cdd_ang + dofs_state.cdof_ang[i_d, i_b] * acc, BW, ) diff --git a/genesis/engine/solvers/rigid/abd/forward_kinematics.py b/genesis/engine/solvers/rigid/abd/forward_kinematics.py index 609f7f69f..a6a00b730 100644 --- a/genesis/engine/solvers/rigid/abd/forward_kinematics.py +++ b/genesis/engine/solvers/rigid/abd/forward_kinematics.py @@ -975,16 +975,18 @@ def func_forward_kinematics_entity( R = qd.static(func_read_field_if) WR = qd.static(func_write_and_read_field_if) i_b = qd.cast(i_b, qd.i32) + link_start = entities_info.link_start[i_e] + link_end = entities_info.link_end[i_e] # Becomes static loop in backward pass, because we assume this loop is an inner loop for i_l_ in ( - range(entities_info.link_start[i_e], entities_info.link_end[i_e]) + range(link_start, link_end) if qd.static(not BW) else qd.static(range(static_rigid_sim_config.max_n_links_per_entity)) ): - i_l = gs.qd_int(i_l_ if qd.static(not BW) else (i_l_ + entities_info.link_start[i_e])) + i_l = gs.qd_int(i_l_ if qd.static(not BW) else (i_l_ + link_start)) - if func_check_index_range(i_l, entities_info.link_start[i_e], entities_info.link_end[i_e], BW): + if func_check_index_range(i_l, link_start, link_end, BW): I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l I_l0 = (i_l, 0, i_b) @@ -1386,15 +1388,17 @@ def func_forward_velocity_entity( R = qd.static(func_read_field_if) A = qd.static(func_atomic_add_if) i_b = qd.cast(i_b, qd.i32) + link_start = entities_info.link_start[i_e] + link_end = entities_info.link_end[i_e] for i_l_ in ( - range(entities_info.link_start[i_e], entities_info.link_end[i_e]) + range(link_start, link_end) if qd.static(not BW) else qd.static(range(static_rigid_sim_config.max_n_links_per_entity)) ): - i_l = gs.qd_int(i_l_ if qd.static(not BW) else (i_l_ + entities_info.link_start[i_e])) + i_l = gs.qd_int(i_l_ if qd.static(not BW) else (i_l_ + link_start)) - if func_check_index_range(i_l, entities_info.link_start[i_e], entities_info.link_end[i_e], BW): + if func_check_index_range(i_l, link_start, link_end, BW): I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l n_joints 
= links_info.joint_end[I_l] - links_info.joint_start[I_l] @@ -1423,38 +1427,44 @@ def func_forward_velocity_entity( if joint_type == gs.JOINT_TYPE.FREE: for i_3 in qd.static(range(3)): - _vel = dofs_state.cdof_vel[dof_start + i_3, i_b] * dofs_state.vel[dof_start + i_3, i_b] - _ang = dofs_state.cdof_ang[dof_start + i_3, i_b] * dofs_state.vel[dof_start + i_3, i_b] - - cvel_vel = cvel_vel + A(links_state.cd_vel_bw, curr_I, _vel, BW) - cvel_ang = cvel_ang + A(links_state.cd_ang_bw, curr_I, _ang, BW) - + idx = dof_start + i_3 + + v = dofs_state.vel[idx, i_b] + tmp_vel = dofs_state.cdof_vel[idx, i_b] * v + tmp_ang = dofs_state.cdof_ang[idx, i_b] * v + cvel_vel = cvel_vel + A(links_state.cd_vel_bw, curr_I, tmp_vel, BW) + cvel_ang = cvel_ang + A(links_state.cd_ang_bw, curr_I, tmp_ang, BW) + + ang_curr = R(links_state.cd_ang_bw, curr_I, cvel_ang, BW) + vel_curr = R(links_state.cd_vel_bw, curr_I, cvel_vel, BW) for i_3 in qd.static(range(3)): ( dofs_state.cdofd_ang[dof_start + i_3, i_b], dofs_state.cdofd_vel[dof_start + i_3, i_b], ) = qd.Vector.zero(gs.qd_float, 3), qd.Vector.zero(gs.qd_float, 3) - + ( dofs_state.cdofd_ang[dof_start + i_3 + 3, i_b], dofs_state.cdofd_vel[dof_start + i_3 + 3, i_b], ) = gu.motion_cross_motion( - R(links_state.cd_ang_bw, curr_I, cvel_ang, BW), - R(links_state.cd_vel_bw, curr_I, cvel_vel, BW), + ang_curr, + vel_curr, dofs_state.cdof_ang[dof_start + i_3 + 3, i_b], dofs_state.cdof_vel[dof_start + i_3 + 3, i_b], ) - + if qd.static(BW): links_state.cd_vel_bw[next_I] = links_state.cd_vel_bw[curr_I] links_state.cd_ang_bw[next_I] = links_state.cd_ang_bw[curr_I] - + for i_3 in qd.static(range(3)): + idx = dof_start + i_3 + 3 + v = dofs_state.vel[idx, i_b] _vel = ( - dofs_state.cdof_vel[dof_start + i_3 + 3, i_b] * dofs_state.vel[dof_start + i_3 + 3, i_b] + dofs_state.cdof_vel[idx, i_b] * v ) _ang = ( - dofs_state.cdof_ang[dof_start + i_3 + 3, i_b] * dofs_state.vel[dof_start + i_3 + 3, i_b] + dofs_state.cdof_ang[idx, i_b] * v ) cvel_vel = cvel_vel + A(links_state.cd_vel_bw, next_I, _vel, BW) cvel_ang = cvel_ang + A(links_state.cd_ang_bw, next_I, _ang, BW) From 3db74415bfc8dca63e8ed7e0dae34649b6cbecda Mon Sep 17 00:00:00 2001 From: Anik Chaudhuri Date: Fri, 24 Apr 2026 20:04:15 +0530 Subject: [PATCH 5/7] broadphase update --- genesis/engine/solvers/rigid/collider/broadphase.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/broadphase.py b/genesis/engine/solvers/rigid/collider/broadphase.py index b44ad4e61..9e80cfc6a 100644 --- a/genesis/engine/solvers/rigid/collider/broadphase.py +++ b/genesis/engine/solvers/rigid/collider/broadphase.py @@ -50,6 +50,7 @@ def func_check_collision_valid( i_lb = geoms_info.link_idx[i_gb] # Filter out collision pairs that are involved in dynamically registered weld equality constraints + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=64) for i_eq in range(rigid_global_info.n_equalities[None], constraint_state.qd_n_equalities[i_b]): if equalities_info.eq_type[i_eq, i_b] == gs.EQUALITY_TYPE.WELD: i_leqa = equalities_info.eq_obj1id[i_eq, i_b] @@ -79,7 +80,7 @@ def func_collision_clear( ): _B = collider_state.n_contacts.shape[0] - qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=64) for i_b in range(_B): if qd.static(static_rigid_sim_config.use_hibernation): collider_state.n_contacts_hibernated[i_b] = 0 @@ -164,7 +165,7 @@ def 
func_broad_phase(
     # Clear collider state
     func_collision_clear(links_state, links_info, collider_state, static_rigid_sim_config)
 
-    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL)
+    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=64)
     for i_b in range(_B):
         axis = 0
 

From 881e4b15388cb40f230b94a0df6b25e94c326ff4 Mon Sep 17 00:00:00 2001
From: Anik Chaudhuri
Date: Fri, 24 Apr 2026 20:14:50 +0530
Subject: [PATCH 6/7] [MISC] Add loop_config to the link loop in func_forward_velocity_entity.

---
 genesis/engine/solvers/rigid/abd/forward_kinematics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genesis/engine/solvers/rigid/abd/forward_kinematics.py b/genesis/engine/solvers/rigid/abd/forward_kinematics.py
index a6a00b730..3e6464c3c 100644
--- a/genesis/engine/solvers/rigid/abd/forward_kinematics.py
+++ b/genesis/engine/solvers/rigid/abd/forward_kinematics.py
@@ -1390,7 +1390,7 @@ def func_forward_velocity_entity(
     i_b = qd.cast(i_b, qd.i32)
     link_start = entities_info.link_start[i_e]
     link_end = entities_info.link_end[i_e]
-
+    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=256)
     for i_l_ in (
         range(link_start, link_end)
         if qd.static(not BW)
         else qd.static(range(static_rigid_sim_config.max_n_links_per_entity))
     ):

From 3f16680e84b9128f07f6827814195c01f89c084 Mon Sep 17 00:00:00 2001
From: Anik Chaudhuri
Date: Fri, 24 Apr 2026 20:20:06 +0530
Subject: [PATCH 7/7] [MISC] Add loop_config to func_forward_kinematics_entity and reduce block_dim to 64 in func_forward_velocity_entity.

---
 genesis/engine/solvers/rigid/abd/forward_kinematics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/genesis/engine/solvers/rigid/abd/forward_kinematics.py b/genesis/engine/solvers/rigid/abd/forward_kinematics.py
index 3e6464c3c..f6ba49a69 100644
--- a/genesis/engine/solvers/rigid/abd/forward_kinematics.py
+++ b/genesis/engine/solvers/rigid/abd/forward_kinematics.py
@@ -979,6 +979,7 @@ def func_forward_kinematics_entity(
     link_end = entities_info.link_end[i_e]
 
     # Becomes static loop in backward pass, because we assume this loop is an inner loop
+    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=64)
     for i_l_ in (
         range(link_start, link_end)
         if qd.static(not BW)
         else qd.static(range(static_rigid_sim_config.max_n_links_per_entity))
     ):
@@ -1390,7 +1391,7 @@ def func_forward_velocity_entity(
     i_b = qd.cast(i_b, qd.i32)
     link_start = entities_info.link_start[i_e]
     link_end = entities_info.link_end[i_e]
-    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=256)
+    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=64)
     for i_l_ in (
         range(link_start, link_end)
         if qd.static(not BW)