Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/workflows/test-fabric-multi-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

# FabricFrameView multi-GPU unit tests
#
# Runs the cuda:1-parameterized unit tests in
# source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py on the
# dedicated multi-GPU runner. Kept separate from test-multi-gpu.yaml so
# changes to FabricFrameView do not gate distributed-training validation and
# vice versa. Runner preconditions (label, install step, GPU pre-flight) are
# identical to the training workflow.

name: FabricFrameView Multi-GPU Tests

on:
pull_request:
paths:
- "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
- "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
- ".github/workflows/test-fabric-multi-gpu.yaml"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
test-fabric-multi-gpu:
name: FabricFrameView multi-GPU unit tests
runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
timeout-minutes: 30
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install Isaac Lab
run: ./isaaclab.sh --install

- name: Verify multi-GPU availability
# Fail loud if the runner regressed to a single GPU. Without this, the
# test helper's ``_skip_if_unavailable`` would skip every ``cuda:1`` case
# and the job would go green with no real coverage.
run: |
echo "=== GPU Info ==="
nvidia-smi --query-gpu=index,name,memory.total --format=csv

GPU_COUNT=$(./isaaclab.sh -p -c "import torch; print(torch.cuda.device_count())")
echo "Detected $GPU_COUNT GPU(s)"

if [ "$GPU_COUNT" -lt 2 ]; then
echo "::error::At least 2 GPUs required for Fabric multi-GPU tests, found $GPU_COUNT"
exit 1
fi

- name: Run FabricFrameView multi-GPU unit tests
run: |
./isaaclab.sh -p -m pytest -m multi_gpu \
source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"

markers = [
"isaacsim_ci: mark test to run in isaacsim ci",
"multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
]

# Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Fixed
^^^^^

* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
the slow USD path on every CUDA device other than ``cuda:0``. USDRT
``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
runs on the simulation device the view was constructed with (e.g.
``cuda:1``). This unblocks distributed training where each rank is
pinned to a non-primary GPU.
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@

logger = logging.getLogger(__name__)

# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view.
# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the
# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still
# assumes a single device — to be tackled in a follow-up.
_fabric_supported_devices = ("cpu", "cuda", "cuda:0")


def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor:
"""Ensure array is compatible with Fabric kernels (2-D float32).
Expand All @@ -54,8 +48,11 @@ class FabricFrameView(BaseFrameView):
when Fabric is disabled).

When Fabric is enabled, world-pose and scale operations use Warp kernels
operating on ``omni:fabric:worldMatrix``. All other operations delegate
to the internal USD view.
operating on ``omni:fabric:worldMatrix``. Fabric acceleration runs on
the same CUDA device the view was constructed with — ``cuda:0``,
``cuda:1``, or any other available CUDA index — so this view is safe
to use from distributed-training workers pinned to non-primary GPUs.
All other operations delegate to the internal USD view.

After every Fabric write (``set_world_poses``, ``set_scales``),
:meth:`PrepareForReuse` is called on the ``PrimSelection`` to notify
Expand All @@ -78,7 +75,9 @@ def __init__(

Args:
prim_path: USD prim-path pattern to match.
device: Device for Warp arrays (``"cpu"`` or ``"cuda:0"``).
device: Device for Warp arrays. Either ``"cpu"`` or any CUDA
device string (``"cuda:0"``, ``"cuda:1"``, …); Fabric
acceleration is supported on every CUDA index.
validate_xform_ops: Whether to validate prim xform-ops.
stage: USD stage; defaults to the current sim context's stage.
**kwargs: Additional keyword arguments (ignored). Matches the signature of
Expand All @@ -92,15 +91,6 @@ def __init__(
settings = SettingsManager.instance()
self._use_fabric = bool(settings.get("/physics/fabricEnabled", False))

if self._use_fabric and self._device not in _fabric_supported_devices:
logger.warning(
f"Fabric mode is not supported on device '{self._device}'. "
"USDRT SelectPrims and Warp fabric arrays are currently "
f"only supported on {', '.join(_fabric_supported_devices)}. "
"Falling back to standard USD operations. This may impact performance."
)
self._use_fabric = False

self._fabric_initialized = False
self._fabric_usd_sync_done = False
self._fabric_selection = None
Expand Down Expand Up @@ -404,9 +394,6 @@ def _initialize_fabric(self) -> None:
)
wp.synchronize()

# The constructor should have taken care of this, but double check here to avoid regressions
assert self._device in _fabric_supported_devices

self._fabric_selection = fabric_stage.SelectPrims(
require_attrs=[
(usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),
Expand Down
105 changes: 102 additions & 3 deletions source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@

Comment thread
pv-nvidia marked this conversation as resolved.
from isaaclab.app import AppLauncher

# Kit reads ``sys.argv`` directly during startup and segfaults on pytest flags
# (e.g. ``-m multi_gpu``) that collide with its own short options. Strip
# everything but ``argv[0]`` before booting the app — the test file takes no
# CLI arguments of its own.
#
# IMPORTANT: this must stay between the ``AppLauncher`` import and the
# ``AppLauncher(...).app`` call below. Adding any CLI parser, or moving the
# AppLauncher import (or its instantiation) above this line, exposes Kit to
# pytest's argv again and re-introduces the segfault.
sys.argv = sys.argv[:1]

simulation_app = AppLauncher(headless=True).app

import pytest # noqa: E402
Expand Down Expand Up @@ -44,8 +55,19 @@ def test_setup_teardown():


def _skip_if_unavailable(device: str):
if device.startswith("cuda") and not torch.cuda.is_available():
if not device.startswith("cuda"):
return
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
Comment thread
pv-nvidia marked this conversation as resolved.
idx = int(device.split(":")[1]) if ":" in device else 0
n = torch.cuda.device_count()
if idx >= n:
# Always skip rather than fail: the dedicated multi-GPU workflow does its own
# pre-flight ``torch.cuda.device_count() >= 2`` check before invoking pytest, so
# a misconfigured multi-GPU runner is already caught there. Failing here would
# only break the standard single-GPU CI runners that legitimately can't run
# ``cuda:1+`` tests.
pytest.skip(f"{device} not available (device_count={n}) — multi-GPU test skipped")


# ------------------------------------------------------------------
Expand Down Expand Up @@ -169,7 +191,7 @@ def test_fabric_set_world_does_not_write_back_to_usd(device, view_factory):

# Verify Fabric has the new position
fab_pos, _ = view.get_world_poses()
pos_torch = wp.to_torch(fab_pos)
pos_torch = fab_pos.torch
assert torch.allclose(pos_torch, torch.tensor([[99.0, 99.0, 99.0]], device=device), atol=0.1), (
f"Fabric should have new position, got {pos_torch}"
)
Expand Down Expand Up @@ -230,6 +252,83 @@ def force_topology_changed():
# Read back — proves the rebuilt _view_to_fabric and _fabric_world_matrices
# are still consistent.
ret_pos, _ = view.get_world_poses()
pos_torch = wp.to_torch(ret_pos)
pos_torch = ret_pos.torch
expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device)
assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}"


# ------------------------------------------------------------------
# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations
# ------------------------------------------------------------------


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_world_pose_roundtrip(device, view_factory):
"""set_world_poses -> get_world_poses roundtrip works on cuda:1.

Verifies that FabricFrameView operates correctly on a non-primary CUDA
device without falling back to the USD path.
"""
bundle = view_factory(2, device)
view = bundle.view

new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device)
wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device)
view.set_world_poses(positions=new_pos)

ret_pos, _ = view.get_world_poses()
pos_torch = ret_pos.torch
expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device)
assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}"


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_no_usd_writeback(device, view_factory):
"""set_world_poses on cuda:1 does not write back to USD.

Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1
device to confirm the no-writeback invariant holds across GPU indices.
"""
bundle = view_factory(1, device)
view = bundle.view

stage = sim_utils.get_current_stage()
prim = stage.GetPrimAtPath(view.prim_paths[0])
xform_cache = UsdGeom.XformCache()
t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation()
orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])])

new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device)
wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device)
view.set_world_poses(positions=new_pos)

# USD must not have moved at all — equality, not approximate.
t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation()
usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])])
assert torch.allclose(usd_pos_after, orig_usd_pos, atol=0.0), (
f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}"
)


@pytest.mark.multi_gpu
@pytest.mark.parametrize("device", ["cuda:1"])
def test_fabric_cuda1_scales_roundtrip(device, view_factory):
"""set_scales -> get_scales roundtrip works on cuda:1.

Both write paths (``set_world_poses`` and ``set_scales``) call
``_prepare_for_reuse`` and launch on ``self._device``; this test covers
the scales path on the non-primary CUDA device.
"""
bundle = view_factory(2, device)
view = bundle.view

new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device)
wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device)
view.set_scales(new_scales)

ret_scales = view.get_scales()
scales_torch = wp.to_torch(ret_scales)
expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device)
assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"
Loading