isaac-sim · pv-nvidia · May 17, 2026
@@ -0,0 +1,60 @@
+# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# FabricFrameView multi-GPU unit tests
+#
+# Runs the cuda:1-parameterized unit tests in
+# source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py on the
+# dedicated multi-GPU runner.  Kept separate from test-multi-gpu.yaml so
+# changes to FabricFrameView do not gate distributed-training validation and
+# vice versa.  Runner preconditions (label, install step, GPU pre-flight) are
+# identical to the training workflow.
+
+name: FabricFrameView Multi-GPU Tests
+
+on:
+  pull_request:
+    paths:
+      - "source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py"
+      - "source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py"
+      - ".github/workflows/test-fabric-multi-gpu.yaml"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-fabric-multi-gpu:
+    name: FabricFrameView multi-GPU unit tests
+    runs-on: [self-hosted, linux, x64, gpu, multi-gpu]
+    timeout-minutes: 30
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Isaac Lab
+        run: ./isaaclab.sh --install
+
+      - name: Verify multi-GPU availability
+        # Fail loud if the runner regressed to a single GPU.  Without this, the
+        # test helper's ``_skip_if_unavailable`` would skip every ``cuda:1`` case
+        # and the job would go green with no real coverage.
+        run: |
+          echo "=== GPU Info ==="
+          nvidia-smi --query-gpu=index,name,memory.total --format=csv
+
+          GPU_COUNT=$(./isaaclab.sh -p -c "import torch; print(torch.cuda.device_count())")
+          echo "Detected $GPU_COUNT GPU(s)"
+
+          if [ "$GPU_COUNT" -lt 2 ]; then
+            echo "::error::At least 2 GPUs required for Fabric multi-GPU tests, found $GPU_COUNT"
+            exit 1
+          fi
+
+      - name: Run FabricFrameView multi-GPU unit tests
+        run: |
+          ./isaaclab.sh -p -m pytest -m multi_gpu \
+            source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py -v
diff --git a/pyproject.toml b/pyproject.toml
@@ -176,6 +176,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty"
 
 markers = [
     "isaacsim_ci: mark test to run in isaacsim ci",
+    "multi_gpu: tests that require 2+ GPUs; skipped automatically on single-GPU machines",
 ]
 
 # Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url.

diff --git a/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst b/source/isaaclab_physx/changelog.d/feat-frame-view-enable-mgpu.rst
@@ -0,0 +1,9 @@
+Fixed
+^^^^^
+
+* Fixed :class:`~isaaclab_physx.sim.views.FabricFrameView` falling back to
+  the slow USD path on every CUDA device other than ``cuda:0``.  USDRT
+  ``SelectPrims`` now accepts any CUDA device index, so Fabric acceleration
+  runs on the simulation device the view was constructed with (e.g.
+  ``cuda:1``).  This unblocks distributed training where each rank is
+  pinned to a non-primary GPU.
diff --git a/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py b/source/isaaclab_physx/isaaclab_physx/sim/views/fabric_frame_view.py
@@ -23,12 +23,6 @@
 
 logger = logging.getLogger(__name__)
 
-# TODO: extend this to ``cuda:N`` once we wire up multi-GPU support for the view.
-# Recent Kit / USDRT releases do support multi-GPU ``SelectPrims``, but the
-# rest of the FabricFrameView wiring (selections, indexed arrays, etc.) still
-# assumes a single device — to be tackled in a follow-up.
-_fabric_supported_devices = ("cpu", "cuda", "cuda:0")
-
 
 def _to_float32_2d(a: wp.array | torch.Tensor) -> wp.array | torch.Tensor:
     """Ensure array is compatible with Fabric kernels (2-D float32).
@@ -54,8 +48,11 @@ class FabricFrameView(BaseFrameView):
     when Fabric is disabled).
 
     When Fabric is enabled, world-pose and scale operations use Warp kernels
-    operating on ``omni:fabric:worldMatrix``.  All other operations delegate
-    to the internal USD view.
+    operating on ``omni:fabric:worldMatrix``.  Fabric acceleration runs on
+    the same CUDA device the view was constructed with — ``cuda:0``,
+    ``cuda:1``, or any other available CUDA index — so this view is safe
+    to use from distributed-training workers pinned to non-primary GPUs.
+    All other operations delegate to the internal USD view.
 
     After every Fabric write (``set_world_poses``, ``set_scales``),
     :meth:`PrepareForReuse` is called on the ``PrimSelection`` to notify
@@ -78,7 +75,9 @@ def __init__(
 
         Args:
             prim_path: USD prim-path pattern to match.
-            device: Device for Warp arrays (``"cpu"`` or ``"cuda:0"``).
+            device: Device for Warp arrays. Either ``"cpu"`` or any CUDA
+                device string (``"cuda:0"``, ``"cuda:1"``, …); Fabric
+                acceleration is supported on every CUDA index.
             validate_xform_ops: Whether to validate prim xform-ops.
             stage: USD stage; defaults to the current sim context's stage.
             **kwargs: Additional keyword arguments (ignored). Matches the signature of
@@ -92,15 +91,6 @@ def __init__(
         settings = SettingsManager.instance()
         self._use_fabric = bool(settings.get("/physics/fabricEnabled", False))
 
-        if self._use_fabric and self._device not in _fabric_supported_devices:
-            logger.warning(
-                f"Fabric mode is not supported on device '{self._device}'. "
-                "USDRT SelectPrims and Warp fabric arrays are currently "
-                f"only supported on {', '.join(_fabric_supported_devices)}. "
-                "Falling back to standard USD operations. This may impact performance."
-            )
-            self._use_fabric = False
-
         self._fabric_initialized = False
         self._fabric_usd_sync_done = False
         self._fabric_selection = None
@@ -404,9 +394,6 @@ def _initialize_fabric(self) -> None:
         )
         wp.synchronize()
 
-        # The constructor should have taken care of this, but double check here to avoid regressions
-        assert self._device in _fabric_supported_devices
-
         self._fabric_selection = fabric_stage.SelectPrims(
             require_attrs=[
                 (usdrt.Sdf.ValueTypeNames.UInt, self._view_index_attr, usdrt.Usd.Access.Read),

diff --git a/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py b/source/isaaclab_physx/test/sim/test_views_xform_prim_fabric.py
@@ -17,6 +17,17 @@
 
 from isaaclab.app import AppLauncher
 
+# Kit reads ``sys.argv`` directly during startup and segfaults on pytest flags
+# (e.g. ``-m multi_gpu``) that collide with its own short options.  Strip
+# everything but ``argv[0]`` before booting the app — the test file takes no
+# CLI arguments of its own.
+#
+# IMPORTANT: this must stay between the ``AppLauncher`` import and the
+# ``AppLauncher(...).app`` call below.  Adding any CLI parser, or moving the
+# AppLauncher import (or its instantiation) above this line, exposes Kit to
+# pytest's argv again and re-introduces the segfault.
+sys.argv = sys.argv[:1]
+
 simulation_app = AppLauncher(headless=True).app
 
 import pytest  # noqa: E402
@@ -44,8 +55,19 @@ def test_setup_teardown():
 
 
 def _skip_if_unavailable(device: str):
-    if device.startswith("cuda") and not torch.cuda.is_available():
+    if not device.startswith("cuda"):
+        return
+    if not torch.cuda.is_available():
         pytest.skip("CUDA not available")
+    idx = int(device.split(":")[1]) if ":" in device else 0
+    n = torch.cuda.device_count()
+    if idx >= n:
+        # Always skip rather than fail: the dedicated multi-GPU workflow does its own
+        # pre-flight ``torch.cuda.device_count() >= 2`` check before invoking pytest, so
+        # a misconfigured multi-GPU runner is already caught there.  Failing here would
+        # only break the standard single-GPU CI runners that legitimately can't run
+        # ``cuda:1+`` tests.
+        pytest.skip(f"{device} not available (device_count={n}) — multi-GPU test skipped")
 
 
 # ------------------------------------------------------------------
@@ -169,7 +191,7 @@ def test_fabric_set_world_does_not_write_back_to_usd(device, view_factory):
 
     # Verify Fabric has the new position
     fab_pos, _ = view.get_world_poses()
-    pos_torch = wp.to_torch(fab_pos)
+    pos_torch = fab_pos.torch
     assert torch.allclose(pos_torch, torch.tensor([[99.0, 99.0, 99.0]], device=device), atol=0.1), (
         f"Fabric should have new position, got {pos_torch}"
     )
@@ -230,6 +252,83 @@ def force_topology_changed():
     # Read back — proves the rebuilt _view_to_fabric and _fabric_world_matrices
     # are still consistent.
     ret_pos, _ = view.get_world_poses()
-    pos_torch = wp.to_torch(ret_pos)
+    pos_torch = ret_pos.torch
     expected = torch.tensor([[4.0, 5.0, 6.0], [4.0, 5.0, 6.0]], device=device)
     assert torch.allclose(pos_torch, expected, atol=1e-7), f"Read after rebuild failed on {device}: {pos_torch}"
+
+
+# ------------------------------------------------------------------
+# Multi-GPU tests (cuda:1) — skipped automatically on single-GPU workstations
+# ------------------------------------------------------------------
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_world_pose_roundtrip(device, view_factory):
+    """set_world_poses -> get_world_poses roundtrip works on cuda:1.
+
+    Verifies that FabricFrameView operates correctly on a non-primary CUDA
+    device without falling back to the USD path.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_pos = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_pos, 10.0, 20.0, 30.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    ret_pos, _ = view.get_world_poses()
+    pos_torch = ret_pos.torch
+    expected = torch.tensor([[10.0, 20.0, 30.0], [10.0, 20.0, 30.0]], device=device)
+    assert torch.allclose(pos_torch, expected, atol=1e-7), f"Roundtrip failed on {device}: {pos_torch}"
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_no_usd_writeback(device, view_factory):
+    """set_world_poses on cuda:1 does not write back to USD.
+
+    Mirrors test_fabric_set_world_does_not_write_back_to_usd for the cuda:1
+    device to confirm the no-writeback invariant holds across GPU indices.
+    """
+    bundle = view_factory(1, device)
+    view = bundle.view
+
+    stage = sim_utils.get_current_stage()
+    prim = stage.GetPrimAtPath(view.prim_paths[0])
+    xform_cache = UsdGeom.XformCache()
+    t_before = xform_cache.GetLocalToWorldTransform(prim).ExtractTranslation()
+    orig_usd_pos = torch.tensor([float(t_before[0]), float(t_before[1]), float(t_before[2])])
+
+    new_pos = wp.zeros((1, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=1, inputs=[new_pos, 99.0, 99.0, 99.0], device=device)
+    view.set_world_poses(positions=new_pos)
+
+    # USD must not have moved at all — equality, not approximate.
+    t_after = UsdGeom.XformCache().GetLocalToWorldTransform(prim).ExtractTranslation()
+    usd_pos_after = torch.tensor([float(t_after[0]), float(t_after[1]), float(t_after[2])])
+    assert torch.allclose(usd_pos_after, orig_usd_pos, atol=0.0), (
+        f"USD wrote back on {device}: expected {orig_usd_pos}, got {usd_pos_after}"
+    )
+
+
+@pytest.mark.multi_gpu
+@pytest.mark.parametrize("device", ["cuda:1"])
+def test_fabric_cuda1_scales_roundtrip(device, view_factory):
+    """set_scales -> get_scales roundtrip works on cuda:1.
+
+    Both write paths (``set_world_poses`` and ``set_scales``) call
+    ``_prepare_for_reuse`` and launch on ``self._device``; this test covers
+    the scales path on the non-primary CUDA device.
+    """
+    bundle = view_factory(2, device)
+    view = bundle.view
+
+    new_scales = wp.zeros((2, 3), dtype=wp.float32, device=device)
+    wp.launch(kernel=_fill_position, dim=2, inputs=[new_scales, 2.0, 3.0, 4.0], device=device)
+    view.set_scales(new_scales)
+
+    ret_scales = view.get_scales()
+    scales_torch = wp.to_torch(ret_scales)
+    expected = torch.tensor([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]], device=device)
+    assert torch.allclose(scales_torch, expected, atol=1e-7), f"Scales roundtrip failed on {device}: {scales_torch}"