Merged
72 commits
a177cb4
add ParallelType::Stream lowering pass in host Ir for single device f…
samnordmann Mar 26, 2025
e886941
improve comments
samnordmann Mar 26, 2025
b6c54f2
fix rebase
samnordmann Apr 16, 2025
55f510f
Merge branch 'host_irs/LoadStore_Reduction_binaryOp_support' into hos…
samnordmann Apr 16, 2025
32a8d55
temporarily disable stream pass also in the python test
samnordmann Apr 16, 2025
afbd020
lint
samnordmann Apr 16, 2025
165bd1b
move stream_parallel_type to host_ir/pass folder
samnordmann Apr 16, 2025
59ba13c
Print all ID expressions in tv->printTransforms (#4258)
jacobhinkle Apr 16, 2025
85a9463
InsertReshardingsPass decomposes matmul/linear+ReduceScatter. (#4239)
wujingyue Apr 16, 2025
34fa83b
Create kir::Continue for persistent grid short-circuit (#4260)
rdspring1 Apr 16, 2025
5ecf7fe
Remove several uses of NVFUSER_DISTRIBUTED (#4255)
wujingyue Apr 17, 2025
1484997
warp specializied tma persistent kernel, step-2, use TMA load (#4240)
liqiangxl Apr 17, 2025
9b9cd8f
Fix scheduling of split-K with smem_epilogue on Hopper (#4257)
jacobhinkle Apr 17, 2025
1bc13d8
Add NVFUSER_DUMP=sass_to_file option (#4263)
jacobhinkle Apr 17, 2025
c477a3f
disable TmaWarpSpecializedTes, it needs predicate (#4267)
liqiangxl Apr 17, 2025
f3b22ab
Create separate AsyncGroup helpers for fence, commit, and wait operat…
rdspring1 Apr 18, 2025
c1d8423
Rename LoadWarp to AsyncWarp (#4270)
rdspring1 Apr 18, 2025
6851163
Remove stale exprs (#4268)
naoyam Apr 18, 2025
5494b0a
Unskip the DeepSeek test (#4273)
wujingyue Apr 18, 2025
1181eac
minor improvements and cleanup
samnordmann Apr 18, 2025
cad9bce
further refactor of stream pass
samnordmann Apr 18, 2025
7ae7c52
improve comments clarity
samnordmann Apr 18, 2025
6dd673f
more comments
samnordmann Apr 18, 2025
bc8c2cb
Merge branch 'host_irs/LoadStore_Reduction_binaryOp_support' into hos…
samnordmann Apr 18, 2025
ac7e09a
Adding IndexPutAccumulateOp (#4063)
jjsjann123 Apr 18, 2025
f24dc13
Minor fix on inline_ptx.cpp (#4278)
zasdfgbnm Apr 18, 2025
7bc8c17
Rename `ldstMBarrierMap` -> `mbarrierMap` (#4277)
zasdfgbnm Apr 18, 2025
ed68736
`shardAllLike` accepts a list of parallel types (#4254)
Priya2698 Apr 18, 2025
c969903
Tensor-parallelize the DeepSeek V3 transformer layer (#4062)
wujingyue Apr 19, 2025
bb5b38c
Disable two flaky tests to keep CI green (#4283)
wujingyue Apr 19, 2025
39aec16
Use `tcgen05` as namespace for TMem ld/st (#4279)
zasdfgbnm Apr 19, 2025
857d1df
Use mbarrier to sync Blackwell MMA (#4276)
zasdfgbnm Apr 20, 2025
5dac8bd
Add segmentation helper functions for edge processing (#4222)
csarofeen Apr 21, 2025
da72cae
Add separate files for mutil-wave and tma approaches (#4265)
liqiangxl Apr 21, 2025
5368ed0
Clean up multi-GPU python test fixtures (#4284)
wujingyue Apr 21, 2025
fb9b956
Split insertion_info into Pipeline and WarpSpecialized parts (#4275)
rdspring1 Apr 21, 2025
0b2f5a8
Create separate CircularBufferInserter for WarpSpecialized and Pipeli…
rdspring1 Apr 22, 2025
e697ec9
Add mutex guard to protect data race on options. (#4287)
jjsjann123 Apr 22, 2025
5f9cfb0
fix register spills in thread local outer reduction (#4184)
liqiangxl Apr 23, 2025
24a5cc9
Prefer static local to static global (#4289)
wujingyue Apr 23, 2025
096b681
Move `MarkAliasAnalysisPreparePass` before `propagateShardingsPass` (…
Priya2698 Apr 23, 2025
ab8846a
Replace an ad-hoc toposort with stablyOrderedExprs (#4285)
wujingyue Apr 23, 2025
d8b8cf4
check ID coverage for reference_tv in reduction scheduler (#4223)
jjsjann123 Apr 23, 2025
13a879c
Fix bug in stablyOrderedExprs (#4292)
wujingyue Apr 23, 2025
515e65e
Deallocate HostIr Op and Test (#4286)
nsarka Apr 23, 2025
eef49fc
renaming benchmark (#4293)
jjsjann123 Apr 23, 2025
ce3d607
Issue 4063 normalization scheduler (#4281)
jjsjann123 Apr 23, 2025
db90ef0
add HirAliasSelect
samnordmann Apr 23, 2025
e32653a
replace SelectOp by HirAliasSelect in stream lowering
samnordmann Apr 23, 2025
a50b53c
add cache for tensor slicing
samnordmann Apr 23, 2025
df447be
indexAccumulate python api (#4066)
jjsjann123 Apr 23, 2025
d01c5a2
separate out tensor allocation logic
samnordmann Apr 23, 2025
85f9894
Revert "Deallocate HostIr Op and Test" (#4303)
wujingyue Apr 23, 2025
25b7695
minor cleanup
samnordmann Apr 23, 2025
a958bfc
Forward full op (#4269)
naoyam Apr 24, 2025
c9d2cc9
Update propagateSharding preseg pass for DID loop split (#3838)
Priya2698 Apr 24, 2025
7477e4b
Extract benchmarking timers into a separate class (#4291)
Priya2698 Apr 24, 2025
b2a76e9
add comment
samnordmann Apr 24, 2025
20204fc
Merge branch 'host_irs/LoadStore_Reduction_binaryOp_support' into hos…
samnordmann Apr 24, 2025
e798e06
Refactor python build (#4193)
rdspring1 Apr 24, 2025
95c9bde
Change build directory for clang-tidy in lintrunner (#4309)
rdspring1 Apr 24, 2025
fadfde5
Switch axis we use to compute swizzled_tiles (#4311)
jacobhinkle Apr 25, 2025
87be6c3
Simplify some tests since sharding propagation is in place (#4304)
wujingyue Apr 25, 2025
07effe8
More precise WAR for resize vectorization (#4305)
naoyam Apr 25, 2025
3fe1c32
[Cuda Ipc] Add barrier at the end of `IpcHandleCache::exchangeHandles…
samnordmann Apr 27, 2025
cf5c6d2
[Host ir] support for set reduce and binary op (#4146)
samnordmann Apr 27, 2025
7f7caf5
change namespace of the optimization pass to hir
samnordmann Apr 27, 2025
bfc7ba8
add HirAliasSelect (#4301)
samnordmann Apr 27, 2025
b6213f3
Merge branch 'main' of github.com:NVIDIA/Fuser into host_irs/stream_l…
samnordmann Apr 27, 2025
7777fe0
lint
samnordmann Apr 27, 2025
e517bc3
fix merge
samnordmann Apr 27, 2025
35ff4da
empty commit to trigger the CI
samnordmann Apr 28, 2025
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -31,6 +31,7 @@ jobs:
tools/pip-install-things.sh &
source tools/setup-env.sh
wait
cd python
python setup.py build --cpp=23

dynamic-type-meson:
6 changes: 6 additions & 0 deletions .github/workflows/lint.yml
@@ -51,13 +51,19 @@ jobs:

wait

# Go to python folder to build cmake files
cd python

# Run cmake build
python setup.py --cmake-only

# Generate csrc/serde/fusion_cache_generated.h
# NOTE: this might cause a compile of flatbuffers if it is missing
ninja -C build build_flatbuffer_config

# Return to root to run clang-tidy
cd ..

# Run lintrunner on all csrc files exclude benchmark and test folders
this_commit=$(git rev-parse HEAD)
git fetch origin main
12 changes: 8 additions & 4 deletions .gitignore
@@ -4,20 +4,24 @@ bin
# cmake build directory
build
.lintbin

# pip wheel directory
dist

nvfuser/version.py
nvfuser/include
nvfuser/lib
nvfuser/share
nvfuser/cmake

python/build
python/nvfuser/version.py
python/nvfuser/include
python/nvfuser/lib
python/nvfuser/share
python/nvfuser/cmake

.hypothesis
*.egg-info/
**/__pycache__
*/*.so
python/nvfuser/*.so

# Editor temporaries
*.swa
4 changes: 2 additions & 2 deletions .lintrunner.toml
@@ -16,7 +16,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'flake8==6.0.0',
'flake8==6.1.0',
]


@@ -185,7 +185,7 @@ command = [
'python3',
'tools/linter/adapters/clangtidy_linter.py',
'--binary=~/.local/bin/clang-tidy',
'--build_dir=./build',
'--build_dir=./python/build',
'--',
'@{{PATHSFILE}}'
]
49 changes: 29 additions & 20 deletions CMakeLists.txt
@@ -10,6 +10,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})
set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_PYTHON_DIR "${NVFUSER_ROOT}/python")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
@@ -212,6 +213,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
${NVFUSER_SRCS_DIR}/host_ir/pass/stream_parallel_type.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp
${NVFUSER_SRCS_DIR}/rng.cpp
@@ -239,6 +241,9 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/communication.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_tma_ws.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_multi_wave.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_outer.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp
@@ -289,13 +294,13 @@ endif()

if(BUILD_PYTHON)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/python_frontend/distributed_tensor.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_state.cpp
${NVFUSER_SRCS_DIR}/python_frontend/segmentation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation.cpp
${NVFUSER_SRCS_DIR}/python_frontend/translation_utils.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/distributed_tensor.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/fusion_state.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/segmentation.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/translation.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/translation_utils.cpp
${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
)
endif()
@@ -331,6 +336,7 @@ if(NOT MSVC)
endif()

target_compile_definitions(codegen_internal PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
target_include_directories(codegen_internal PUBLIC ${NVFUSER_PYTHON_DIR})
target_include_directories(codegen_internal SYSTEM PUBLIC
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
PRIVATE
@@ -457,31 +463,32 @@ if(BUILD_PYTHON)
# nvfuser python API sources
set(NVFUSER_PYTHON_SRCS)
list(APPEND NVFUSER_PYTHON_SRCS
${NVFUSER_SRCS_DIR}/python_frontend/multidevice_bindings.cpp
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp
${NVFUSER_SRCS_DIR}/python_frontend/schedule_bindings.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/multidevice_bindings.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/python_bindings.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/python_bindings_extension.cpp
${NVFUSER_PYTHON_DIR}/python_frontend/schedule_bindings.cpp
)

add_library(nvf_py_internal OBJECT ${NVFUSER_PYTHON_SRCS})
target_include_directories(nvf_py_internal PUBLIC ${NVFUSER_PYTHON_DIR})
target_include_directories(nvf_py_internal SYSTEM INTERFACE
${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
)

# setup python API version
add_custom_command(
OUTPUT ${NVFUSER_ROOT}/nvfuser/version.py
OUTPUT ${NVFUSER_PYTHON_DIR}/nvfuser/version.py
COMMAND
"${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${NVFUSER_ROOT}/tools/gen_nvfuser_version.py') .touch() \"
"${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py') .touch() \"
COMMAND
"${PYTHON_EXECUTABLE}" ${NVFUSER_ROOT}/tools/gen_nvfuser_version.py
DEPENDS ${NVFUSER_ROOT}/tools/gen_nvfuser_version.py
DEPENDS ${NVFUSER_ROOT}/version.txt
"${PYTHON_EXECUTABLE}" ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py
DEPENDS ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py
DEPENDS ${NVFUSER_PYTHON_DIR}/version.txt
WORKING_DIRECTORY ${NVFUSER_ROOT}/tools/
)
add_custom_target(
gen_nvfuser_version ALL
DEPENDS ${NVFUSER_ROOT}/nvfuser/version.py
DEPENDS ${NVFUSER_PYTHON_DIR}/nvfuser/version.py
)
add_dependencies(nvf_py_internal gen_nvfuser_version)

@@ -578,6 +585,7 @@ list(APPEND JIT_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
${NVFUSER_ROOT}/tests/cpp/test_index_select.cpp
${NVFUSER_ROOT}/tests/cpp/test_index_put.cpp
${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
${NVFUSER_ROOT}/tests/cpp/test_interval_analysis.cpp
${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
@@ -732,16 +740,17 @@ if(BUILD_TEST)
list(APPEND HOSTIR_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp
)
add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_host_ir)

if(BUILD_PYTHON)
set(PY_FRONTEND_TEST_SRCS)
list(APPEND PY_FRONTEND_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/python_frontend/test_nvfuser_fusion_cache.cpp
${NVFUSER_ROOT}/tests/cpp/python_frontend/test_nvfuser_fusion_definition.cpp
${NVFUSER_ROOT}/tests/cpp/python_frontend/test_nvfuser_fusion_record.cpp
${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_cache.cpp
${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_definition.cpp
${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_record.cpp
)
add_test(test_python_frontend "${PY_FRONTEND_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_python_frontend)
10 changes: 10 additions & 0 deletions README.md
@@ -38,6 +38,16 @@ PyPI: [https://pypi.org/project/nvfuser/](https://pypi.org/search/?q=nvfuser)

Docs: https://github.com/NVIDIA/Fuser/wiki

### Install From Source:
```bash
git clone https://github.com/NVIDIA/Fuser.git
cd Fuser
pip install -r python/requirements.txt

# Deprecated: [MAX_JOBS] python setup.py develop [args]
pip install --no-build-isolation -e python -v
```

Supported compilers:

**GCC:**
97 changes: 12 additions & 85 deletions benchmarks/python/core.py
@@ -4,15 +4,14 @@
from collections.abc import Iterable
import pytest_benchmark
import torch
from torch.autograd import DeviceType
from torch.profiler import profile, ProfilerActivity
from typing import List, Callable, Union
import numpy as np
from nvfuser import FusionDefinition, FusionCache
from nvfuser.pytorch_utils import DEVICE_PROPERTIES
import warnings
import thunder
from thunder.executors.nvfuserex import nvfuserex
from nvfuser.benchmark_utils import TorchProfileTimer, FusionProfileTimer

# These variables can be overwritten through CLI commands
# --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
@@ -102,34 +101,21 @@ def __init__(
self.benchmark: Underlying pytest-benchmark fixture with timer modified to use torchprofile_timer
self.current_time: Global montonic clock incremented based on elapsed CUDA time
"""

self.device = device
self.fd = None # Set through setup() for host benchmarking.
self.benchmark = benchmark_fixture

# Modify the default timer.
if device == "cuda":
# Initialize a Torch Profiler object
self.prof = profile(
activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU]
)
# Modify the default timer.
benchmark_fixture._timer = self.torchprofile_timer
benchmark_fixture._timer = TorchProfileTimer()
else:
benchmark_fixture._timer = self.fusionprofile_timer
benchmark_fixture._timer = FusionProfileTimer()
# Externally set the precision to avoid timer calibration. Since the timer uses CUDA times,
# calibration using subsequent timer calls produces invalid results.
# https://github.com/ionelmc/pytest-benchmark/blob/728752d2976ef53fde7e40beb3e55f09cf4d4736/src/pytest_benchmark/timers.py#L15
benchmark_fixture._precisions[benchmark_fixture._timer] = precision

self.benchmark = benchmark_fixture

# Global montonic clock
self.current_time = 0.0

# Specifies if the timer in host measurement is called at the start/finish of execution.
# Timings are measured at the end of execution.
self.execution_start = True

def __call__(self, function_to_benchmark: Callable, *args, **kwargs):
return self.benchmark(function_to_benchmark, *args, **kwargs)

@@ -138,73 +124,14 @@ def __getattr__(self, attr):
return getattr(self.benchmark, attr)
return super().__getattr__(attr)

def torchprofile_timer(self) -> float:
"""
Custom torchprofiler-based timer used by pytest-benchmark.
At every timer call, the profiler is stopped to compute the elapsed CUDA time
and the global clock is incremented. The profiler is restarted before returning to continue tracing.

Returns:
self.current_time: Global monotonic clock variable
"""
try:
self.prof.stop()
except AssertionError:
self.prof.start()
return self.current_time

prof_averages = self.prof.key_averages()
elapsed_cuda_time = self._get_kernel_time(prof_averages)
self._increment_global_time(elapsed_cuda_time)
# Clear the internal profiler object to avoid accumulating function events and then restart the profiler
# See PR: https://github.com/pytorch/pytorch/pull/125510
self.prof.profiler = None

return self.current_time

def fusionprofile_timer(self) -> float:
if not self.execution_start:
profile = self.fd.profile()
elapsed_host_time = profile.host_time_ms / 1e3
self._increment_global_time(elapsed_host_time)
self.execution_start = not self.execution_start
return self.current_time

def _get_kernel_time(
self, prof_averages: torch.autograd.profiler_util.EventList
) -> float:
"""
Arguments:
prof_averages: Output of self.prof.key_averages()
Returns:
time_value: Elapsed CUDA time in seconds.
"""
elapsed_cuda_time = 0
has_cuda_event = False
for event in prof_averages:
if event.device_type != DeviceType.CUDA:
continue
has_cuda_event = True
# Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
elapsed_cuda_time = (
elapsed_cuda_time + event.self_device_time_total
if hasattr(event, "self_device_time_total")
else event.self_cuda_time_total
)
assert has_cuda_event, "No CUDA events found"
return elapsed_cuda_time / 1e6

def _increment_global_time(self, elapsed_time: float) -> None:
self.current_time += elapsed_time
# Set the fd object for fusion profiling.
# fd is returned by setup() for host benchmarking.
def set_fd(self, fd):
assert isinstance(self._timer, FusionProfileTimer)
self._timer.set_fd(fd)

def cleanup(self) -> None:
"""
Stops a running torchprofiler instance if found.
"""
try:
self.prof.stop()
except AssertionError:
pass
def cleanup(self):
self._timer.cleanup()

def set_metrics(
self,
@@ -374,7 +301,7 @@ def setup():
# The host_benchmark_fn uses the `fd` object returned from setup function.
def host_benchmark_fn(inputs, fd):
# Set the fd variable used to query the profile object
nvf_benchmark.fd = fd
nvf_benchmark.set_fd(fd)
return fd.execute(inputs, profile=True)

benchmark_fn = benchmark_fn if benchmark_fn is not None else host_benchmark_fn
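The `TorchProfileTimer` and `FusionProfileTimer` classes imported above now live in `nvfuser.benchmark_utils`, so their bodies are not part of this diff. A minimal, hypothetical sketch of the pattern they share (inferred from the deleted inline implementation, names illustrative): pytest-benchmark accepts any zero-argument callable returning seconds as its timer, so a profiler-backed timer simply advances a manual monotonic clock by externally measured durations and returns it on each call.

```python
class ManualClockTimer:
    """Monotonic clock that only advances by externally measured durations."""

    def __init__(self) -> None:
        self.current_time = 0.0

    def increment(self, elapsed_seconds: float) -> None:
        # In the real timers this would be elapsed CUDA time from a torch
        # profiler trace, or host time from fd.profile().
        self.current_time += elapsed_seconds

    def __call__(self) -> float:
        # pytest-benchmark invokes the timer as a plain callable returning
        # seconds; returning a manually advanced clock keeps it monotonic.
        return self.current_time


timer = ManualClockTimer()
start = timer()
timer.increment(0.25)
assert timer() - start == 0.25
```

Because such a timer only moves when told to, calibration via repeated timer calls would measure nothing, which is why the fixture above pins `_precisions` externally instead of letting pytest-benchmark calibrate.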
4 changes: 2 additions & 2 deletions benchmarks/python/test_cross_entropy_loss.py
@@ -20,7 +20,7 @@
@pytest.mark.parametrize(
"executor", ["eager", "torchcompile", "thunder", "thunder-torchcompile"]
)
def test_rope_fwd_benchmark(
def test_cross_entropy_fwd_benchmark(
benchmark,
variation: str,
executor: str,
@@ -52,7 +52,7 @@ def fwd_call(inp):
@pytest.mark.parametrize(
"executor", ["eager", "torchcompile", "thunder", "thunder-torchcompile"]
)
def test_rope_bwd_benchmark(
def test_cross_entropy_bwd_benchmark(
benchmark,
variation: str,
executor: str,
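The renames above fix benchmark functions that were copy-pasted from the RoPE suite: pytest identifies each generated case as `function_name[param-id]`, so a cross-entropy case still named `test_rope_fwd_benchmark` would be indistinguishable from a genuine RoPE benchmark in reports. A small illustrative sketch of that id scheme (the helper name is hypothetical, not pytest API):

```python
def make_test_id(func_name: str, params: dict) -> str:
    """Roughly mimic pytest's 'name[param-ids]' test identifiers."""
    param_id = "-".join(str(v) for v in params.values())
    return f"{func_name}[{param_id}]"


# Before the rename, the cross-entropy benchmark reported under a RoPE name.
before = make_test_id("test_rope_fwd_benchmark", {"executor": "eager"})
after = make_test_id("test_cross_entropy_fwd_benchmark", {"executor": "eager"})
assert after == "test_cross_entropy_fwd_benchmark[eager]"
assert before != after
```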