Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ jobs:
- name: Check mypy
run: |
pip install uv
uv venv
uv pip install setuptools
uv run --group test mypy nemo_rl examples

sphinx-build:
Expand All @@ -143,6 +145,8 @@ jobs:
- name: build docs
run: |
pip install uv
uv venv
uv pip install setuptools
cd docs/
uv run --group docs sphinx-build --fail-on-warning --builder html . _build/html

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ pip install uv
# This ensures that the version of python used is always what we prescribe.
uv venv

# Install setuptools (required by flash-attn)
uv pip install setuptools
Comment thread
terrykong marked this conversation as resolved.

# If you cannot install at the system level, you can install for your user with
# pip install --user uv

Expand Down
7 changes: 5 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,25 @@ COPY --link 3rdparty/ ./3rdparty/

# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS=4
ARG NVTE_BUILD_THREADS_PER_JOB=1
ARG MAX_JOBS
ARG NVTE_BUILD_THREADS_PER_JOB

ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_LINK_MODE=copy

# Create and activate virtual environment
RUN <<"EOF" bash -exu
uv venv ${UV_PROJECT_ENVIRONMENT}
# Install setuptools since uv venv --seed may not install it
VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install setuptools
# uv sync has a more reliable resolver than a plain uv pip install, which can fail

# Sync each training + inference backend one at a time (since they may conflict)
# to warm the uv cache, then at the end just sync the default dependencies.
# Do everything in one layer to prevent large layers.

# The venv is symlinked to avoid bloating the layer size
VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT uv pip install setuptools # setuptools for flash-attn
uv sync --link-mode symlink --locked --no-install-project
uv sync --link-mode symlink --locked --extra vllm --no-install-project
uv sync --link-mode symlink --locked --extra mcore --no-install-project
Expand Down
1 change: 1 addition & 0 deletions nemo_rl/utils/venvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def create_local_venv(
exec_cmd.extend(["echo", f"Finished creating venv {venv_path}"])

# Always install setuptools and then run uv sync first to ensure the build requirements are set (for --no-build-isolation packages)
subprocess.run(["uv", "pip", "install", "setuptools"], env=env, check=True)
subprocess.run(["uv", "sync"], env=env, check=True)
subprocess.run(exec_cmd, env=env, check=True)

Expand Down
12 changes: 10 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies = [
"setuptools",
"torch==2.7.0",
"triton",
"flash-attn ; sys_platform == 'linux' and python_version == '3.12' and platform_machine != 'arm64' and platform_machine != 'aarch64'",
"colored==2.2.3",
"ray[default]==2.46.0",
"transformers>=4.51.0",
Expand Down Expand Up @@ -74,6 +75,8 @@ build = [
"hatchling",
# Build requirement for mcore
"pybind11",
# Build requirement for flash-attn
"psutil",
]
docs = [
"sphinx",
Expand Down Expand Up @@ -112,6 +115,7 @@ torchvision = [
triton = [
{ index = "pytorch-cu128" },
]
flash-attn = { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abiTRUE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and python_version == '3.12' and platform_machine != 'arm64' and platform_machine != 'aarch64'" }

[tool.uv.workspace]
members = [
Expand All @@ -125,8 +129,7 @@ url = "https://download.pytorch.org/whl/cu128"
explicit = true

[tool.uv]
# Currently, TE must be built with no build-isolation b/c it requires torch
no-build-isolation-package = ["transformer-engine-torch", "transformer-engine"]
no-build-isolation-package = ["transformer-engine-torch", "transformer-engine", "flash-attn", "megatron-core"]
# Always apply the build group since dependencies like TE/mcore/nemo-run require build dependencies
# and this lets us assume they are implicitly installed with a simple `uv sync`. Ideally, we'd
# avoid including these in the default dependency set, but for now it's required.
Expand All @@ -137,6 +140,11 @@ default-groups = ["dev", "build"]
# --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container)
link-mode = "copy"

# Needed when building flash-attn from source
#[[tool.uv.dependency-metadata]]
#name = "flash-attn"
#requires-dist = ["torch", "einops", "setuptools", "psutil", "ninja"]

[tool.black]
line-length = 120
include = '\.pyi?$'
Expand Down
Loading
Loading