diff --git a/docs/source/overview/imitation-learning/humanoids_imitation.rst b/docs/source/overview/imitation-learning/humanoids_imitation.rst index 68fc22e75f9b..d593fdfcab70 100644 --- a/docs/source/overview/imitation-learning/humanoids_imitation.rst +++ b/docs/source/overview/imitation-learning/humanoids_imitation.rst @@ -636,6 +636,22 @@ Then, from the **Isaac-GR00T** directory, install GR00T N1.5 and its dependencie MAX_JOBS=4 uv pip install --no-build-isolation 'git+https://github.com/facebookresearch/pytorch3d.git@v0.7.9' uv pip install diffusers decord zmq +.. note:: + + **If you cannot install or use flash-attn**, an optional patch is provided that switches the + bundled Eagle 2.5 VL model to PyTorch SDPA. Use this if ``flash-attn`` fails to build for your + environment, or if it installs but raises a runtime error such as + ``RuntimeError: FlashAttention only supports Ampere GPUs or newer`` (for example on Blackwell + GPUs, which ``flash-attn==2.7.1.post4`` does not have prebuilt kernels for). After the patch, + finetune and rollout run on any CUDA arch supported by your PyTorch build, at the cost of + flash-attn's training speedup. Skip the ``flash-attn`` install line above, then apply the + patch from the **Isaac-GR00T** directory (the sibling layout above means the IsaacLab + checkout is at ``../IsaacLab``): + + .. code:: bash + + git apply ../IsaacLab/scripts/imitation_learning/locomanipulation_sdg/gr00t/no_flash_attn.patch + Convert dataset to LeRobot format """"""""""""""""""""""""""""""""" diff --git a/scripts/imitation_learning/locomanipulation_sdg/gr00t/no_flash_attn.patch b/scripts/imitation_learning/locomanipulation_sdg/gr00t/no_flash_attn.patch new file mode 100644 index 000000000000..0085ec98241f --- /dev/null +++ b/scripts/imitation_learning/locomanipulation_sdg/gr00t/no_flash_attn.patch @@ -0,0 +1,63 @@ +diff --git a/gr00t/model/backbone/eagle2_hg_model/config.json b/gr00t/model/backbone/eagle2_hg_model/config.json +index 3894adf..f1b95aa 100644 +--- a/gr00t/model/backbone/eagle2_hg_model/config.json ++++ b/gr00t/model/backbone/eagle2_hg_model/config.json +@@ -1,5 +1,5 @@ + { +- "_attn_implementation": "flash_attention_2", ++ "_attn_implementation": "sdpa", + "_commit_hash": null, + "architectures": [ + "Eagle2_5_VLForConditionalGeneration" +diff --git a/gr00t/model/backbone/eagle2_hg_model/modeling_eagle2_5_vl.py b/gr00t/model/backbone/eagle2_hg_model/modeling_eagle2_5_vl.py +index a9649d5..d99b496 100755 +--- a/gr00t/model/backbone/eagle2_hg_model/modeling_eagle2_5_vl.py ++++ b/gr00t/model/backbone/eagle2_hg_model/modeling_eagle2_5_vl.py +@@ -108,7 +108,7 @@ class Eagle2_5_VLForConditionalGeneration(Eagle2_5_VLPreTrainedModel, Generation + self.vision_model = vision_model + else: + if config.vision_config.model_type == "siglip_vision_model": +- config.vision_config._attn_implementation = "flash_attention_2" ++ config.vision_config._attn_implementation = "sdpa" + self.vision_model = SiglipVisionModel(config.vision_config) + elif config.vision_config.model_type == "radio": + self.vision_model = RADIOModel(config.vision_config) +@@ -124,9 +124,7 @@ class Eagle2_5_VLForConditionalGeneration(Eagle2_5_VLPreTrainedModel, Generation + raise NotImplementedError("Phi3 is not implemented.") + # self.language_model = Phi3ForCausalLM(config.text_config) + elif config.text_config.architectures[0] == "Qwen2ForCausalLM": +- assert ( +- config.text_config._attn_implementation == "flash_attention_2" +- ), f"Qwen2 must use flash_attention_2 but got {config.text_config._attn_implementation}" ++ config.text_config._attn_implementation = "sdpa" + self.language_model = Qwen2ForCausalLM(config.text_config) + elif config.text_config.architectures[0] == "Qwen3ForCausalLM": + self.language_model = Qwen3ForCausalLM(config.text_config) +diff --git a/gr00t/model/backbone/eagle2_hg_model/radio_model.py b/gr00t/model/backbone/eagle2_hg_model/radio_model.py +index 2df0415..eb9b741 100644 +--- a/gr00t/model/backbone/eagle2_hg_model/radio_model.py ++++ b/gr00t/model/backbone/eagle2_hg_model/radio_model.py +@@ -44,12 +44,18 @@ from transformers.utils import ModelOutput + + try: # v1 + from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func +-except ImportError: # v2 +- from flash_attn.flash_attn_interface import ( +- flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func, +- ) ++except ImportError: ++ try: # v2 ++ from flash_attn.flash_attn_interface import ( ++ flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func, ++ ) ++ except ImportError: # flash-attn unavailable — RADIO vision won't work, SigLIP does ++ flash_attn_unpadded_qkvpacked_func = None + +-from flash_attn.bert_padding import pad_input, unpad_input ++try: ++ from flash_attn.bert_padding import pad_input, unpad_input ++except ImportError: ++ pad_input = unpad_input = None + + + class FlashAttention(nn.Module): diff --git a/scripts/imitation_learning/locomanipulation_sdg/gr00t/rollout_policy.py b/scripts/imitation_learning/locomanipulation_sdg/gr00t/rollout_policy.py index 681a850fa685..21cd2f826df2 100644 --- a/scripts/imitation_learning/locomanipulation_sdg/gr00t/rollout_policy.py +++ b/scripts/imitation_learning/locomanipulation_sdg/gr00t/rollout_policy.py @@ -40,6 +40,7 @@ import torch from policy import Policy +from isaaclab.envs.mdp.recorders.recorders_cfg import ActionStateRecorderManagerCfg from isaaclab.utils.datasets import EpisodeData, HDF5DatasetFileHandler from isaaclab.utils.math import convert_quat @@ -375,6 +376,9 @@ def eval_policy( env_cfg = parse_env_cfg(env_name, device=args_cli.device, num_envs=1) env_cfg.sim.device = args_cli.device + # Drop the SDG output-data recorder term: it pulls env._locomanipulation_sdg_output_data, + # which is only populated by the data-generation state machine, not during policy rollout. + env_cfg.recorders = ActionStateRecorderManagerCfg() env_cfg.recorders.dataset_export_dir_path = os.path.dirname(args_cli.output_file) env_cfg.recorders.dataset_filename = os.path.basename(args_cli.output_file)