46 changes: 46 additions & 0 deletions configs/agents/rl/basic/cart_pole/train_config_grpo.json
@@ -0,0 +1,46 @@
{
"trainer": {
"exp_name": "cart_pole_grpo",
"gym_config": "configs/agents/rl/basic/cart_pole/gym_config.json",
"seed": 42,
"device": "cpu",
"headless": true,
"enable_rt": false,
"gpu_id": 0,
"num_envs": 64,
"iterations": 1000,
"rollout_steps": 1024,
"eval_freq": 200,
"save_freq": 200,
"use_wandb": true,
"enable_eval": true,
"wandb_project_name": "embodychain-cart_pole"
},
"policy": {
"name": "actor_only",
"actor": {
"type": "mlp",
"network_cfg": {
"hidden_sizes": [256, 256],
"activation": "relu"
}
}
},
"algorithm": {
"name": "grpo",
"cfg": {
"learning_rate": 0.0001,
"n_epochs": 10,
"batch_size": 8192,
"gamma": 0.99,
"clip_coef": 0.2,
"ent_coef": 0.001,
"kl_coef": 0.0,
"group_size": 4,
"eps": 1e-8,
"reset_every_rollout": true,
"max_grad_norm": 0.5,
"truncate_at_first_done": true
}
}
}
17 changes: 15 additions & 2 deletions docs/source/overview/rl/algorithm.md
@@ -1,6 +1,6 @@
# RL Algorithms

This module contains the core implementations of reinforcement learning algorithms, mainly including PPO (Proximal Policy Optimization).
This module contains the core implementations of reinforcement learning algorithms, including PPO (Proximal Policy Optimization) and GRPO (Group Relative Policy Optimization).
> **Reviewer comment (Contributor):** We can use a table to show the implemented algorithms.

## Main Classes and Functions

@@ -23,8 +23,20 @@ This module contains the core implementations of reinforcement learning algorithms
- Typical training flow: collect rollout → compute advantage/return → multi-epoch minibatch optimization.
- Supports advantage normalization, entropy regularization, value loss weighting, etc.

### GRPO
- Group Relative Policy Optimization: estimates advantages by comparing returns within a group of environments instead of training a Critic network, saving the memory of a value head.
- **Step-wise returns**: Computes per-step discounted returns \(R_t = r_t + \gamma R_{t+1}\) (reverse accumulation), avoiding the causality issues and discount bias that a single trajectory-level return introduces for dense-reward Embodied AI tasks.
- **Masked group normalization**: For variable-length sequences (e.g. `truncate_at_first_done`), the group mean/std at each step uses only the environments still alive, so zeros from already-finished environments do not drag down the mean.
- **Optional reference policy**: When `kl_coef > 0`, creates a frozen reference policy for KL regularization (e.g. VLA fine-tuning). When `kl_coef = 0`, no ref policy is created (recommended for from-scratch training like CartPole).
- Key methods:
- `_compute_step_returns_and_mask(rewards, dones)`: Step-wise discounted returns and valid-step mask.
- `_compute_step_group_advantages(step_returns, seq_mask)`: Per-step group normalization with masked mean/std.
- `collect_rollout`: Collect trajectories and compute step-wise advantages.
- `update`: Multi-epoch minibatch optimization with optional KL penalty.
- Supports both **Embodied AI** (dense reward, from-scratch training) and **VLA** (sparse reward, fine-tuning) modes via `kl_coef` configuration.
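The step-wise returns and masked group normalization described above can be sketched in plain Python as follows. The function names mirror the documented methods, but this is an illustration only; the library implementation operates on batched `[T, N]` tensors.

```python
# Plain-Python sketch of GRPO's step-wise returns and masked group
# normalization; an illustration, not the library implementation.

def compute_step_returns_and_mask(rewards, dones, gamma=0.99):
    """Reverse-accumulate R_t = r_t + gamma * R_{t+1} for one env;
    the mask marks steps up to and including the first done."""
    T = len(rewards)
    returns, mask = [0.0] * T, [0.0] * T
    alive = True
    for t in range(T):               # forward pass: build validity mask
        if alive:
            mask[t] = 1.0
        if dones[t]:
            alive = False            # truncate_at_first_done behavior
    acc = 0.0
    for t in reversed(range(T)):     # backward pass: discounted returns
        if mask[t]:
            acc = rewards[t] + gamma * acc
            returns[t] = acc
    return returns, mask

def compute_step_group_advantages(step_returns, masks, eps=1e-8):
    """Normalize each env's return at step t against the group's
    alive peers at t (masked mean/std), leaving dead envs at 0."""
    n_envs, T = len(step_returns), len(step_returns[0])
    adv = [[0.0] * T for _ in range(n_envs)]
    for t in range(T):
        alive = [i for i in range(n_envs) if masks[i][t]]
        if not alive:
            continue
        vals = [step_returns[i][t] for i in alive]
        mean = sum(vals) / len(vals)
        std = (sum((v - mean) ** 2 for v in vals) / len(vals)) ** 0.5
        for i in alive:
            adv[i][t] = (step_returns[i][t] - mean) / (std + eps)
    return adv
```

Note how an environment that finishes early simply drops out of the alive set: its zero-padded steps never enter the group mean or std.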

### Config Classes
- `AlgorithmCfg`, `PPOCfg`: Centralized management of learning rate, batch size, clip_coef, ent_coef, vf_coef, and other parameters.
- `AlgorithmCfg`, `PPOCfg`, `GRPOCfg`: Centralized management of learning rate, batch size, clip_coef, ent_coef, vf_coef, and other parameters.
- Supports automatic loading from JSON config files for batch experiments and parameter tuning.
- Can be extended via inheritance for multiple algorithms and tasks.

@@ -51,6 +63,7 @@ class PPO(BaseAlgorithm):
- It is recommended to manage all algorithm parameters via config classes and JSON config files for reproducibility and tuning.
- Supports multi-environment parallel collection to improve sampling efficiency.
- Custom algorithm classes can be implemented to extend new RL methods.
- **GRPO**: Use `actor_only` policy (no Critic). Set `kl_coef=0` for from-scratch training (CartPole, dense reward); set `kl_coef=0.02` for VLA/LLM fine-tuning.

## Extension Notes
- Users can inherit from `BaseAlgorithm` to implement custom algorithms and flexibly integrate them into the RL framework.
4 changes: 2 additions & 2 deletions docs/source/overview/rl/buffer.md
@@ -5,7 +5,7 @@ This module implements the data buffer for RL training, responsible for storing
## Main Classes and Structure

### RolloutBuffer
- Used for on-policy algorithms (such as PPO), efficiently stores observations, actions, rewards, dones, values, and logprobs for each step.
- Used for on-policy algorithms (such as PPO, GRPO), efficiently stores observations, actions, rewards, dones, values, and logprobs for each step.
- Supports multi-environment parallelism (shape: [T, N, ...]), all data allocated on GPU.
- Structure fields:
- `obs`: Observation tensor, float32, shape [T, N, obs_dim]
@@ -38,7 +38,7 @@ for batch in buffer.iterate_minibatches(batch_size):
- All data is allocated on GPU to avoid frequent CPU-GPU copying.
- The extras field can be flexibly extended to meet different algorithm needs (e.g., GAE, TD-lambda, distributional advantages).
- The iterator automatically shuffles to improve training stability.
- Compatible with various RL algorithms (PPO, A2C, SAC, etc.), custom fields and sampling logic supported.
- Compatible with various RL algorithms (PPO, GRPO, A2C, SAC, etc.), custom fields and sampling logic supported.

## Code Example
```python
29 changes: 28 additions & 1 deletion docs/source/overview/rl/config.md
@@ -13,7 +13,7 @@ This module defines configuration classes for RL algorithms, centralizing the management
- `gamma`: Discount factor.
- `gae_lambda`: GAE advantage estimation parameter.
- `max_grad_norm`: Gradient clipping threshold.
- Supports inheritance and extension (e.g., PPOCfg adds clip_coef, ent_coef, vf_coef).
- Supports inheritance and extension (e.g., PPOCfg adds clip_coef, ent_coef, vf_coef; GRPOCfg adds group_size, kl_coef, truncate_at_first_done).
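This inheritance pattern can be sketched with dataclasses as below. Field names come from the parameters listed in this document; the default values and the exact base-class layout are illustrative assumptions, not the framework's actual definitions.

```python
from dataclasses import dataclass

# Illustrative sketch of the config inheritance described above;
# defaults and base-class layout are assumptions, not the real classes.

@dataclass
class AlgorithmCfg:
    learning_rate: float = 3e-4
    n_epochs: int = 10
    batch_size: int = 8192
    gamma: float = 0.99
    gae_lambda: float = 0.95
    max_grad_norm: float = 0.5

@dataclass
class GRPOCfg(AlgorithmCfg):
    # GRPO-specific extensions on top of the shared base fields
    clip_coef: float = 0.2
    ent_coef: float = 0.001
    kl_coef: float = 0.0
    group_size: int = 4
    truncate_at_first_done: bool = True
```

Constructing `GRPOCfg(kl_coef=0.02)` then overrides only the KL weight while inheriting the shared defaults, which is what makes JSON-driven parameter injection straightforward.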

### Automatic Loading
- Supports automatic parsing of JSON config files; the main training script injects parameters automatically.
@@ -43,6 +43,33 @@ Or via config file:
}
```

GRPO example (for Embodied AI / from-scratch training):

```json
{
"algorithm": {
"name": "grpo",
"cfg": {
"learning_rate": 0.0001,
"n_epochs": 10,
"batch_size": 8192,
"gamma": 0.99,
"clip_coef": 0.2,
"ent_coef": 0.001,
"kl_coef": 0,
"group_size": 4,
"eps": 1e-8,
"reset_every_rollout": true,
"max_grad_norm": 0.5,
"truncate_at_first_done": true
}
}
}
```

- **kl_coef**: Set to `0` for from-scratch training (CartPole, dense reward); use `0.02` for VLA/LLM fine-tuning.
- **group_size**: Number of envs per group for within-group return normalization (must divide `num_envs`).
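The divisibility constraint on `group_size` can be checked when loading such a config. The helper below is hypothetical (not part of the framework); the key paths follow the JSON layout shown above.

```python
import json

# Hypothetical helper: verify group_size divides num_envs before
# training starts; key paths follow the JSON layout shown above.

def validate_grpo_cfg(cfg_text, num_envs):
    cfg = json.loads(cfg_text)["algorithm"]["cfg"]
    group_size = cfg["group_size"]
    if num_envs % group_size != 0:
        raise ValueError(
            f"group_size={group_size} must divide num_envs={num_envs}")
    return num_envs // group_size  # number of groups
```

With the example config above (`group_size=4`) and the CartPole trainer's `num_envs=64`, this yields 16 groups of 4 environments each.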

## Extension and Customization
- Custom algorithm parameter classes are supported for multi-algorithm and multi-task experiments.
- Config classes are seamlessly integrated with the main training script for automated experiments and reproducibility.
5 changes: 4 additions & 1 deletion docs/source/overview/rl/models.md
@@ -13,7 +13,10 @@ This module contains RL policy networks and related model implementations, supporting
- Supports GPU deployment and distributed training.

### ActorCritic
- Typical actor-critic policy, includes actor (action distribution) and critic (value function).
- Typical actor-critic policy, includes actor (action distribution) and critic (value function). Used with PPO.

### ActorOnly
- Actor-only policy without Critic. Used with GRPO (Group Relative Policy Optimization), which estimates advantages via group-level return comparison instead of a value function.
- Supports Gaussian action distributions, learnable log_std, suitable for continuous action spaces.
- Key methods:
- `get_action`: Actor network outputs the distribution mean, samples an action, and returns its log_prob (plus the critic value when a critic is present).
34 changes: 30 additions & 4 deletions docs/source/tutorial/rl.rst
@@ -136,10 +136,10 @@ Algorithm Configuration

The ``algorithm`` section defines the RL algorithm:

- **name**: Algorithm name (e.g., "ppo")
- **name**: Algorithm name (e.g., "ppo", "grpo")
- **cfg**: Algorithm-specific hyperparameters

Example:
PPO example:

.. code-block:: json

@@ -158,6 +158,30 @@ Example:
}
}

GRPO example (for Embodied AI / from-scratch training, e.g. CartPole):

.. code-block:: json

"algorithm": {
"name": "grpo",
"cfg": {
"learning_rate": 0.0001,
"n_epochs": 10,
"batch_size": 8192,
"gamma": 0.99,
"clip_coef": 0.2,
"ent_coef": 0.001,
"kl_coef": 0,
"group_size": 4,
"eps": 1e-8,
"reset_every_rollout": true,
"max_grad_norm": 0.5,
"truncate_at_first_done": true
}
}

For GRPO: use ``actor_only`` policy. Set ``kl_coef=0`` for from-scratch training; ``kl_coef=0.02`` for VLA/LLM fine-tuning.

Training Script
~~~~~~~~~~~~~~~

@@ -207,7 +231,7 @@ Training Process
The training process follows this sequence:

1. **Rollout Phase**: Algorithm collects trajectories by interacting with the environment (via ``collect_rollout``). During this phase, the trainer performs dense per-step logging of rewards and metrics from environment info.
2. **GAE Computation**: Algorithm computes advantages and returns using Generalized Advantage Estimation (internal to algorithm, stored in buffer extras)
2. **Advantage/Return Computation**: Algorithm computes advantages and returns (e.g. GAE for PPO, step-wise group normalization for GRPO; stored in buffer extras)
3. **Update Phase**: Algorithm updates the policy using collected data (e.g., PPO)
4. **Logging**: Trainer logs training losses and aggregated metrics to TensorBoard and Weights & Biases
5. **Evaluation** (periodic): Trainer evaluates the current policy
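The five-phase sequence above can be sketched as a minimal loop. All components here are stand-ins passed as callables, not the framework's actual Trainer or Algorithm classes.

```python
# Minimal skeleton of the five-phase training sequence; all components
# are stand-in callables, not the framework's actual classes.

def train(iterations, collect_rollout, compute_advantages, update,
          evaluate, eval_freq):
    logs = []
    for it in range(1, iterations + 1):
        batch = collect_rollout()          # 1. rollout phase
        batch = compute_advantages(batch)  # 2. GAE / group normalization
        losses = update(batch)             # 3. update phase
        logs.append(losses)                # 4. logging (simplified)
        if it % eval_freq == 0:
            evaluate()                     # 5. periodic evaluation
    return logs
```

In the real trainer the advantage computation is internal to the algorithm (stored in buffer extras), but the phase ordering is the same.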
@@ -248,7 +272,8 @@ All policies must inherit from the ``Policy`` abstract base class:
Available Policies
------------------

- **ActorCritic**: MLP-based Gaussian policy with learnable log_std. Requires external ``actor`` and ``critic`` modules to be provided (defined in JSON config).
- **ActorCritic**: MLP-based Gaussian policy with learnable log_std. Requires external ``actor`` and ``critic`` modules to be provided (defined in JSON config). Used with PPO.
- **ActorOnly**: Actor-only policy without Critic. Used with GRPO (group-relative advantage estimation).
- **VLAPlaceholderPolicy**: Placeholder for Vision-Language-Action policies

Algorithms
@@ -258,6 +283,7 @@ Available Algorithms
--------------------

- **PPO**: Proximal Policy Optimization with GAE
- **GRPO**: Group Relative Policy Optimization (no Critic, step-wise returns, masked group normalization). Use ``actor_only`` policy. Set ``kl_coef=0`` for from-scratch training (CartPole, dense reward); ``kl_coef=0.02`` for VLA/LLM fine-tuning.

Adding a New Algorithm
----------------------
4 changes: 4 additions & 0 deletions embodichain/agents/rl/algo/__init__.py
@@ -21,10 +21,12 @@

from .base import BaseAlgorithm
from .ppo import PPOCfg, PPO
from .grpo import GRPOCfg, GRPO

# name -> (CfgClass, AlgoClass)
_ALGO_REGISTRY: Dict[str, Tuple[Type[Any], Type[Any]]] = {
"ppo": (PPOCfg, PPO),
"grpo": (GRPOCfg, GRPO),
}


@@ -47,6 +49,8 @@ def build_algo(name: str, cfg_kwargs: Dict[str, float], policy, device: torch.device
"BaseAlgorithm",
"PPOCfg",
"PPO",
"GRPOCfg",
"GRPO",
"get_registered_algo_names",
"build_algo",
]
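The registry pattern added in this diff can be sketched with stand-in classes as below. The real `build_algo` also receives the policy and a torch device, which are omitted here for brevity.

```python
from dataclasses import dataclass

# Sketch of the name -> (CfgClass, AlgoClass) registry pattern from
# this diff, with stand-in classes instead of the real PPO/GRPO.

@dataclass
class PPOCfg:
    learning_rate: float = 3e-4

@dataclass
class GRPOCfg:
    learning_rate: float = 1e-4
    group_size: int = 4

class PPO:
    def __init__(self, cfg):
        self.cfg = cfg

class GRPO:
    def __init__(self, cfg):
        self.cfg = cfg

_ALGO_REGISTRY = {
    "ppo": (PPOCfg, PPO),
    "grpo": (GRPOCfg, GRPO),
}

def build_algo(name, cfg_kwargs):
    # Look up the config/algorithm pair and instantiate both.
    if name not in _ALGO_REGISTRY:
        raise KeyError(f"unknown algorithm: {name}")
    cfg_cls, algo_cls = _ALGO_REGISTRY[name]
    return algo_cls(cfg_cls(**cfg_kwargs))
```

Registering a new algorithm is then a one-line addition to `_ALGO_REGISTRY`, which is what keeps the `"name"` field in the JSON config decoupled from the concrete classes.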