Add stochastic mixer for supernet training #373
Merged
Changes from all commits (36 commits, all authored by tscholak):
43729b1 Add stochastic mixer for supernet training
8b1eb08 Fix stochastic mixer test failures
8ada30b Fix stochastic mixer checkpoint conversion
cd1dbf8 Handle lossy HF conversions for stochastic mixer
d0fd648 Merge remote-tracking branch 'origin/main' into stochastic-mixer
d693f74 Clean up extra blank line in huggingface.py
6962de9 Apply pre-commit formatting
a96c0cb Refactor stochastic mixer: set main_mixer_name in validation, preproc…
735ee3f wip
aed779c resolve merge conflicts
982d409 Implement full stochastic mixer support in Apriel HuggingFace format
0d8ab4d Add Apriel2 checkpoint format and fix weight tying
bcd93b2 Optimize Apriel2: compute position embeddings and masks per unique block
ebe75c4 Add HuggingFace generation and caching improvements to Apriel2
ffd55e5 Add Apriel2DynamicCache for hybrid attention/SSM layer support
fe259c3 Add Mamba incremental generation support to Apriel2
708917d Add GatedDeltaNet support via Qwen3NextGatedDeltaNet wrapper
77ceae2 Standardize naming: recurrent_states and Apriel2 prefixes
ec95ccc Remove debug print statements and irrelevant changes
571fede Remove stochastic mixer support from apriel conversion
8e7c154 Remove trivial formatting change from apriel_hybrid_ssm config
4d0a01b Remove test changes for lossy HF conversion
71cf778 Revert trivial setup.py formatting and restore .eval() calls in tests
75847d0 Rename SamplingStrategy to StochasticMixerSamplingStrategy
eacdf61 Use normalize_probabilities for sampling weights validation
192e985 Remove tools/supernet_beam_search.py
2fe9596 Fix stochastic mixer sampling to be consistent across all ranks
acb4751 Add Apriel2Cache with JetNemotron pattern and HF Cache compliance
a1d5f07 Add pytest-style test structure for Apriel2 with cache bug fixes
e044282 Add comprehensive Apriel2 modeling tests with cache verification
e830cc5 Add comprehensive mixer testing: all 4 types + switching behavior
f113d8d Refactor Mamba availability checking and add CPU fallbacks
a8ccc1b Apply code formatting to Apriel2 cache and config
a265e8c Merge main into stochastic-mixer
a1c94b7 Fix StochasticMixer bugs after merge
a847129 Add seed_shift to StochasticMixer for reproducibility
One hunk adds `/.devcontainer/` to the project-specific ignore patterns (the file name is not preserved in the capture, but the context matches a `.gitignore`):

```diff
@@ -28,6 +28,7 @@ venv.bak/
 # Project specifics
 /.idea/
 /.vscode/
+/.devcontainer/

 # Devenv
 .devenv*
```
The main addition is a new 173-line module implementing the `StochasticMixer` block:

```python
import logging
import typing

import torch

from fast_llm.engine.base_model.config import LossDef, ResourceUsageConfig
from fast_llm.engine.config_utils.tensor_dim import TensorDim
from fast_llm.engine.distributed.config import DistributedConfig
from fast_llm.engine.distributed.distributed import Distributed
from fast_llm.layers.common.peft.config import PeftConfig
from fast_llm.layers.decoder.block import BlockWithBias
from fast_llm.layers.decoder.config import (
    StochasticMixerConfig,
    StochasticMixerKwargs,
    StochasticMixerSamplingStrategy,
)
from fast_llm.tensor import TensorMeta

logger = logging.getLogger(__name__)


class StochasticMixer[ConfigType: StochasticMixerConfig](BlockWithBias[ConfigType]):
    """
    A mixer that stochastically samples from multiple mixer options during training.

    In training mode, each forward pass randomly selects one mixer according to
    the sampling strategy. In eval mode, uses the configured inference mixer.

    This is useful for supernet training where you want to train multiple
    architecture variants (e.g., attention vs. Mamba) with different data subsets.
    """

    _config: ConfigType

    def __init__(
        self,
        config: ConfigType,
        distributed_config: DistributedConfig,
        *,
        hidden_dim: TensorDim,
        lr_scale: float | None,
        peft: PeftConfig | None,
        return_bias: bool = True,
    ):
        super().__init__(
            config,
            distributed_config,
            hidden_dim=hidden_dim,
            lr_scale=lr_scale,
            peft=peft,
            return_bias=return_bias,
        )

        # Initialize all mixers
        self.mixers = torch.nn.ModuleDict(
            {
                name: mixer_config.get_layer(
                    distributed_config,
                    hidden_dim,
                    lr_scale=lr_scale,
                    peft=peft,
                    return_bias=return_bias,
                )
                for name, mixer_config in self._config.mixers.items()
            }
        )

        if self._config.sampling_strategy == StochasticMixerSamplingStrategy.uniform:
            self._sampling_probs = torch.ones(len(self.mixers), device="cpu") / len(self.mixers)
        elif self._config.sampling_strategy == StochasticMixerSamplingStrategy.weighted:
            if self._config.sampling_weights is None:
                raise ValueError("sampling_weights must be provided when using weighted sampling strategy")
            self._sampling_probs = torch.tensor(
                [self._config.sampling_weights[name] for name in self.mixers.keys()],
                dtype=torch.float32,
                device="cpu",
            )
        else:
            raise NotImplementedError(f"Sampling strategy {self._config.sampling_strategy} not implemented")

        logger.info(
            f"Initialized StochasticMixer with {len(self.mixers)} mixers: "
            f"{', '.join(f'{name}={type(mixer).__name__}' for name, mixer in self.mixers.items())} "
            f"(main={self._config.main_mixer_name})"
        )

        # Mark all mixer parameters with allow_no_grad since only one mixer
        # is active per forward pass during training. Even though all mixers
        # will eventually be trained, on any single forward pass, the non-selected
        # mixers won't receive gradients.
        for mixer in self.mixers.values():
            for param in mixer.parameters(recurse=True):
                if hasattr(param, "allow_no_grad"):
                    param.allow_no_grad = True
```
Collaborator comment on the `allow_no_grad` block: "One thing to keep in mind: all mixers will still go through gradient reduction and the optimizer step, and the unused ones will get a zero gradient. Probably still ok, but it's suboptimal and could affect optimizer momenta. I don't think there is an easy fix, though."
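A minimal standalone sketch (not part of the PR) of the effect this comment describes: with Adam, a parameter that receives a zero gradient on a step can still move, because the first-moment estimate accumulated on earlier steps decays only gradually.

```python
import torch

# Hypothetical illustration: one scalar parameter trained with Adam.
p = torch.nn.Parameter(torch.tensor([1.0]))
opt = torch.optim.Adam([p], lr=0.1)

# Step 1: the parameter receives a real gradient (its mixer was selected).
p.grad = torch.tensor([0.5])
opt.step()

# Step 2: the mixer was not selected, so the gradient is zero, yet Adam's
# momentum (exp_avg) from step 1 is still nonzero, so p moves anyway.
before = p.detach().clone()
p.grad = torch.tensor([0.0])
opt.step()
print((p.detach() - before).abs().item() > 0)  # True: update despite zero grad
```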
The rest of the module:

```python
    def setup(self, distributed: Distributed) -> None:
        """Setup all mixers with the distributed context."""
        super().setup(distributed)
        for mixer in self.mixers.values():
            mixer.setup(distributed)

    def _sample_mixer_name(self, kwargs: dict[str, typing.Any]) -> str:
        if not self.training:
            return self._config.main_mixer_name

        generator = kwargs[StochasticMixerKwargs.generator]
        mixer_idx = torch.multinomial(self._sampling_probs, num_samples=1, generator=generator).item()
        return list(self.mixers.keys())[mixer_idx]

    def _forward(
        self,
        input_: torch.Tensor,
        kwargs: dict[str, typing.Any],
        losses: dict[str, typing.Any] | None = None,
        metrics: dict[str, typing.Any] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        mixer_name = self._sample_mixer_name(kwargs)

        if self._debug.enabled:
            logger.debug(f"StochasticMixer selecting mixer {mixer_name}: {type(self.mixers[mixer_name]).__name__}")

        return self.mixers[mixer_name]._forward(input_, kwargs, losses, metrics)

    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
        from fast_llm.engine.distributed.config import MAX_SEED
        from fast_llm.layers.block.config import BlockKwargs

        # Derive the sampling seed from the global seed and the iteration only,
        # so every rank samples the same mixer on the same step.
        iteration = kwargs[BlockKwargs.iteration]
        generator = torch.Generator(device="cpu")
        seed = (self._distributed_config.seed + self._config.seed_shift + iteration) % MAX_SEED
        generator.manual_seed(seed)
        kwargs[StochasticMixerKwargs.generator] = generator

        for mixer in self.mixers.values():
            mixer.preprocess(kwargs)

    def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
        """
        Return expected compute usage (weighted average of all mixers).

        This gives a more accurate estimate than just using one mixer,
        since during training we'll be using all of them according to
        their sampling probabilities.
        """
        usages = [mixer.get_compute_usage(input_, kwargs, config) for mixer in self.mixers.values()]

        # Weight by sampling probability and return the expected value
        expected_usage = sum(usage * prob.item() for usage, prob in zip(usages, self._sampling_probs))

        return int(expected_usage)

    def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
        """
        Merge loss definitions from all mixers with namespacing.

        Each mixer's losses are namespaced with the mixer name to avoid conflicts.
        This ensures we allocate space for any auxiliary losses that any
        of the mixers might need, even if multiple mixers have losses with the same name.
        """
        all_losses = []
        for mixer_name, mixer in self.mixers.items():
            mixer_losses = mixer.get_loss_definitions(count=count)
            # Namespace each loss with the mixer name to avoid conflicts
            for loss_def in mixer_losses:
                namespaced_loss = LossDef(
                    name=f"{mixer_name}/{loss_def.name}",
                    formatted_name=f"{mixer_name}/{loss_def.formatted_name}",
                    count=loss_def.count,
                    dtype=loss_def.dtype,
                )
                all_losses.append(namespaced_loss)

        return all_losses
```
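The per-iteration seeding in `preprocess` is what makes the random choice reproducible and identical across ranks (see commit 2fe9596): the seed depends only on the global seed, `seed_shift`, and the iteration, never on the rank. A small standalone sketch of that property, using made-up seeds and probabilities rather than the real config:

```python
import torch

def sample_sequence(seed: int, probs: torch.Tensor, steps: int) -> list[int]:
    # Mimics preprocess(): a fresh CPU generator per iteration, seeded from
    # values that are the same on every rank.
    draws = []
    for iteration in range(steps):
        generator = torch.Generator(device="cpu")
        generator.manual_seed(seed + iteration)
        draws.append(torch.multinomial(probs, num_samples=1, generator=generator).item())
    return draws

probs = torch.tensor([0.5, 0.3, 0.2])  # hypothetical sampling weights
# Two "ranks" sharing the same global seed make identical choices each step.
assert sample_sequence(1234, probs, steps=8) == sample_sequence(1234, probs, steps=8)
```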
Review discussion:

"We could just enforce this and make `main_mixer_name` a cached property instead?"

"I like it the way it is."
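A rough sketch of what the reviewer's suggestion might look like; the config class below and the rule for choosing the default mixer are assumptions for illustration, not code from the PR:

```python
import functools


class StochasticMixerConfigSketch:
    """Hypothetical config fragment, not the PR's actual StochasticMixerConfig."""

    def __init__(self, mixers: dict, main_mixer_name: str | None = None):
        self.mixers = mixers
        self._main_mixer_name = main_mixer_name

    @functools.cached_property
    def main_mixer_name(self) -> str:
        # Computed (and cached) on first access instead of being set during
        # validation; falls back to the first configured mixer.
        return self._main_mixer_name or next(iter(self.mixers))
```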