From 1806b82e46494846a294919eed10e1db92bb001f Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 29 Apr 2026 15:02:14 -0500 Subject: [PATCH 1/2] Adding architecture Adapter creation guide, add split QKV example to quantized LLaMA demo --- demos/LLaMA2_GPU_Quantized.ipynb | 179 +++++------ docs/source/_static/adapter-template.py | 166 ++++++++++ .../adapter-creation-guide.md | 276 ++++++++++++++++ .../adapter-specification.md | 300 ++++++++++++++++++ .../hf-model-analysis-guide.md | 168 ++++++++++ docs/source/content/contributing.md | 21 ++ 6 files changed, 1009 insertions(+), 101 deletions(-) create mode 100644 docs/source/_static/adapter-template.py create mode 100644 docs/source/content/adapter_development/adapter-creation-guide.md create mode 100644 docs/source/content/adapter_development/adapter-specification.md create mode 100644 docs/source/content/adapter_development/hf-model-analysis-guide.md diff --git a/demos/LLaMA2_GPU_Quantized.ipynb b/demos/LLaMA2_GPU_Quantized.ipynb index 4722428a9..37d378cc8 100644 --- a/demos/LLaMA2_GPU_Quantized.ipynb +++ b/demos/LLaMA2_GPU_Quantized.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -33,9 +33,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running as a Jupyter notebook - intended for development only!\n", - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" + "Running as a Jupyter notebook - intended for development only!\n" ] } ], @@ -74,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -105,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "P8zS3MPkCUsR" }, @@ -186,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": { "id": "RdJ0AuW_CUsS" }, @@ -253,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -389,35 +387,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02974f818bc54305b535861303ca208e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/843 [00:00\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -747,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -794,6 +715,62 @@ "print(f\"Original Loss: {original_loss.item():.3f}\")\n", "print(f\"Ablated Loss: {ablated_loss.item():.3f}\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Per-head Q/K/V hooks via `use_split_qkv_input`\n", + "\n", + "By default, attention hooks like `hook_q`, `hook_k`, `hook_v` give the per-head **outputs** of the QKV projections. To intervene on the **inputs** to those projections at per-head granularity, set `use_split_qkv_input=True`. This unlocks `hook_q_input`, `hook_k_input`, and `hook_v_input` with shape `[batch, pos, n_heads, d_model]`.\n", + "\n", + "On legacy `HookedTransformer`, this combination broke when the model was loaded in 4-bit ([issue #737](https://github.com/TransformerLensOrg/TransformerLens/issues/737)) because the `bnb.matmul_4bit` reshape didn't account for the per-head input shape. `TransformerBridge` delegates 4-bit attention math to HuggingFace, so the combination works cleanly here." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of the q_input tensor: torch.Size([1, 32, 32, 2048])\n", + "Original Loss: 2.951\n", + "Q-input ablated Loss: 2.939\n" + ] + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "model.set_use_split_qkv_input(True)\n", + "\n", + "layer_to_ablate = 0\n", + "head_index_to_ablate = 4\n", + "\n", + "def q_input_ablation_hook(\n", + " q_input: Float[torch.Tensor, \"batch pos head_index d_model\"],\n", + " hook: HookPoint,\n", + ") -> Float[torch.Tensor, \"batch pos head_index d_model\"]:\n", + " print(f\"Shape of the q_input tensor: {q_input.shape}\")\n", + " q_input[:, :, head_index_to_ablate, :] = 0.0\n", + " return q_input\n", + "\n", + "original_loss = model(llama_tokens, return_type=\"loss\")\n", + "ablated_loss = model.run_with_hooks(\n", + " llama_tokens,\n", + " return_type=\"loss\",\n", + " fwd_hooks=[(\n", + " utils.get_act_name(\"q_input\", layer_to_ablate),\n", + " q_input_ablation_hook,\n", + " )],\n", + ")\n", + "print(f\"Original Loss: {original_loss.item():.3f}\")\n", + "print(f\"Q-input ablated Loss: {ablated_loss.item():.3f}\")\n", + "\n", + "model.set_use_split_qkv_input(False)" + ] } ], "metadata": { diff --git a/docs/source/_static/adapter-template.py b/docs/source/_static/adapter-template.py new file mode 100644 index 000000000..54accd526 --- /dev/null +++ b/docs/source/_static/adapter-template.py @@ -0,0 +1,166 @@ +""" architecture adapter. + +TODO: Replace with the actual model name throughout this file. +""" + +from typing import Any + +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + EmbeddingBridge, + GatedMLPBridge, + LinearBridge, + PositionEmbeddingsAttentionBridge, + RMSNormalizationBridge, + RotaryEmbeddingBridge, + UnembeddingBridge, +) + + +class ModelNameArchitectureAdapter(ArchitectureAdapter): + """Architecture adapter for models. + + TODO: Document which parameters are optional (missing biases, etc.) + + Optional Parameters (may not exist in state_dict): + ------------------------------------------------- + TODO: List parameters that may not exist. 
Example for models without biases: + + - blocks.{i}.attn.b_Q - No bias on query projection + - blocks.{i}.attn.b_K - No bias on key projection + - blocks.{i}.attn.b_V - No bias on value projection + - blocks.{i}.attn.b_O - No bias on output projection + - blocks.{i}.mlp.b_in - No bias on MLP input + - blocks.{i}.mlp.b_gate - No bias on MLP gate projection + - blocks.{i}.mlp.b_out - No bias on MLP output + - blocks.{i}.ln1.b - RMSNorm has no bias + - blocks.{i}.ln2.b - RMSNorm has no bias + - ln_final.b - RMSNorm has no bias + """ + + def __init__(self, cfg: Any) -> None: + """Initialize the architecture adapter.""" + super().__init__(cfg) + + # ===================================================================== + # 1. CONFIG ATTRIBUTES + # Set these based on the HuggingFace model's architecture. + # ===================================================================== + + # TODO: Set normalization type + # "RMS" for RMSNorm (Llama, Qwen, Gemma, etc.) + # "LN" for LayerNorm (GPT-2, GPT-J, etc.) + self.cfg.normalization_type = "RMS" + + # TODO: Set positional embedding type + # "rotary" for RoPE (Llama, Qwen, Mistral, etc.) + # "standard" for learned positional embeddings (GPT-2) + self.cfg.positional_embedding_type = "rotary" + + # TODO: Set these flags + self.cfg.final_rms = True # True if final layer norm is RMSNorm + self.cfg.gated_mlp = True # True if MLP has gate projection (SwiGLU) + self.cfg.attn_only = False # True only for attention-only models (rare) + self.cfg.uses_rms_norm = True # Should match normalization_type + + # TODO: Set the epsilon attribute name used by this model's normalization + # Check the HF model's norm layer to find the correct attribute name + self.cfg.eps_attr = "variance_epsilon" # or "layer_norm_eps", "rms_norm_eps", etc. + + # TODO: Handle GQA if applicable + # If the model uses Grouped Query Attention (n_key_value_heads < n_heads): + if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads + + # ===================================================================== + # 2. WEIGHT PROCESSING CONVERSIONS + # Defines how to reshape weights from HF format to TL format. + # For most models with separate Q/K/V/O, use the built-in helper. + # ===================================================================== + + self.weight_processing_conversions = { + **self._qkvo_weight_conversions(), + # TODO: Add any model-specific weight conversions here + } + + # ===================================================================== + # 3. COMPONENT MAPPING + # Maps TransformerLens canonical names to HuggingFace module paths. + # The `name=` parameter is the HF path relative to the model root + # (for top-level) or relative to the block (for block submodules). + # ===================================================================== + + # TODO: Replace all HF paths (name="...") with actual paths from the model. + # Inspect the HF model's named_modules() or config to find the correct paths. 
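        # A quick way to list those paths (a scratch inspection sketch, not part of
        # the adapter itself; the model name below is a placeholder):
        #
        #     from transformers import AutoModelForCausalLM
        #     hf_model = AutoModelForCausalLM.from_pretrained("<model-name>")
        #     for module_name, module in hf_model.named_modules():
        #         print(module_name, type(module).__name__)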
+ self.component_mapping = { + # Token embedding + "embed": EmbeddingBridge(name="model.embed_tokens"), + + # Rotary position embeddings (remove if model uses standard pos embeddings) + "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb"), + + # Transformer blocks + "blocks": BlockBridge( + name="model.layers", # TODO: HF path to the layer list + submodules={ + # Pre-attention layer norm + "ln1": RMSNormalizationBridge( + name="input_layernorm", # TODO: HF name within block + config=self.cfg, + ), + # Post-attention layer norm + "ln2": RMSNormalizationBridge( + name="post_attention_layernorm", # TODO: HF name within block + config=self.cfg, + ), + # Self-attention + "attn": PositionEmbeddingsAttentionBridge( + name="self_attn", # TODO: HF name within block + config=self.cfg, + submodules={ + "q": LinearBridge(name="q_proj"), # TODO: HF projection names + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + }, + requires_attention_mask=True, + requires_position_embeddings=True, + ), + # MLP (gated) + "mlp": GatedMLPBridge( + name="mlp", # TODO: HF name within block + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate_proj"), # TODO: HF projection names + "in": LinearBridge(name="up_proj"), + "out": LinearBridge(name="down_proj"), + }, + ), + }, + ), + + # Final layer norm + "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), + + # Output head (unembedding) + "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), + } + + def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: + """Set up model-specific references for component testing. + + TODO: Required for RoPE models. Remove if model uses standard positional embeddings. + """ + # Get rotary embedding instance from the HF model + rotary_emb = hf_model.model.rotary_emb # TODO: Adjust path if different + + # Set rotary_emb on actual bridge instances + if bridge_model is not None and hasattr(bridge_model, "blocks"): + for block in bridge_model.blocks: + if hasattr(block, "attn"): + block.attn.set_rotary_emb(rotary_emb) + + # Set on template for get_generalized_component() calls + attn_bridge = self.get_generalized_component("blocks.0.attn") + attn_bridge.set_rotary_emb(rotary_emb) diff --git a/docs/source/content/adapter_development/adapter-creation-guide.md b/docs/source/content/adapter_development/adapter-creation-guide.md new file mode 100644 index 000000000..08fff9471 --- /dev/null +++ b/docs/source/content/adapter_development/adapter-creation-guide.md @@ -0,0 +1,276 @@ +# Architecture Adapter Creation Guide + +A walkthrough for developers writing a new Architecture Adapter for the TransformerLens `TransformerBridge` system. This guide distills the process of developing an adapter into a set of steps that can be followed start-to-finish. + +If you just want the API reference, jump to [adapter-specification.md](adapter-specification.md). If you have a specific HF model in hand and want a config-extraction cookbook, see [hf-model-analysis-guide.md](hf-model-analysis-guide.md). This document ties those together with workflow and review practice. + +## What an adapter is + +An **Architecture Adapter** is a Python class that extends `ArchitectureAdapter` and tells `TransformerBridge` three things about a HuggingFace model: + +1. **Config attributes** — set on `self.cfg` in `__init__` (normalization type, positional embedding type, GQA params, etc.) +2. 
**Component mapping** — `self.component_mapping`, a dict mapping TransformerLens canonical names (`embed`, `blocks`, `attn.q`, …) to `GeneralizedComponent` Bridge instances pointed at HF module paths. +3. **Weight processing conversions** — `self.weight_processing_conversions`, a dict of tensor-reshape rules that translate HF weight layouts to TL layouts during loading. + +Once registered, users can `boot_transformers("")` and get a fully hooked TransformerLens model with weights loaded from HF. + +## Prerequisites + +Before starting, make sure you can: + +- Read PyTorch model code and trace a forward pass +- Run a HF model locally with `transformers` +- Use `model.named_modules()` and `model.state_dict()` to inspect structure +- Identify whether a model uses RoPE vs learned positional embeddings, RMSNorm vs LayerNorm, gated vs standard MLP, separate vs joint QKV, MHA vs GQA vs MQA. ([hf-model-analysis-guide.md](hf-model-analysis-guide.md) has a decision tree.) + +You do **not** need to memorize every Bridge component — the existing adapters in `transformer_lens/model_bridge/supported_architectures/` are your reference library. + +## Analyze the architecture + +### Read the HF source + +Open the two files that define the architecture in `transformers`: + +- `models//modeling_.py` — the model code +- `models//configuration_.py` — the config class + +Read every `__init__` and every `forward`. You are looking for: + +- Module hierarchy: what's nested in what, named how +- Forward pass order: norm before/after attention? residual where? +- Bias presence on each linear layer +- Normalization type and the *exact* attribute name of its epsilon (`variance_epsilon`, `rms_norm_eps`, `layer_norm_eps`, `eps`, …) +- Attention type (MHA / GQA / MQA) and whether QKV are separate or joint +- MLP type (gated / standard) and projection names +- Anything that looks weird (special scaling, conditional padding, dtype upcasts in softmax, …) + +Also extract the standard config-to-TL field mapping (see [hf-model-analysis-guide.md](hf-model-analysis-guide.md) for the table). + +### Find the closest reference adapter + +Almost every new model is a variant of an existing pattern. Pick the nearest match from `supported_architectures/`: + +| If your model is like… | Start from… | +|---------------------------------------|--------------------------------------| +| Llama, Mistral, Qwen2, Gemma, OLMo | `llama.py` | +| Qwen2/Qwen3 (gated config, MLPBridge) | `qwen2.py` | +| GPT-2, GPT-J, GPT-Neo | `gpt2.py` | +| BLOOM, Falcon | `bloom.py` or `falcon.py` | +| T5 / encoder-decoder | `t5.py` | +| MoE | `mixtral.py` or `granite_moe.py` | +| Multimodal (vision+text) | `llava.py` or `gemma3_multimodal.py` | + +### Write down what you found + +Before writing any adapter code, take notes on the architecture. This is for your own use, it does not need to be formally documented. It will help inform your decisions going forward. 
+ +At minimum, capture: + +- **Source files** — exact paths in `transformers` +- **Module hierarchy** — every HF module path you'll need, with line numbers in the source where it's defined +- **Config fields** — the HF names and their TL equivalents +- **Architectural properties** — normalization, position embeddings, attention type, MLP type, biases +- **Forward pass flow** — order of operations in the block, attention, and MLP +- **Reference adapter** — closest existing adapter, and a list of every way your target differs from it +- **Representative models** — small variants (≤7B parameters) you'll use for verification + +## Implement the adapter + +### File layout + +- **Adapter file:** `transformer_lens/model_bridge/supported_architectures/.py` +- **Class name:** `ArchitectureAdapter` (e.g. `LlamaArchitectureAdapter`) +- **Module name:** lowercase + underscores (`llama.py`, `qwen2.py`, `granite_moe.py`) + +Start from [adapter-template.py](../../_static/adapter-template.py). It's a Llama-pattern skeleton with TODOs at every decision point. + +A reasonable order for filling it in: + +1. Config attributes (drives everything else) +2. Weight processing conversions +3. Component mapping +4. Optional overrides (only the ones you actually need) +5. Registration + +### Config attributes + +Set these on `self.cfg` in `__init__` *before* building the component mapping (the bridges read from `self.cfg`): + +| Attribute | Type | Purpose | +|----------------------------|--------|-----------------------------------------------| +| `normalization_type` | `str` | `"RMS"` or `"LN"` | +| `positional_embedding_type`| `str` | `"rotary"` or `"standard"` | +| `final_rms` | `bool` | Final norm is RMSNorm | +| `gated_mlp` | `bool` | MLP has gate projection (SwiGLU) | +| `attn_only` | `bool` | Model has no MLP layers (rare) | +| `uses_rms_norm` | `bool` | Should match `normalization_type == "RMS"` | +| `eps_attr` | `str` | HF attribute name for norm epsilon | + +For GQA models, also forward `n_key_value_heads`: + +```python +if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads +``` + +### Component mapping + +For each TL canonical name, instantiate the right Bridge component and point its `name=` parameter at the HF module path (relative to the model root for top-level entries, relative to the block for block submodules). 
+ +A standard Llama-style mapping: + +```python +self.component_mapping = { + "embed": EmbeddingBridge(name="model.embed_tokens"), + "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb"), + "blocks": BlockBridge( + name="model.layers", + submodules={ + "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), + "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": PositionEmbeddingsAttentionBridge( + name="self_attn", + config=self.cfg, + submodules={ + "q": LinearBridge(name="q_proj"), + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + }, + requires_attention_mask=True, + requires_position_embeddings=True, + ), + "mlp": GatedMLPBridge( + name="mlp", + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate_proj"), + "in": LinearBridge(name="up_proj"), + "out": LinearBridge(name="down_proj"), + }, + ), + }, + ), + "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), +} +``` + +The full bridge component catalog (attention variants, MLP variants, specialized bridges for BLOOM/CLIP/Siglip/T5/MoE/etc.) is in [adapter-specification.md](adapter-specification.md) under "Available Bridge Components." + +### Weight processing conversions + +For models with separate Q/K/V/O projections, use the built-in helper: + +```python +self.weight_processing_conversions = { + **self._qkvo_weight_conversions(), +} +``` + +It generates the standard `(n h) m -> n m h` rearrangements with the right head/kv-head counts. + +For combined-QKV models (GPT-2 style), see `gpt2.py`'s `QKVSplitRearrangeConversion` for the pattern. For other oddball layouts, define custom `ParamProcessingConversion` or `RearrangeTensorConversion` instances. + +### Optional overrides + +Implement only the ones you need: + +- **`setup_component_testing(hf_model, bridge_model=None)`** — required for RoPE models, to wire the rotary embedding instance through to the attention bridges. Skip for models with standard positional embeddings. +- **`preprocess_weights(state_dict)`** — for arch-specific weight transforms before standard processing (e.g., Gemma scales embeddings by `sqrt(d_model)`). +- **`prepare_loading(model_name, model_kwargs)`** — patch HF model classes before `from_pretrained()`. +- **`prepare_model(hf_model)`** — post-load fixups before bridge creation. + +### Registration + +Two files to update: + +1. `transformer_lens/model_bridge/supported_architectures/__init__.py` — add the import and append to `__all__`. +2. `transformer_lens/factories/architecture_adapter_factory.py` — add to the import block and to `SUPPORTED_ARCHITECTURES`: + + ```python + "": , + ``` + +Forgetting registration is the most common silent failure — the adapter exists but `boot_transformers` can't find it. + +### Tests + +Write tests that exercise actual behavior: + +- Hook names resolve correctly +- Weight shapes match expectations after loading +- Forward pass produces sensible output for a tiny variant + +### New bridge components + +Don't add a new bridge unless the existing ones can't express your model. The bar is: the `forward()` must be fundamentally different from any existing bridge. 
If you do add one: + +- Place it in `transformer_lens/model_bridge/generalized_components/` +- Export it from the package `__init__` +- Write tests covering its forward pass and any state it carries + +## Verify the adapter + +The `verify_models` tool runs a real HF model side-by-side with your bridge and compares activations across four phases. Each phase produces a numeric score; the model passes if all phase scores meet their thresholds. + +### Pick models + +From your representative-models list, take the smallest variants (prefer ≤7B parameters), up to 5, sorted by HuggingFace download count. Verifying multiple sizes catches scaling bugs that single-model verification misses. + +### Run verification + +One model at a time, with float32 by default: + +```bash +uv run python -m transformer_lens.tools.model_registry.verify_models \ + --model \ + --max-memory \ + --device cpu \ + --dtype float32 \ + --no-ht-reference +``` + +If a model OOMs with float32, retry that single model with `--dtype bfloat16`. Set `--max-memory` to roughly 75-85% of your device memory, to ensure adequate space for running the benchmarks. + +### Read the status + +Each model gets a status: + +- **status=1** — passed, move to the next model +- **status=2** — skipped by `verify_models` (e.g., exceeded the memory pre-check). Note it and move on; not an adapter bug. +- **status=3** — phase score failure. Stop and fix. Read the `note` and the per-phase scores, find the root cause, fix the adapter, re-verify. + +### Lint + +After all chosen models pass: + +```bash +uv run mypy . +make check-format +``` + +Both must be clean. Don't paper over mypy errors with `# type: ignore` — fix the underlying type. If mypy is wrong about something, that's a real issue worth investigating, not silencing. + +## Before you open a PR + +`verify_models` will catch most numerical bugs, but a few things are worth a once-over by eye. + +**Sanity-check against the HF source.** Skim your adapter with the HF `modeling_.py` open alongside it. Module paths, config attribute names, and bias presence are the usual suspects — easy to get wrong from memory and easy to spot when you look directly. + +**Watch for the subtle stuff.** When the adapter reimplements a computation or defines weight conversions, the things that bite are operation order (split before or after the layernorm?), dtype upcasting in softmax, and conditional logic that only fires under certain conditions in HF (e.g., flash-attention paths). If something in your code looks like it "probably matches" HF, that's a good place to stop and check. + +**Don't reach for abstraction prematurely.** If you've added a base class or protocol with only one or two concrete uses, you're probably better off without it. The same goes for config knobs that don't have a current consumer. + +**Confirm the boring stuff is done.** Both registration sites (`__init__.py` and `architecture_adapter_factory.py`), `mypy` and format checks clean, tests doing real work rather than asserting mocks return their mock values. + +## Common pitfalls + +- **Wrong `eps_attr` name.** Models that look identical use different attribute names (`variance_epsilon`, `rms_norm_eps`, `eps`). Read the norm class. +- **Forgetting `n_key_value_heads`.** Without it, GQA models silently reshape weights as if they were MHA — verification fails with cryptic shape errors. +- **Missing registration.** Adapter exists but the factory can't find it. Update both `__init__.py` and `architecture_adapter_factory.py`. 
+- **Skipping `setup_component_testing` for RoPE.** Rotary embeddings need to be wired through to each attention bridge or component testing produces nonsense. +- **Reusing `model.norm` when the path is `model.final_layernorm`.** Module paths look similar across architectures but rarely match exactly — always verify against the actual HF source. +- **Tautological tests.** "Test that mock returns mock_value" is not a test. Tests should exercise real shapes, real forward passes, real hook resolution. +- **`# type: ignore` on mypy errors.** Find the root cause; the type error is usually telling you something real about the bridge config. +- **Coding before the architecture is understood.** The single biggest time-waster. Five pages of code based on a wrong assumption about module paths is worse than no code. diff --git a/docs/source/content/adapter_development/adapter-specification.md b/docs/source/content/adapter_development/adapter-specification.md new file mode 100644 index 000000000..17798b6b6 --- /dev/null +++ b/docs/source/content/adapter_development/adapter-specification.md @@ -0,0 +1,300 @@ +--- +orphan: true +--- + +# Architecture Adapter Specification + +This document is the primary reference for building Architecture Adapters for the TransformerLens TransformerBridge system. + +## What Is an Architecture Adapter? + +An Architecture Adapter is a Python class that extends `ArchitectureAdapter` (from `transformer_lens.model_bridge.architecture_adapter`). It maps between a HuggingFace model's internal structure and TransformerLens's canonical component names. Every adapter must define three things: + +1. **Config attributes** — set on `self.cfg` in `__init__` +2. **Component mapping** — `self.component_mapping` dict mapping TL names to Bridge instances +3. **Weight processing conversions** — `self.weight_processing_conversions` dict for tensor reshaping + +## File Location and Naming + +- **Adapter file:** `transformer_lens/model_bridge/supported_architectures/.py` +- **Class name:** `ArchitectureAdapter` (e.g., `LlamaArchitectureAdapter`) +- **Module name:** lowercase, underscores (e.g., `llama.py`, `qwen2.py`, `granite_moe.py`) + +## Registration Checklist + +After creating the adapter, register it in these files: + +1. **`transformer_lens/model_bridge/supported_architectures/__init__.py`** + - Add import: `from transformer_lens.model_bridge.supported_architectures. import ` + - Add to `__all__` list + +2. 
**`transformer_lens/factories/architecture_adapter_factory.py`** + - Add import (in the existing import block from `supported_architectures`) + - Add entry to `SUPPORTED_ARCHITECTURES` dict: `"": ` + +## Config Attributes + +Set these on `self.cfg` in `__init__` before building the component mapping: + +| Attribute | Type | Description | Examples | +|-----------|------|-------------|----------| +| `normalization_type` | `str` | `"RMS"` or `"LN"` | Llama="RMS", GPT2="LN" | +| `positional_embedding_type` | `str` | `"rotary"` or `"standard"` | Llama="rotary", GPT2="standard" | +| `final_rms` | `bool` | Whether final layer norm is RMS | Llama=True, GPT2=False | +| `gated_mlp` | `bool` | Whether MLP uses gate projection | Llama=True, GPT2=False | +| `attn_only` | `bool` | Whether model has no MLP layers | Usually False | +| `uses_rms_norm` | `bool` | Redundant with normalization_type but needed | Match normalization_type | +| `eps_attr` | `str` | Attribute name for norm epsilon | `"variance_epsilon"`, `"layer_norm_eps"` | + +### GQA (Grouped Query Attention) + +If the model uses GQA (n_key_value_heads < n_heads), set: +```python +if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads +``` + +## Component Mapping + +`self.component_mapping` is a `dict[str, GeneralizedComponent]` mapping TransformerLens canonical names to Bridge instances. The Bridge `name=` parameter is the HuggingFace module path. + +### Standard Mapping (Llama-style decoder-only) + +```python +self.component_mapping = { + "embed": EmbeddingBridge(name="model.embed_tokens"), + "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb"), + "blocks": BlockBridge( + name="model.layers", + submodules={ + "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), + "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": PositionEmbeddingsAttentionBridge( + name="self_attn", + config=self.cfg, + submodules={ + "q": LinearBridge(name="q_proj"), + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + }, + requires_attention_mask=True, + requires_position_embeddings=True, + ), + "mlp": GatedMLPBridge( + name="mlp", + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate_proj"), + "in": LinearBridge(name="up_proj"), + "out": LinearBridge(name="down_proj"), + }, + ), + }, + ), + "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), +} +``` + +### GPT2-style Mapping (standard positional embeddings, combined QKV) + +```python +self.component_mapping = { + "embed": EmbeddingBridge(name="transformer.wte"), + "pos_embed": PosEmbedBridge(name="transformer.wpe"), + "blocks": BlockBridge( + name="transformer.h", + config=self.cfg, + submodules={ + "ln1": NormalizationBridge(name="ln_1", config=self.cfg), + "attn": JointQKVAttentionBridge( + name="attn", + config=self.cfg, + submodules={ + "qkv": LinearBridge(name="c_attn"), + "o": LinearBridge(name="c_proj"), + }, + ), + "ln2": NormalizationBridge(name="ln_2", config=self.cfg), + "mlp": MLPBridge( + name="mlp", + submodules={ + "in": LinearBridge(name="c_fc"), + "out": LinearBridge(name="c_proj"), + }, + ), + }, + ), + "ln_final": NormalizationBridge(name="transformer.ln_f", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head"), +} +``` + +> **Note:** GPT2's `MLPBridge` and `UnembeddingBridge` do not pass `config=`. 
The `config` parameter is optional on these bridges — match the existing adapter's pattern. + +## Weight Processing Conversions + +`self.weight_processing_conversions` maps TransformerLens weight paths to `ParamProcessingConversion` instances that handle tensor reshaping during weight loading. + +### Standard QKVO Conversions (most models) + +For models with separate Q/K/V/O projections, use the built-in helper: + +```python +self.weight_processing_conversions = { + **self._qkvo_weight_conversions(), +} +``` + +This generates rearrangement rules for: +- `blocks.{i}.attn.q.weight` — `(n h) m -> n m h` with `n=n_heads` +- `blocks.{i}.attn.k.weight` — `(n h) m -> n m h` with `n=n_kv_heads` +- `blocks.{i}.attn.v.weight` — `(n h) m -> n m h` with `n=n_kv_heads` +- `blocks.{i}.attn.o.weight` — `m (n h) -> n h m` with `n=n_heads` + +### Custom Conversions + +For models with non-standard weight layouts (e.g., combined QKV), define custom `ParamProcessingConversion` or `RearrangeTensorConversion` instances. See `gpt2.py` for the `QKVSplitRearrangeConversion` example. + +## Available Bridge Components + +### Core Components + +| Component | Use When | +|-----------|----------| +| `EmbeddingBridge` | Token embeddings | +| `UnembeddingBridge` | Output head (lm_head) | +| `BlockBridge` | Transformer block container (always named "blocks") | +| `LinearBridge` | Any linear/projection layer | + +### Normalization + +| Component | Use When | +|-----------|----------| +| `NormalizationBridge` | LayerNorm | +| `RMSNormalizationBridge` | RMSNorm | + +### Attention + +| Component | Use When | +|-----------|----------| +| `AttentionBridge` | Basic attention (no positional embeddings passed) | +| `PositionEmbeddingsAttentionBridge` | Attention that receives position embeddings (RoPE models) | +| `JointQKVAttentionBridge` | Combined QKV single linear layer (GPT-2 style) | +| `JointQKVPositionEmbeddingsAttentionBridge` | Combined QKV with position embeddings | + +### MLP + +| Component | Use When | +|-----------|----------| +| `MLPBridge` | Standard 2-layer MLP (in/out) or with separate gate | +| `GatedMLPBridge` | Gated MLP with gate/up/down projections (SwiGLU) | +| `JointGateUpMLPBridge` | MLP where gate and up projections are fused | + +### Position Embeddings + +| Component | Use When | +|-----------|----------| +| `PosEmbedBridge` | Learned positional embeddings (GPT-2 style) | +| `RotaryEmbeddingBridge` | Rotary position embeddings (RoPE) | + +### Specialized + +| Component | Use When | +|-----------|----------| +| `MoEBridge` | Mixture of Experts routing | +| `SymbolicBridge` | Placeholder/container with no direct HF module | +| `Conv1DBridge` | 1D convolution layers | +| `T5BlockBridge` | T5-specific block structure | +| `CLIPVisionEncoderBridge` | CLIP vision encoder (multimodal) | +| `CLIPVisionEncoderLayerBridge` | Individual CLIP vision encoder layer | +| `SiglipVisionEncoderBridge` | Siglip vision encoder (multimodal) | +| `SiglipVisionEncoderLayerBridge` | Individual Siglip vision encoder layer | +| `VisionProjectionBridge` | Vision-to-text projection (multimodal) | + +### Architecture-Specific (Bloom/Falcon) + +These exist for architectures with non-standard internal structures. Discover them by reading the reference adapter. 
+ +| Component | Use When | +|-----------|----------| +| `BloomBlockBridge` | BLOOM transformer blocks | +| `BloomAttentionBridge` | BLOOM attention mechanism | +| `BloomMLPBridge` | BLOOM MLP | +| `AudioFeatureExtractorBridge` | Audio feature extraction (HuBERT) | +| `ConvPosEmbedBridge` | Convolutional positional embeddings (HuBERT) | + +## Optional Overrides + +### `setup_component_testing(hf_model, bridge_model=None)` + +Called after adapter creation. Use to set up model-specific references for component testing. Required for RoPE models to set rotary embedding references: + +```python +def setup_component_testing(self, hf_model, bridge_model=None): + rotary_emb = hf_model.model.rotary_emb + if bridge_model is not None and hasattr(bridge_model, "blocks"): + for block in bridge_model.blocks: + if hasattr(block, "attn"): + block.attn.set_rotary_emb(rotary_emb) + attn_bridge = self.get_generalized_component("blocks.0.attn") + attn_bridge.set_rotary_emb(rotary_emb) +``` + +### `preprocess_weights(state_dict)` + +Apply architecture-specific weight transformations before standard processing. Example: Gemma scales embeddings by `sqrt(d_model)`. + +### `prepare_loading(model_name, model_kwargs)` + +Called before `from_pretrained()`. Use to patch HF model classes. + +### `prepare_model(hf_model)` + +Called after model loading but before bridge creation. Use for post-load fixups. + +## Common Architecture Patterns + +### Pattern 1: Llama-like (most modern models) + +RoPE + RMSNorm + GatedMLP + separate Q/K/V/O. Uses `GatedMLPBridge`. Used by: Llama, Mistral, Gemma, OLMo, Granite, StableLM. + +**Qwen2 variant:** Nearly identical to Llama but uses `MLPBridge` instead of `GatedMLPBridge` (while still setting `gated_mlp = True` and having gate/in/out submodules). Used by: Qwen2, Qwen3. + +### Pattern 2: GPT2-like + +Standard positional embeddings + LayerNorm + standard MLP + combined QKV. Used by: GPT-2, GPT-J, GPT-Neo/NeoX. + +### Pattern 3: MoE (Mixture of Experts) + +Similar to Llama-like but with `MoEBridge` replacing the MLP. Used by: Mixtral, GraniteMoE, OLMoE. + +### Pattern 4: Multimodal + +Extends a text-only pattern with vision encoder and projection bridges. Used by: LLaVA, LLaVA-Next, Gemma3 Multimodal. + +## Imports Template + +```python +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + EmbeddingBridge, + GatedMLPBridge, # or MLPBridge for non-gated + LinearBridge, + PositionEmbeddingsAttentionBridge, # or JointQKVAttentionBridge + RMSNormalizationBridge, # or NormalizationBridge for LayerNorm + RotaryEmbeddingBridge, # only for RoPE models + UnembeddingBridge, +) +``` + +## Testing + +After creating an adapter, verify it by: + +1. Running the adapter-specific unit tests +2. Loading a small model variant with `boot_transformers(model_name)` +3. Verifying hook names resolve correctly +4. Checking that weight shapes match expectations diff --git a/docs/source/content/adapter_development/hf-model-analysis-guide.md b/docs/source/content/adapter_development/hf-model-analysis-guide.md new file mode 100644 index 000000000..4885adc98 --- /dev/null +++ b/docs/source/content/adapter_development/hf-model-analysis-guide.md @@ -0,0 +1,168 @@ +# HuggingFace Model Analysis Guide + +This guide explains how to analyze a HuggingFace model to extract the information needed to build a TransformerLens Architecture Adapter. 
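If you want the short version first, the sketch below pulls the key facts in one pass; the sections that follow explain each check in detail. The model name is a placeholder, and the attribute names assume a Llama-style config — adjust them for other architecture families.

```python
from transformers import AutoConfig, AutoModelForCausalLM

model_name = "<model-name-or-path>"  # placeholder

config = AutoConfig.from_pretrained(model_name)
print("d_model  =", config.hidden_size)
print("n_heads  =", config.num_attention_heads)
print("n_layers =", config.num_hidden_layers)
print("d_mlp    =", config.intermediate_size)
print("d_vocab  =", config.vocab_size)

# GQA check: fewer KV heads than attention heads
kv_heads = getattr(config, "num_key_value_heads", None)
print("GQA:", kv_heads is not None and kv_heads < config.num_attention_heads)

# Structural checks need the instantiated model
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
module_types = {type(m).__name__ for _, m in model.named_modules()}
module_names = [n for n, _ in model.named_modules()]
print("RMSNorm:  ", any("RMSNorm" in t for t in module_types))            # -> normalization_type="RMS"
print("RoPE:     ", any("Rotary" in t for t in module_types))             # -> positional_embedding_type="rotary"
print("Gated MLP:", any(n.endswith("gate_proj") for n in module_names))   # -> gated_mlp=True
```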
+ +## Read the model's config.json + +Every HF model has a `config.json` that contains architecture details. You can access it via: + +```python +from transformers import AutoConfig +config = AutoConfig.from_pretrained("model-name-or-path") +print(config) +``` + +Or via the HuggingFace API: +```bash +curl -s "https://huggingface.co/model-name/resolve/main/config.json" | python -m json.tool +``` + +### Key config fields to extract + +| HF Config Field | TL Config Field | Description | +|-----------------|-----------------|-------------| +| `hidden_size` | `d_model` | Model dimension | +| `num_attention_heads` | `n_heads` | Number of attention heads | +| `num_key_value_heads` | `n_key_value_heads` | KV heads (for GQA; if absent or equal to n_heads, not GQA) | +| `intermediate_size` | `d_mlp` | MLP intermediate dimension | +| `num_hidden_layers` | `n_layers` | Number of transformer blocks | +| `vocab_size` | `d_vocab` | Vocabulary size | +| `max_position_embeddings` | `n_ctx` | Maximum sequence length | +| `rms_norm_eps` | `eps` | Normalization epsilon | +| `model_type` | — | Architecture family (e.g., "llama", "gpt2", "mistral") | +| `architectures` | `architecture` | HF class name (e.g., `["LlamaForCausalLM"]`) | + +## Determine architecture characteristics + +### Normalization type + +Check the model code or config: +- **RMSNorm** → `normalization_type = "RMS"` — Look for `RMSNorm` in the model code, or `rms_norm_eps` in config +- **LayerNorm** → `normalization_type = "LN"` — Look for `LayerNorm`, or `layer_norm_eps` / `layer_norm_epsilon` in config + +Also identify the epsilon attribute name: +- `"variance_epsilon"` (Llama) +- `"rms_norm_eps"` (some models expose this directly) +- `"layer_norm_eps"` (GPT-2, BERT) +- `"eps"` (generic) + +### Positional embedding type + +- **Rotary (RoPE)** → `positional_embedding_type = "rotary"` — Most modern models (Llama, Mistral, Qwen, Gemma) +- **Learned/Standard** → `positional_embedding_type = "standard"` — GPT-2, OPT +- Check for `RotaryEmbedding` class in the model code + +### Attention type + +- **Multi-Head Attention (MHA)** — `n_key_value_heads == n_heads` or field absent +- **Grouped Query Attention (GQA)** — `n_key_value_heads < n_heads` (e.g., Llama 3, Mistral) +- **Multi-Query Attention (MQA)** — `n_key_value_heads == 1` (e.g., Falcon) + +### MLP type + +- **Gated MLP (SwiGLU)** → `gated_mlp = True` — Has gate/up/down projections (Llama, Qwen, Gemma) +- **Standard MLP** → `gated_mlp = False` — Has fc1/fc2 or c_fc/c_proj (GPT-2) + +### QKV layout + +- **Separate Q/K/V** — Most models: `q_proj`, `k_proj`, `v_proj` +- **Combined QKV** — GPT-2 style: single `c_attn` or `query_key_value` linear layer + +## Inspect module names + +To find the exact HuggingFace module paths for the component mapping: + +```python +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("model-name", torch_dtype="auto") + +# Print all named modules +for name, module in model.named_modules(): + print(f"{name}: {type(module).__name__}") +``` + +### What to look for + +Map these HF module paths to TL component mapping entries: + +| TL Name | Look for in HF | Common HF Paths | +|---------|----------------|-----------------| +| `embed` | Token embedding | `model.embed_tokens`, `transformer.wte` | +| `pos_embed` | Position embedding (if standard) | `transformer.wpe` | +| `rotary_emb` | Rotary embedding (if RoPE) | `model.rotary_emb`, `model.layers.0.self_attn.rotary_emb` | +| `blocks` | Layer list | `model.layers`, `transformer.h`, 
`model.decoder.layers` | +| `ln1` | Pre-attention norm | `input_layernorm`, `ln_1` | +| `ln2` | Post-attention norm | `post_attention_layernorm`, `ln_2` | +| `attn` | Self-attention module | `self_attn`, `attn` | +| `attn.q` | Query projection | `q_proj`, `query` | +| `attn.k` | Key projection | `k_proj`, `key` | +| `attn.v` | Value projection | `v_proj`, `value` | +| `attn.o` | Output projection | `o_proj`, `out_proj`, `dense`, `c_proj` | +| `attn.qkv` | Combined QKV (if used) | `c_attn`, `query_key_value` | +| `mlp` | MLP module | `mlp`, `feed_forward` | +| `mlp.gate` | Gate projection (if gated) | `gate_proj`, `w1` | +| `mlp.in` | Up/input projection | `up_proj`, `c_fc`, `fc1`, `w3` | +| `mlp.out` | Down/output projection | `down_proj`, `c_proj`, `fc2`, `w2` | +| `ln_final` | Final layer norm | `model.norm`, `transformer.ln_f`, `model.final_layernorm` | +| `unembed` | LM head | `lm_head`, `embed_out` | + +## Check for biases + +```python +# Check if a specific layer has bias +layer = model.model.layers[0] +print(f"Q bias: {layer.self_attn.q_proj.bias is not None}") +print(f"MLP in bias: {layer.mlp.up_proj.bias is not None}") +``` + +Document which layers lack biases — this affects the "Optional Parameters" section of the adapter docstring. + +## Examine state dict keys + +```python +# Print all parameter names and shapes +for key, param in model.state_dict().items(): + print(f"{key}: {param.shape}") +``` + +This helps verify: +- Weight naming patterns match your component mapping +- Tensor shapes match expected dimensions +- No unexpected parameters that need special handling + +## Find an existing similar adapter + +Check if a similar architecture already has an adapter. Most new models are variants of existing patterns: + +| If your model is like... | Start from adapter... | +|--------------------------|----------------------| +| Llama, Mistral, Qwen2, Gemma | `llama.py` | +| GPT-2, GPT-J | `gpt2.py` | +| BLOOM, Falcon | `bloom.py` or `falcon.py` | +| T5, encoder-decoder | `t5.py` | +| MoE model | `mixtral.py` or `granite_moe.py` | +| Multimodal (vision+text) | `llava.py` or `gemma3_multimodal.py` | + +## Quick reference: decision tree + +``` +1. Does the model use RMSNorm or LayerNorm? + → RMSNorm: normalization_type="RMS", use RMSNormalizationBridge + → LayerNorm: normalization_type="LN", use NormalizationBridge + +2. Does the model use RoPE or learned positional embeddings? + → RoPE: positional_embedding_type="rotary", add RotaryEmbeddingBridge, use PositionEmbeddingsAttentionBridge + → Learned: positional_embedding_type="standard", add PosEmbedBridge + +3. Are Q/K/V separate or combined? + → Separate: use PositionEmbeddingsAttentionBridge with q/k/v/o submodules + → Combined: use JointQKVAttentionBridge with qkv/o submodules + +4. Does the MLP have a gate projection? + → Yes (gate+up+down): gated_mlp=True, use GatedMLPBridge + → No (in+out): gated_mlp=False, use MLPBridge + +5. Is n_key_value_heads < n_heads? + → Yes: GQA — set n_key_value_heads on cfg + → No: standard MHA — no special handling needed +``` diff --git a/docs/source/content/contributing.md b/docs/source/content/contributing.md index df514ab7f..370a87361 100644 --- a/docs/source/content/contributing.md +++ b/docs/source/content/contributing.md @@ -158,3 +158,24 @@ must be repeated (i.e. `\\`). You can write LaTeX inline, or in "display mode". - Numbered items - `1. 
Item` - Quotes - indent one level - External links = ``` `Link text ` ``` + +## Creating Architecture Adapters + +If a HuggingFace model is not yet supported by `TransformerBridge`, you can add support by writing an Architecture Adapter. An adapter is a Python class that tells the bridge how a particular HF model maps to TransformerLens's canonical component names (`embed`, `blocks`, `attn.q`, etc.). Once registered, `TransformerBridge.boot_transformers("")` will load the model end-to-end with full hook support. + +The work is mostly bookkeeping: identify each component on the HF side (embeddings, attention, MLP, normalization), point a Bridge instance at the corresponding HF module path, and supply tensor-reshape rules where the weight layout differs from TransformerLens conventions. Most of the per-architecture decisions are already encoded in the existing adapters under `transformer_lens/model_bridge/supported_architectures/`, which are good starting points to copy from. + +Two guides walk through the process: + +- [Architecture Adapter Creation Guide](adapter_development/adapter-creation-guide.md) — start here. A step-by-step workflow for taking an HF model from unsupported to tested, registered adapter. +- [HuggingFace Model Analysis Guide](adapter_development/hf-model-analysis-guide.md) — a reference for reading an HF model's `config.json` and source files to extract the attributes you'll set on `self.cfg`. + +Adapters live in `transformer_lens/model_bridge/supported_architectures/.py` and are registered in two places: `supported_architectures/__init__.py` and `factories/architecture_adapter_factory.py`. Both steps are covered in the creation guide. If you want a starter file, copy [adapter-template.py](../_static/adapter-template.py) into `supported_architectures/` and rename it. 
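Once the adapter is registered, a quick smoke test looks something like the sketch below. This is a minimal sketch, not the canonical test flow: the `TransformerBridge` import path and the `to_tokens` convenience are assumptions — adjust to whatever the bridge actually exposes — and the model name is a placeholder.

```python
from transformer_lens import utils
from transformer_lens.model_bridge import TransformerBridge  # import path assumed

model = TransformerBridge.boot_transformers("<your-model-name>")  # placeholder

def print_shape_hook(activation, hook):
    # Confirms the canonical hook name resolves and the activation shape is sensible
    print(hook.name, tuple(activation.shape))
    return activation

loss = model.run_with_hooks(
    model.to_tokens("The adapter maps HF modules to TransformerLens names."),
    return_type="loss",
    fwd_hooks=[(utils.get_act_name("q", 0), print_shape_hook)],
)
print(f"loss: {loss.item():.3f}")
```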
+ +```{toctree} +:hidden: +:maxdepth: 1 + +adapter_development/adapter-creation-guide +adapter_development/hf-model-analysis-guide +``` From 61fdbcf78b757d31f4a0b67cb52916f74146762b Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 29 Apr 2026 16:14:33 -0500 Subject: [PATCH 2/2] ignore docs/build from black linting --- docs/source/_static/adapter-template.py | 14 +++++--------- pyproject.toml | 5 +++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/source/_static/adapter-template.py b/docs/source/_static/adapter-template.py index 54accd526..7b14928b0 100644 --- a/docs/source/_static/adapter-template.py +++ b/docs/source/_static/adapter-template.py @@ -59,10 +59,10 @@ def __init__(self, cfg: Any) -> None: self.cfg.positional_embedding_type = "rotary" # TODO: Set these flags - self.cfg.final_rms = True # True if final layer norm is RMSNorm - self.cfg.gated_mlp = True # True if MLP has gate projection (SwiGLU) - self.cfg.attn_only = False # True only for attention-only models (rare) - self.cfg.uses_rms_norm = True # Should match normalization_type + self.cfg.final_rms = True # True if final layer norm is RMSNorm + self.cfg.gated_mlp = True # True if MLP has gate projection (SwiGLU) + self.cfg.attn_only = False # True only for attention-only models (rare) + self.cfg.uses_rms_norm = True # Should match normalization_type # TODO: Set the epsilon attribute name used by this model's normalization # Check the HF model's norm layer to find the correct attribute name @@ -96,10 +96,8 @@ def __init__(self, cfg: Any) -> None: self.component_mapping = { # Token embedding "embed": EmbeddingBridge(name="model.embed_tokens"), - # Rotary position embeddings (remove if model uses standard pos embeddings) "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb"), - # Transformer blocks "blocks": BlockBridge( name="model.layers", # TODO: HF path to the layer list @@ -119,7 +117,7 @@ def __init__(self, cfg: Any) -> None: name="self_attn", # TODO: HF name within block config=self.cfg, submodules={ - "q": LinearBridge(name="q_proj"), # TODO: HF projection names + "q": LinearBridge(name="q_proj"), # TODO: HF projection names "k": LinearBridge(name="k_proj"), "v": LinearBridge(name="v_proj"), "o": LinearBridge(name="o_proj"), @@ -139,10 +137,8 @@ def __init__(self, cfg: Any) -> None: ), }, ), - # Final layer norm "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), - # Output head (unembedding) "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), } diff --git a/pyproject.toml b/pyproject.toml index 1af32755f..e20e94ce6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,11 +105,12 @@ [tool.black] line-length=100 # Set line length to 100 to match other tools - # Exclude snapshot tests & .venv + # Exclude snapshot tests, .venv, and Sphinx build output (autogenerated) exclude=''' ( /snapshots/ -| .venv/ +| \.venv/ +| docs/build/ ) '''