From 7f05bdc9fe176113de668fe8913f2c3fd33a6e72 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 11 Mar 2026 13:42:45 -0700 Subject: [PATCH] tp --- docs/source/en/_toctree.yml | 2 + docs/source/en/perf_infer_gpu_multi.md | 2 + docs/source/en/tensor_parallelism.md | 101 +++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 docs/source/en/tensor_parallelism.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 301dab27ec3e..b9f66011de80 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -202,6 +202,8 @@ title: FullyShardedDataParallel - local: deepspeed title: DeepSpeed + - local: tensor_parallelism + title: Tensor parallelism - local: debugging title: Multi-GPU debugging - local: perf_train_cpu_many diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 19f793d8775f..9bf3a3707ecc 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -309,3 +309,5 @@ The `placement` attribute tells PyTorch how to place a tensor on devices in `Dev - Check the [expert parallelism](./expert_parallelism) guide if you're using a mixture-of-experts (MoE) model. These models support tensor parallelism and expert parallelism. - Read the [Tensor Parallelism (TP) in Transformers: 5 Minutes to Understand](https://huggingface.co/blog/qgallouedec/tp) blog post for a quick overview of tensor parallelism and learn how column and row parallel setups differ. + +- See the [Tensor parallelism](./tensor_parallelism) training guide to learn how to use it in a training setting. diff --git a/docs/source/en/tensor_parallelism.md b/docs/source/en/tensor_parallelism.md new file mode 100644 index 000000000000..85ff479283ae --- /dev/null +++ b/docs/source/en/tensor_parallelism.md @@ -0,0 +1,101 @@ + + +# Tensor parallelism + +Tensor parallelism (TP) splits weight matrices column-wise or row-wise across GPUs. 
Each GPU holds a shard, computes a partial result, and synchronizes with an all-reduce to produce the full output. + +TP relies on frequent cross-GPU communication. It works best on hardware with fast intra-node links such as NVLink. + +```text + ┌─────────────────────────────┐ + │ X (replicated) │ + └────┬──────────┬─────────┬───┘ + │ │ │ + ┌────▼───┐ ┌────▼───┐ ┌───▼────┐ + │ ▓▓▓ W₀ │ │ ░░░ W₁ │ │ ███ W₂ │ + │ X@W₀ │ │ X@W₁ │ │ X@W₂ │ + └────┬───┘ └────┬───┘ └───┬────┘ + └──────────┼─────────┘ + Y₀+Y₁+Y₂ + ┌────────────────────────────┐ + │ Y (full) │ + └────────────────────────────┘ +``` + +Transformers supports TP for architectures whose config defines `base_model_tp_plan`. Check that field first to see whether a model supports native TP. + +```py +from transformers import AutoConfig + +config = AutoConfig.from_pretrained("Qwen/Qwen3-0.6B") +print(config.base_model_tp_plan is not None) +print(config.base_model_tp_plan) +``` + +If a model supports TP, set `tp_plan="auto"` in [`~PreTrainedModel.from_pretrained`]. Transformers initializes the device mesh and shards the supported layers for you. + +> [!WARNING] +> Don't use `device_map` with `tp_plan`. The two conflict at the weight-loading level. `device_map` places whole modules on specific GPUs, while `tp_plan` shards those same parameters across all GPUs. + +```py +import torch + +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen3-0.6B", + dtype=torch.bfloat16, + tp_plan="auto", +) +``` + +[`Trainer`] detects `tp_plan`, reads `tp_size` from the model, and creates a [`~accelerate.parallelism_config.ParallelismConfig`] automatically. + +Launch training on one node with 4 GPUs. + +```shell +torchrun --nproc-per-node 4 train_tp.py +``` + +## ParallelismConfig + +Pass [`~accelerate.parallelism_config.ParallelismConfig`] explicitly when combining TP with other parallelism techniques like [FSDP](./fsdp). 
```py
import torch

from accelerate import ParallelismConfig
from transformers import AutoModelForCausalLM, TrainingArguments

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    dtype=torch.bfloat16,
    tp_plan="auto",
)

# tp_size must match the number of GPUs you launch with
# (for example, torchrun --nproc-per-node 4).
parallelism_config = ParallelismConfig(tp_size=4)

args = TrainingArguments(
    ...,
    parallelism_config=parallelism_config,
)
```

## Next steps

- Read the [Tensor Parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) chapter from The Ultra-Scale Playbook for more details about how it works.
- Read the [tensor parallelism inference guide](./perf_infer_gpu_multi) to learn more about partitioning strategies, manual TP plans, and implementation details.