From 949aad301747d8200c6c8f74ff230b5a1eea7604 Mon Sep 17 00:00:00 2001
From: Thai-Hoa Nguyen
Date: Tue, 6 Aug 2024 01:43:29 -0400
Subject: [PATCH 1/5] clipping for fp16

---
 src/diffusers/models/transformers/transformer_flux.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py
index 391ca1418d34..182e60e15530 100644
--- a/src/diffusers/models/transformers/transformer_flux.py
+++ b/src/diffusers/models/transformers/transformer_flux.py
@@ -125,6 +125,8 @@ def forward(
         gate = gate.unsqueeze(1)
         hidden_states = gate * self.proj_out(hidden_states)
         hidden_states = residual + hidden_states
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
 
         return hidden_states
 
@@ -223,6 +225,8 @@ def forward(
 
         context_ff_output = self.ff_context(norm_encoder_hidden_states)
         encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
 
         return encoder_hidden_states, hidden_states
 

From f214ad4df938b316c6761a87166de6da75c5398f Mon Sep 17 00:00:00 2001
From: Thai-Hoa Nguyen
Date: Tue, 6 Aug 2024 01:43:58 -0400
Subject: [PATCH 2/5] fix typo

---
 docs/source/en/api/pipelines/flux.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md
index 095bf76af37f..efe48d61348b 100644
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -37,7 +37,7 @@ Both checkpoints have slightly difference usage which we detail below.
 
 ```python
 import torch
-from diffusers import FluxPipeline 
+from diffusers import FluxPipeline
 
 pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
 pipe.enable_model_cpu_offload()
@@ -61,7 +61,7 @@ out.save("image.png")
 
 ```python
 import torch
-from diffusers import FluxPipeline 
+from diffusers import FluxPipeline
 
 pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
 pipe.enable_model_cpu_offload()

From 5f153a7b312eb0dfc6eb47a0f18f1cb5bdca5194 Mon Sep 17 00:00:00 2001
From: Thai-Hoa Nguyen
Date: Tue, 6 Aug 2024 22:23:21 -0400
Subject: [PATCH 3/5] added fp16 inference to docs

---
 docs/source/en/api/pipelines/flux.md | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md
index efe48d61348b..865fc1d2a0e8 100644
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -77,6 +77,34 @@ out = pipe(
 out.save("image.png")
 ```
 
+### Running FP16 inference
+Flux can run inference in FP16 mode (i.e. to accelerate inference on Turing/Volta GPUs) but produces the different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16. Forcing text encoders to run with FP32 inference thus removes this output difference.
+
+FP16 inference code:
+```python
+import torch
+from diffusers import FluxPipeline
+
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) # can replace schnell with dev
+# to run on low vram GPUs (i.e. between 4 and 32 GB VRAM)
+pipe.enable_sequential_cpu_offload()
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+
+pipe.to(torch.float16) # casting here instead of in the pipeline constructor because doing so in the constructor loads all models into CPU memory at once
+
+prompt = "A cat holding a sign that says hello world"
+out = pipe(
+    prompt=prompt,
+    guidance_scale=0.,
+    height=768,
+    width=1360,
+    num_inference_steps=4,
+    max_sequence_length=256,
+).images[0]
+out.save("image.png")
+```
+
 ## FluxPipeline
 
 [[autodoc]] FluxPipeline

From fbe9b8aed98c1a984e29b50ae21abb44f06fca47 Mon Sep 17 00:00:00 2001
From: Thai-Hoa Nguyen
Date: Tue, 6 Aug 2024 22:47:55 -0400
Subject: [PATCH 4/5] fix docs typo

---
 docs/source/en/api/pipelines/flux.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md
index 865fc1d2a0e8..cc9947fe16c1 100644
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -78,7 +78,7 @@ out.save("image.png")
 ```
 
 ### Running FP16 inference
-Flux can run inference in FP16 mode (i.e. to accelerate inference on Turing/Volta GPUs) but produces the different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16. Forcing text encoders to run with FP32 inference thus removes this output difference.
+Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference.
 
 FP16 inference code:
 ```python
@@ -109,4 +109,4 @@ out.save("image.png")
 
 [[autodoc]] FluxPipeline
   - all
-  - __call__
\ No newline at end of file
+  - __call__

From 547f70feec04b7a588f661ad610f633b7c1239e5 Mon Sep 17 00:00:00 2001
From: Thai-Hoa Nguyen
Date: Wed, 7 Aug 2024 00:14:29 -0400
Subject: [PATCH 5/5] include link for fp16 investigation

---
 docs/source/en/api/pipelines/flux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md
index cc9947fe16c1..16d405cc910f 100644
--- a/docs/source/en/api/pipelines/flux.md
+++ b/docs/source/en/api/pipelines/flux.md
@@ -78,7 +78,7 @@ out.save("image.png")
 ```
 
 ### Running FP16 inference
-Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference.
+Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
 
 FP16 inference code:
 ```python
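
Note: both hunks in PATCH 1/5 apply the same guard. After the residual addition, activations are clamped to the largest finite FP16 value (65504) so they cannot overflow to `inf` in half precision. Below is a minimal standalone sketch of that pattern; the helper name and tensors are illustrative, not part of the patch.

```python
import torch

FP16_MAX = torch.finfo(torch.float16).max  # 65504.0, the largest finite float16 value


def clip_fp16_overflow(x: torch.Tensor) -> torch.Tensor:
    # Only float16 needs the guard; float32 and bfloat16 have a far larger dynamic range.
    if x.dtype == torch.float16:
        x = x.clip(-FP16_MAX, FP16_MAX)
    return x


# A residual addition whose result would overflow to inf without the clamp.
residual = torch.full((2, 4), 60000.0, dtype=torch.float16)
update = torch.full((2, 4), 10000.0, dtype=torch.float16)
out = clip_fp16_overflow(residual + update)
assert torch.isfinite(out).all()  # values are pinned to 65504 instead of becoming inf
```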