From 59de32593c9160db8735f301eeb94e3409779d1c Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Sun, 26 Mar 2023 23:26:23 -0700
Subject: [PATCH 1/8] modify intel opts inference script

---
 .../research_projects/intel_opts/README.md | 20 +++++++++++++++++++
 .../intel_opts/inference_bf16.py           | 18 ++++++++++++++---
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index fc606df7d170..15800e77b95a 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -11,6 +11,26 @@ We accelereate the fine-tuning for textual inversion with Intel Extension for Py
 ## Accelerating the inference for Stable Diffusion using Bfloat16
 
 We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
+```bash
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+export KMP_AFFINITY=granularity=fine,compact,1,0
+
+# Intel OpenMP
+export OMP_NUM_THREADS=< Cores to use >
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
+# Jemalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
+export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
+
+# Launch
+numactl --membind <node N> -C <cpu list> python inference_bf16.py
+# Launch with DPMSolver
+numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
+
+# Note: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
+
+```
 
 ## Accelerating the inference for Stable Diffusion using INT8
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 8431693a45c8..d17b834ab5d8 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,9 +1,15 @@
+
 import intel_extension_for_pytorch as ipex
 import torch
 from PIL import Image
+import argparse
+
+from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
-from diffusers import StableDiffusionPipeline
 
+parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
+parser.add_argument('--dpm-solver', action='store_true', help="Enable DPMSolver or not")
+args = parser.parse_args()
 
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows * cols
@@ -24,6 +30,8 @@ def image_grid(imgs, rows, cols):
 device = "cpu"
 model_id = "path-to-your-trained-model"
 model = StableDiffusionPipeline.from_pretrained(model_id)
+if args.dpm_solver:
+    model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
 model = model.to(device)
 
 # to channels last
@@ -33,7 +41,11 @@ def image_grid(imgs, rows, cols):
 model.safety_checker = model.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
-model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True)
+sample = torch.randn(2,4,64,64)
+timestep = torch.rand(1)*999
+encoder_hidden_status = torch.randn(2,77,768)
+input_example = (sample, timestep, encoder_hidden_status)
+model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
 model.vae = ipex.optimize(model.vae.eval(), dtype=torch.bfloat16, inplace=True)
 model.text_encoder = ipex.optimize(model.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
 model.safety_checker = ipex.optimize(model.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
@@ -42,7 +54,7 @@ def image_grid(imgs, rows, cols):
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, guidance_scale=7.5, num_inference_steps=50, generator=generator).images
+    images = model(prompt, generator=generator).images[0]
 
     # save image
     grid = image_grid(images, rows=2, cols=4)

From f280bfa11e794f0d67bd5839715d133b33af900f Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Sun, 26 Mar 2023 23:28:34 -0700
Subject: [PATCH 2/8] modify readme

---
 examples/research_projects/intel_opts/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 15800e77b95a..28c0d4300b41 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -28,9 +28,8 @@ numactl --membind <node N> -C <cpu list> python inference_bf16.py
 # Launch with DPMSolver
 numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
 
-# Note: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
-
 ```
+>**Note**: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
 
 ## Accelerating the inference for Stable Diffusion using INT8
 

From c683c75c3aad221c4c484a28f9c08c45ca4f3ee7 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 19:26:36 -0700
Subject: [PATCH 3/8] modify doc

---
 examples/research_projects/intel_opts/README.md         | 3 +--
 examples/research_projects/intel_opts/inference_bf16.py | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 28c0d4300b41..02299e115e35 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -23,13 +23,12 @@ export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
 
-# Launch
+# Launch with default configs
 numactl --membind <node N> -C <cpu list> python inference_bf16.py
 # Launch with DPMSolver
 numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
 
 ```
->**Note**: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
 
 ## Accelerating the inference for Stable Diffusion using INT8
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index d17b834ab5d8..c4030212a571 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -54,8 +54,9 @@ def image_grid(imgs, rows, cols):
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, generator=generator).images[0]
+    images = model(prompt, generator=generator).images
 
     # save image
     grid = image_grid(images, rows=2, cols=4)
-    grid.save(model_id + ".png")
+
+    grid.save("generated.png")

From d3712942668aa08f36db06f2ebb21b35fcb09fba Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:14:28 -0700
Subject: [PATCH 4/8] fix some issues

---
 .../research_projects/intel_opts/README.md |  8 ++-
 .../intel_opts/inference_bf16.py           | 62 +++++++++----------
 2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 02299e115e35..6b25679efbe9 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -12,6 +12,8 @@ We accelereate the fine-tuning for textual inversion with Intel Extension for Py
 
 We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
 ```bash
+pip install diffusers transformers accelerate scipy safetensors
+
 export KMP_BLOCKTIME=1
 export KMP_SETTINGS=1
 export KMP_AFFINITY=granularity=fine,compact,1,0
@@ -23,10 +25,10 @@ export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
 
-# Launch with default configs
+# Launch with default DDIM
 numactl --membind <node N> -C <cpu list> python inference_bf16.py
-# Launch with DPMSolver
-numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
+# Launch with DPMSolverMultistepScheduler
+numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm
 
 ```
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index c4030212a571..7ee858a86f0f 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,62 +1,56 @@
 
 import intel_extension_for_pytorch as ipex
 import torch
-from PIL import Image
 import argparse
 
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
 
 parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
-parser.add_argument('--dpm-solver', action='store_true', help="Enable DPMSolver or not")
+parser.add_argument('--dpm', action='store_true', help="Enable DPMSolver or not")
+parser.add_argument("--steps", default=None, type=int, help="Num inference steps")
 args = parser.parse_args()
 
-def image_grid(imgs, rows, cols):
-    assert len(imgs) == rows * cols
-
-    w, h = imgs[0].size
-    grid = Image.new("RGB", size=(cols * w, rows * h))
-    grid_w, grid_h = grid.size
-
-    for i, img in enumerate(imgs):
-        grid.paste(img, box=(i % cols * w, i // cols * h))
-    return grid
-
-
-prompt = ["a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings"]
-batch_size = 8
-prompt = prompt * batch_size
 
 device = "cpu"
+prompt = "a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings"
+
 model_id = "path-to-your-trained-model"
-model = StableDiffusionPipeline.from_pretrained(model_id)
-if args.dpm_solver:
-    model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
-model = model.to(device)
+pipe = StableDiffusionPipeline.from_pretrained(model_id)
+if args.dpm:
+    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to(device)
 
 # to channels last
-model.unet = model.unet.to(memory_format=torch.channels_last)
-model.vae = model.vae.to(memory_format=torch.channels_last)
-model.text_encoder = model.text_encoder.to(memory_format=torch.channels_last)
-model.safety_checker = model.safety_checker.to(memory_format=torch.channels_last)
+pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae = pipe.vae.to(memory_format=torch.channels_last)
+pipe.text_encoder = pipe.text_encoder.to(memory_format=torch.channels_last)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = pipe.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
 sample = torch.randn(2,4,64,64)
 timestep = torch.rand(1)*999
 encoder_hidden_status = torch.randn(2,77,768)
 input_example = (sample, timestep, encoder_hidden_status)
-model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
-model.vae = ipex.optimize(model.vae.eval(), dtype=torch.bfloat16, inplace=True)
-model.text_encoder = ipex.optimize(model.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
-model.safety_checker = ipex.optimize(model.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+try:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
+except:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.vae = ipex.optimize(pipe.vae.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.text_encoder = ipex.optimize(pipe.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = ipex.optimize(pipe.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
 
 # compute
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
+generate_kwargs = dict(generator=generator)
+if args.steps is not None:
+    generate_kwargs['num_inference_steps'] = args.steps
+
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, generator=generator).images
+    image = pipe(prompt, **generate_kwargs).images[0]
 
-    # save image
-    grid = image_grid(images, rows=2, cols=4)
-
-    grid.save("generated.png")
+# save image
+image.save("generated.png")

From ffabf539253bbec2e89e15097caa254aaac6ab73 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:15:30 -0700
Subject: [PATCH 5/8] reformat

---
 examples/research_projects/intel_opts/inference_bf16.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 7ee858a86f0f..d74b7d7ce97e 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -29,9 +29,9 @@
     pipe.safety_checker = pipe.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
-sample = torch.randn(2,4,64,64)
-timestep = torch.rand(1)*999
-encoder_hidden_status = torch.randn(2,77,768)
+sample = torch.randn(2, 4, 64, 64)
+timestep = torch.rand(1) * 999
+encoder_hidden_status = torch.randn(2, 77, 768)
 input_example = (sample, timestep, encoder_hidden_status)
 try:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)

From 17f3e6be0a848db47e32ffdc3c23834d62d853ba Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:18:58 -0700
Subject: [PATCH 6/8] reformat script

---
 examples/research_projects/intel_opts/inference_bf16.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index d74b7d7ce97e..12e8d19eaeb9 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,4 +1,3 @@
-
 import intel_extension_for_pytorch as ipex
 import torch
 import argparse
@@ -6,8 +5,8 @@
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
 
-parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
-parser.add_argument('--dpm', action='store_true', help="Enable DPMSolver or not")
+parser = argparse.ArgumentParser("Stable Diffusion script with intel optimization", add_help=False)
+parser.add_argument("--dpm", action="store_true", help="Enable DPMSolver or not")
 parser.add_argument("--steps", default=None, type=int, help="Num inference steps")
 args = parser.parse_args()
 
@@ -47,7 +46,7 @@
 generator = torch.Generator(device).manual_seed(seed)
 generate_kwargs = dict(generator=generator)
 if args.steps is not None:
-    generate_kwargs['num_inference_steps'] = args.steps
+    generate_kwargs["num_inference_steps"] = args.steps
 
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
     image = pipe(prompt, **generate_kwargs).images[0]

From 10abad2290f1d2e3f5b2b0a9542ed6fe39b3a209 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:31:48 -0700
Subject: [PATCH 7/8] format issue

---
 examples/research_projects/intel_opts/inference_bf16.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 12e8d19eaeb9..b508e3e5b239 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,8 +1,9 @@
+import argparse
+
 import intel_extension_for_pytorch as ipex
 import torch
-import argparse
 
-from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
 
 
 parser = argparse.ArgumentParser("Stable Diffusion script with intel optimization", add_help=False)
@@ -34,7 +35,7 @@
 input_example = (sample, timestep, encoder_hidden_status)
 try:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
-except:
+except Exception:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)
 pipe.vae = ipex.optimize(pipe.vae.eval(), dtype=torch.bfloat16, inplace=True)
 pipe.text_encoder = ipex.optimize(pipe.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)

From 370f5043fd4dfef7f53e12c835580b8afb788670 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:35:58 -0700
Subject: [PATCH 8/8] format issue

---
 examples/research_projects/intel_opts/inference_bf16.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index b508e3e5b239..96ec709f433c 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -45,7 +45,7 @@
 # compute
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
-generate_kwargs = dict(generator=generator)
+generate_kwargs = {"generator": generator}
 if args.steps is not None:
     generate_kwargs["num_inference_steps"] = args.steps
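
For reference, the sketch below condenses the inference pattern this series converges on: example inputs handed to `ipex.optimize`, a fallback when optimization with `sample_input` is not supported, and BF16 autocast around the pipeline call. It is an illustrative standalone sketch, not part of the patches; the checkpoint path, prompt, seed, and the 2x4x64x64 latent shape (a 512x512 UNet with classifier-free guidance) are assumptions carried over from the script above.

```python
import intel_extension_for_pytorch as ipex
import torch

from diffusers import StableDiffusionPipeline

# Standalone sketch of the final script's approach; the model path is a placeholder.
pipe = StableDiffusionPipeline.from_pretrained("path-to-your-trained-model").to("cpu")
pipe.unet = pipe.unet.to(memory_format=torch.channels_last)

# Example UNet inputs (latents, timestep, text-encoder hidden states) so IPEX can
# trace the module and pre-pack its weights for bfloat16 execution.
example_inputs = (
    torch.randn(2, 4, 64, 64),
    torch.rand(1) * 999,
    torch.randn(2, 77, 768),
)
try:
    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=example_inputs)
except Exception:
    # Fall back to weight-only optimization if tracing with the example inputs fails.
    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)

generator = torch.Generator("cpu").manual_seed(666)
with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    image = pipe("a lovely dog wearing a red hat", generator=generator).images[0]
image.save("generated.png")
```

Passing example inputs lets IPEX prepare the UNet for the shapes it will actually see instead of optimizing lazily at the first call, which is why the patches build the latent/timestep/hidden-state tuple before `ipex.optimize`; the `try`/`except Exception` fallback mirrors the patches and keeps the script usable when that path fails.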