From 59de32593c9160db8735f301eeb94e3409779d1c Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Sun, 26 Mar 2023 23:26:23 -0700
Subject: [PATCH 1/8] modify intel opts inference script

---
 .../research_projects/intel_opts/README.md | 20 +++++++++++++++++++
 .../intel_opts/inference_bf16.py           | 18 ++++++++++++++---
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index fc606df7d170..15800e77b95a 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -11,6 +11,26 @@ We accelereate the fine-tuning for textual inversion with Intel Extension for Py
 ## Accelerating the inference for Stable Diffusion using Bfloat16
 
 We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
+```bash
+export KMP_BLOCKTIME=1
+export KMP_SETTINGS=1
+export KMP_AFFINITY=granularity=fine,compact,1,0
+
+# Intel OpenMP
+export OMP_NUM_THREADS=< Cores to use >
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
+# Jemalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
+export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
+export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
+
+# Launch
+numactl --membind <node N> -C <cpu list> python inference_bf16.py
+# Launch with DPMSolver
+numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
+
+# Note: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
+
+```
 
 ## Accelerating the inference for Stable Diffusion using INT8
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 8431693a45c8..d17b834ab5d8 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,9 +1,15 @@
+
 import intel_extension_for_pytorch as ipex
 import torch
 from PIL import Image
+import argparse
+
+from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
-from diffusers import StableDiffusionPipeline
 
+parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
+parser.add_argument('--dpm-solver', action='store_true', help="Enable DPMSolver or not")
+args = parser.parse_args()
 
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows * cols
@@ -24,6 +30,8 @@ def image_grid(imgs, rows, cols):
 device = "cpu"
 model_id = "path-to-your-trained-model"
 model = StableDiffusionPipeline.from_pretrained(model_id)
+if args.dpm_solver:
+    model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
 model = model.to(device)
 
 # to channels last
@@ -33,7 +41,11 @@ def image_grid(imgs, rows, cols):
 model.safety_checker = model.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
-model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True)
+sample = torch.randn(2,4,64,64)
+timestep = torch.rand(1)*999
+encoder_hidden_status = torch.randn(2,77,768)
+input_example = (sample, timestep, encoder_hidden_status)
+model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
 model.vae = ipex.optimize(model.vae.eval(), dtype=torch.bfloat16, inplace=True)
 model.text_encoder = ipex.optimize(model.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
 model.safety_checker = ipex.optimize(model.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
@@ -42,7 +54,7 @@ def image_grid(imgs, rows, cols):
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, guidance_scale=7.5, num_inference_steps=50, generator=generator).images
+    images = model(prompt, generator=generator).images[0]
 
     # save image
     grid = image_grid(images, rows=2, cols=4)

From f280bfa11e794f0d67bd5839715d133b33af900f Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Sun, 26 Mar 2023 23:28:34 -0700
Subject: [PATCH 2/8] modify readme

---
 examples/research_projects/intel_opts/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 15800e77b95a..28c0d4300b41 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -28,9 +28,8 @@ numactl --membind <node N> -C <cpu list> python inference_bf16.py
 # Launch with DPMSolver
 numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
 
-# Note: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
-
 ```
+>**Note**: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
 
 ## Accelerating the inference for Stable Diffusion using INT8
 

From c683c75c3aad221c4c484a28f9c08c45ca4f3ee7 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 19:26:36 -0700
Subject: [PATCH 3/8] modify doc

---
 examples/research_projects/intel_opts/README.md         | 3 +--
 examples/research_projects/intel_opts/inference_bf16.py | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 28c0d4300b41..02299e115e35 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -23,13 +23,12 @@ export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
 
-# Launch
+# Launch with default configs
 numactl --membind <node N> -C <cpu list> python inference_bf16.py
 # Launch with DPMSolver
 numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
 
 ```
->**Note**: Inference performance speedup with Intel DL Boost (VNNI/AMX) on Intel(R) Xeon(R) hardware, Please refer to [Performance Tuning Guide](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html) for more optimizations.
 
 ## Accelerating the inference for Stable Diffusion using INT8
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index d17b834ab5d8..c4030212a571 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -54,8 +54,9 @@ def image_grid(imgs, rows, cols):
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, generator=generator).images[0]
+    images = model(prompt, generator=generator).images
 
     # save image
     grid = image_grid(images, rows=2, cols=4)
-    grid.save(model_id + ".png")
+
+    grid.save("generated.png")

From d3712942668aa08f36db06f2ebb21b35fcb09fba Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:14:28 -0700
Subject: [PATCH 4/8] fix some issues

---
 .../research_projects/intel_opts/README.md |  8 ++-
 .../intel_opts/inference_bf16.py           | 62 +++++++++----------
 2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/examples/research_projects/intel_opts/README.md b/examples/research_projects/intel_opts/README.md
index 02299e115e35..6b25679efbe9 100644
--- a/examples/research_projects/intel_opts/README.md
+++ b/examples/research_projects/intel_opts/README.md
@@ -12,6 +12,8 @@ We accelereate the fine-tuning for textual inversion with Intel Extension for Py
 
 We start the inference acceleration with Bfloat16 using Intel Extension for PyTorch. The [script](inference_bf16.py) is generally designed to support standard Stable Diffusion models with Bfloat16 support.
 ```bash
+pip install diffusers transformers accelerate scipy safetensors
+
 export KMP_BLOCKTIME=1
 export KMP_SETTINGS=1
 export KMP_AFFINITY=granularity=fine,compact,1,0
@@ -23,10 +25,10 @@ export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:/path/to/lib/libjemalloc.so
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:9000000000"
 
-# Launch with default configs
+# Launch with default DDIM
 numactl --membind <node N> -C <cpu list> python inference_bf16.py
-# Launch with DPMSolver
-numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm-solver
+# Launch with DPMSolverMultistepScheduler
+numactl --membind <node N> -C <cpu list> python inference_bf16.py --dpm
 
 ```
 
diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index c4030212a571..7ee858a86f0f 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,62 +1,56 @@
 
 import intel_extension_for_pytorch as ipex
 import torch
-from PIL import Image
 import argparse
 
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
 
 parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
-parser.add_argument('--dpm-solver', action='store_true', help="Enable DPMSolver or not")
+parser.add_argument('--dpm', action='store_true', help="Enable DPMSolver or not")
+parser.add_argument("--steps", default=None, type=int, help="Num inference steps")
 args = parser.parse_args()
 
-def image_grid(imgs, rows, cols):
-    assert len(imgs) == rows * cols
-
-    w, h = imgs[0].size
-    grid = Image.new("RGB", size=(cols * w, rows * h))
-    grid_w, grid_h = grid.size
-
-    for i, img in enumerate(imgs):
-        grid.paste(img, box=(i % cols * w, i // cols * h))
-    return grid
-
-
-prompt = ["a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings"]
-batch_size = 8
-prompt = prompt * batch_size
 
 device = "cpu"
+prompt = "a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings"
+
 model_id = "path-to-your-trained-model"
-model = StableDiffusionPipeline.from_pretrained(model_id)
-if args.dpm_solver:
-    model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
-model = model.to(device)
+pipe = StableDiffusionPipeline.from_pretrained(model_id)
+if args.dpm:
+    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to(device)
 
 # to channels last
-model.unet = model.unet.to(memory_format=torch.channels_last)
-model.vae = model.vae.to(memory_format=torch.channels_last)
-model.text_encoder = model.text_encoder.to(memory_format=torch.channels_last)
-model.safety_checker = model.safety_checker.to(memory_format=torch.channels_last)
+pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae = pipe.vae.to(memory_format=torch.channels_last)
+pipe.text_encoder = pipe.text_encoder.to(memory_format=torch.channels_last)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = pipe.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
 sample = torch.randn(2,4,64,64)
 timestep = torch.rand(1)*999
 encoder_hidden_status = torch.randn(2,77,768)
 input_example = (sample, timestep, encoder_hidden_status)
-model.unet = ipex.optimize(model.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
-model.vae = ipex.optimize(model.vae.eval(), dtype=torch.bfloat16, inplace=True)
-model.text_encoder = ipex.optimize(model.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
-model.safety_checker = ipex.optimize(model.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
+try:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
+except:
+    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.vae = ipex.optimize(pipe.vae.eval(), dtype=torch.bfloat16, inplace=True)
+pipe.text_encoder = ipex.optimize(pipe.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)
+if pipe.requires_safety_checker:
+    pipe.safety_checker = ipex.optimize(pipe.safety_checker.eval(), dtype=torch.bfloat16, inplace=True)
 
 # compute
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
+generate_kwargs = dict(generator=generator)
+if args.steps is not None:
+    generate_kwargs['num_inference_steps'] = args.steps
+
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    images = model(prompt, generator=generator).images
+    image = pipe(prompt, **generate_kwargs).images[0]
 
-    # save image
-    grid = image_grid(images, rows=2, cols=4)
-
-    grid.save("generated.png")
+# save image
+image.save("generated.png")

From ffabf539253bbec2e89e15097caa254aaac6ab73 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:15:30 -0700
Subject: [PATCH 5/8] reformat

---
 examples/research_projects/intel_opts/inference_bf16.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 7ee858a86f0f..d74b7d7ce97e 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -29,9 +29,9 @@
     pipe.safety_checker = pipe.safety_checker.to(memory_format=torch.channels_last)
 
 # optimize with ipex
-sample = torch.randn(2,4,64,64)
-timestep = torch.rand(1)*999
-encoder_hidden_status = torch.randn(2,77,768)
+sample = torch.randn(2, 4, 64, 64)
+timestep = torch.rand(1) * 999
+encoder_hidden_status = torch.randn(2, 77, 768)
 input_example = (sample, timestep, encoder_hidden_status)
 try:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)

From 17f3e6be0a848db47e32ffdc3c23834d62d853ba Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:18:58 -0700
Subject: [PATCH 6/8] reformat script

---
 examples/research_projects/intel_opts/inference_bf16.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index d74b7d7ce97e..12e8d19eaeb9 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,4 +1,3 @@
-
 import intel_extension_for_pytorch as ipex
 import torch
 import argparse
@@ -6,8 +5,8 @@
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 
 
-parser = argparse.ArgumentParser('Stable Diffusion script with intel optimization', add_help=False)
-parser.add_argument('--dpm', action='store_true', help="Enable DPMSolver or not")
+parser = argparse.ArgumentParser("Stable Diffusion script with intel optimization", add_help=False)
+parser.add_argument("--dpm", action="store_true", help="Enable DPMSolver or not")
 parser.add_argument("--steps", default=None, type=int, help="Num inference steps")
 args = parser.parse_args()
 
@@ -47,7 +46,7 @@
 generator = torch.Generator(device).manual_seed(seed)
 generate_kwargs = dict(generator=generator)
 if args.steps is not None:
-    generate_kwargs['num_inference_steps'] = args.steps
+    generate_kwargs["num_inference_steps"] = args.steps
 
 with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
     image = pipe(prompt, **generate_kwargs).images[0]

From 10abad2290f1d2e3f5b2b0a9542ed6fe39b3a209 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:31:48 -0700
Subject: [PATCH 7/8] format issue

---
 examples/research_projects/intel_opts/inference_bf16.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index 12e8d19eaeb9..b508e3e5b239 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -1,8 +1,9 @@
+import argparse
+
 import intel_extension_for_pytorch as ipex
 import torch
-import argparse
 
-from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
 
 
 parser = argparse.ArgumentParser("Stable Diffusion script with intel optimization", add_help=False)
@@ -34,7 +35,7 @@
 input_example = (sample, timestep, encoder_hidden_status)
 try:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=input_example)
-except:
+except Exception:
     pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)
 pipe.vae = ipex.optimize(pipe.vae.eval(), dtype=torch.bfloat16, inplace=True)
 pipe.text_encoder = ipex.optimize(pipe.text_encoder.eval(), dtype=torch.bfloat16, inplace=True)

From 370f5043fd4dfef7f53e12c835580b8afb788670 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 29 Mar 2023 20:35:58 -0700
Subject: [PATCH 8/8] format issue

---
 examples/research_projects/intel_opts/inference_bf16.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/research_projects/intel_opts/inference_bf16.py b/examples/research_projects/intel_opts/inference_bf16.py
index b508e3e5b239..96ec709f433c 100644
--- a/examples/research_projects/intel_opts/inference_bf16.py
+++ b/examples/research_projects/intel_opts/inference_bf16.py
@@ -45,7 +45,7 @@
 # compute
 seed = 666
 generator = torch.Generator(device).manual_seed(seed)
-generate_kwargs = dict(generator=generator)
+generate_kwargs = {"generator": generator}
 if args.steps is not None:
     generate_kwargs["num_inference_steps"] = args.steps
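
For reference, the sketch below condenses the inference pattern this series converges on: example inputs handed to `ipex.optimize`, a fallback when optimization with `sample_input` is not supported, and BF16 autocast around the pipeline call. It is an illustrative standalone sketch, not part of the patches; the checkpoint path, prompt, seed, and the 2x4x64x64 latent shape (a 512x512 UNet with classifier-free guidance) are assumptions carried over from the script above.

```python
import intel_extension_for_pytorch as ipex
import torch

from diffusers import StableDiffusionPipeline

# Standalone sketch of the final script's approach; the model path is a placeholder.
pipe = StableDiffusionPipeline.from_pretrained("path-to-your-trained-model").to("cpu")
pipe.unet = pipe.unet.to(memory_format=torch.channels_last)

# Example UNet inputs (latents, timestep, text-encoder hidden states) so IPEX can
# trace the module and pre-pack its weights for bfloat16 execution.
example_inputs = (
    torch.randn(2, 4, 64, 64),
    torch.rand(1) * 999,
    torch.randn(2, 77, 768),
)
try:
    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=example_inputs)
except Exception:
    # Fall back to weight-only optimization if tracing with the example inputs fails.
    pipe.unet = ipex.optimize(pipe.unet.eval(), dtype=torch.bfloat16, inplace=True)

generator = torch.Generator("cpu").manual_seed(666)
with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    image = pipe("a lovely dog wearing a red hat", generator=generator).images[0]
image.save("generated.png")
```

Passing example inputs lets IPEX prepare the UNet for the shapes it will actually see instead of optimizing lazily at the first call, which is why the patches build the latent/timestep/hidden-state tuple before `ipex.optimize`; the `try`/`except Exception` fallback mirrors the patches and keeps the script usable when that path fails.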