From 239d42db574a457fb883d1de4bf16d9de74117dd Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Wed, 30 Nov 2022 11:14:40 +0000
Subject: [PATCH 01/14] Update Inpaint pipeline

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 67 +++++++++++++++++++
 .../cpp/pipeline_stable_diffusion_inpaint.h   | 57 ++++++++++++++++
 .../stable_diffusion/cpp/scheduler.h          |  1 +
 3 files changed, 125 insertions(+)
 create mode 100644 examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
 create mode 100644 examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
new file mode 100644
index 00000000000..5d4c7b81cda
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pipeline_stable_diffusion_inpaint.h"
+
+using namespace paddlenlp;
+
+namespace fastdeploy {
+
+StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
+    std::unique_ptr<Runtime> vae_encoder, std::unique_ptr<Runtime> vae_decoder,
+    std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
+    std::unique_ptr<Scheduler> scheduler,
+    const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
+        tokenizer)
+    : vae_encoder_(std::move(vae_encoder)),
+      vae_decoder_(std::move(vae_decoder)),
+      text_encoder_(std::move(text_encoder)), unet_(std::move(unet)),
+      scheduler_(std::move(scheduler)), tokenizer_(tokenizer) {}
+
+void StableDiffusionInpaintPipeline::Predict(
+    const std::vector<std::string>& prompts, cv::Mat* image,
+    cv::Mat* mask_image, FDTensor* output_image, int height, int width,
+    int num_inference_steps, float guidance_scale,
+    const std::vector<std::string>& negative_prompt, int num_images_per_prompt,
+    float eta, uint32_t max_length, const FDTensor* latents,
+    callback_ptr callback, int callback_steps) {
+  int batch_size = prompts.size();
+  FDASSERT(batch_size >= 1, "prompts should not be empty");
+  FDASSERT(
+      height % 8 == 0 && width % 8 == 0,
+      "`height` and `width` have to be divisible by 8 but are {%d} and {%d}.",
+      height, width);
+  FDASSERT(callback_steps > 0,
+           "`callback_steps` has to be a positive integer but is {%d}",
+           callback_steps);
+
+  scheduler_->SetTimesteps(num_inference_steps);
+
+  // Setting tokenizer attr
+  if (max_length == 0) {
+    tokenizer_.EnablePadMethod(fast_tokenizer::core::RIGHT,
+                               tokenizer_.GetPadTokenId(), 0,
+                               tokenizer_.GetPadToken(), nullptr, nullptr);
+    tokenizer_.DisableTruncMethod();
+  } else {
+    tokenizer_.EnablePadMethod(fast_tokenizer::core::RIGHT,
+                               tokenizer_.GetPadTokenId(), 0,
+                               tokenizer_.GetPadToken(), &max_length, nullptr);
+    tokenizer_.EnableTruncMethod(max_length, 0, fast_tokenizer::core::RIGHT,
+                                 fast_tokenizer::core::LONGEST_FIRST);
+  }
+  std::vector<fast_tokenizer::core::Encoding> encodings;
+  tokenizer_.EncodeBatchStrings(prompts, &encodings);
+}
+}  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
new file mode 100644
index 00000000000..04ad288adff
--- /dev/null
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "./scheduler.h"
+#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
+#include "fastdeploy/core/fd_tensor.h"
+#include "fastdeploy/runtime.h"
+#include "opencv2/core/core.hpp"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace fastdeploy {
+
+class StableDiffusionInpaintPipeline {
+ public:
+  typedef void (*callback_ptr)(int, int, FDTensor*);
+
+  StableDiffusionInpaintPipeline(
+      std::unique_ptr<Runtime> vae_encoder,
+      std::unique_ptr<Runtime> vae_decoder,
+      std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
+      std::unique_ptr<Scheduler> scheduler,
+      const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
+          tokenizer);
+  void Predict(const std::vector<std::string>& prompts, cv::Mat* image,
+               cv::Mat* mask_image, FDTensor* output_image, int height = 512,
+               int width = 512, int num_inference_steps = 50,
+               float guidance_scale = 7.5,
+               const std::vector<std::string>& negative_prompt = {},
+               int num_images_per_prompt = 1, float eta = 0.0,
+               uint32_t max_length = 77, const FDTensor* latents = nullptr,
+               callback_ptr callback = nullptr, int callback_steps = 1);
+
+ private:
+  std::unique_ptr<Runtime> vae_encoder_;
+  std::unique_ptr<Runtime> vae_decoder_;
+  std::unique_ptr<Runtime> text_encoder_;
+  std::unique_ptr<Runtime> unet_;
+  std::unique_ptr<Scheduler> scheduler_;
+  paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer_;
+};
+
+}  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/scheduler.h b/examples/multimodal/stable_diffusion/cpp/scheduler.h
index 6a5cd2fed9c..6933fe3c87a 100644
--- a/examples/multimodal/stable_diffusion/cpp/scheduler.h
+++ b/examples/multimodal/stable_diffusion/cpp/scheduler.h
@@ -19,6 +19,7 @@
 namespace fastdeploy {
 
 class Scheduler {
+ public:
   virtual void SetTimesteps(int num_inference_steps) = 0;
   virtual void Step(const FDTensor& model_output, int timestep,
                     const FDTensor& sample, FDTensor* prev_sample) = 0;

From 9dbbaa3bedd16888014e15e8fb961831b943276c Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Wed, 30 Nov 2022 17:45:51 +0000
Subject: [PATCH 02/14] Update concat

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 55 +++++++++++++++++++
 fastdeploy/function/concat.cc                 |  8 ++-
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 5d4c7b81cda..077d96c7edf 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "pipeline_stable_diffusion_inpaint.h"
+#include "fastdeploy/function/functions.h"
 
 using namespace paddlenlp;
 
@@ -63,5 +64,59 @@ void StableDiffusionInpaintPipeline::Predict(
   }
   std::vector<fast_tokenizer::core::Encoding> encodings;
   tokenizer_.EncodeBatchStrings(prompts, &encodings);
+
+  std::vector<int64_t> input_ids;
+  for (auto& encoding : encodings) {
+    auto curr_ids = encoding.GetIds();
+    input_ids.insert(input_ids.end(), curr_ids.begin(), curr_ids.end());
+  }
+  encodings.clear();
+  // Get text encoder output
+  FDTensor text_intput_ids;
+  std::vector<FDTensor> text_inputs(1);
+  text_inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
+                                 input_ids.data());
+
+  TensorInfo text_info = text_encoder_->GetInputInfo(0);
+  text_inputs[0].name = text_info.name;
+  int output_size = text_encoder_->GetOutputInfos().size();
+  std::vector<FDTensor> text_outputs(output_size);
+  text_encoder_->Infer(text_inputs, &text_outputs);
+
+  FDTensor text_embeddings;
+  function::Tile(text_outputs[0], {num_images_per_prompt, 1, 1},
+                 &text_embeddings);
+
+  //    here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+  //    of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+  //    corresponds to doing no classifier free guidance.
+  bool do_classifier_free_guidance = guidance_scale > 1.0;
+  if (do_classifier_free_guidance) {
+    std::vector<std::string> uncond_tokens;
+    if (negative_prompt.size() == 0) {
+      uncond_tokens.assign(batch_size, "");  // one empty prompt per batch row
+    } else if (negative_prompt.size() != batch_size) {
+      FDASSERT(false,
+               "negative_prompt has batch size %d, but prompt has batch size "
+               "%d. Please make sure that passed `negative_prompt` matches the "
+               "batch size of `prompt`.",
+               static_cast<int>(negative_prompt.size()), batch_size);
+    } else {
+      uncond_tokens = negative_prompt;
+    }
+    tokenizer_.EncodeBatchStrings(uncond_tokens, &encodings);
+    input_ids.clear();
+    for (auto& encoding : encodings) {
+      auto curr_ids = encoding.GetIds();
+      input_ids.insert(input_ids.end(), curr_ids.begin(), curr_ids.end());
+    }
+    text_inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
+                                   input_ids.data());
+    text_encoder_->Infer(text_inputs, &text_outputs);
+    FDTensor uncond_embeddings;
+    function::Tile(text_outputs[0], {num_images_per_prompt, 1, 1},
+                   &uncond_embeddings);
+    function::Concat({uncond_embeddings, text_embeddings}, &text_embeddings);
+  }
 }
 }  // namespace fastdeploy
diff --git a/fastdeploy/function/concat.cc b/fastdeploy/function/concat.cc
index 295c3c25a43..4f07743942d 100644
--- a/fastdeploy/function/concat.cc
+++ b/fastdeploy/function/concat.cc
@@ -88,11 +88,13 @@ template <typename T>
 void ConcatKernel(const std::vector<FDTensor>& input, FDTensor* output,
                   int axis) {
   auto output_shape = ComputeAndCheckConcatOutputShape(input, axis);
-  output->Resize(output_shape, TypeToDataType<T>::dtype, output->name,
-                 input[0].device);
+  FDTensor output_tmp;
+  output_tmp.Resize(output_shape, TypeToDataType<T>::dtype, output->name,
+                    input[0].device);
 
   ConcatFunctor<T> functor;
-  functor(input, axis, output);
+  functor(input, axis, &output_tmp);
+  *output = std::move(output_tmp);
 }
 
 void Concat(const std::vector<FDTensor>& x, FDTensor* out, int axis) {

From 895e5d009449451fae88bb78cf4a410567b0e370 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 03:32:11 +0000
Subject: [PATCH 03/14] Add GaussianRandomKernel

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 16 +++++++
 fastdeploy/function/gaussian_random.cc        | 46 +++++++++++++++++++
 fastdeploy/function/gaussian_random.h         | 36 +++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 fastdeploy/function/gaussian_random.cc
 create mode 100644 fastdeploy/function/gaussian_random.h

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 077d96c7edf..04fd6f78aaa 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -14,11 +14,15 @@
 
 #include "pipeline_stable_diffusion_inpaint.h"
 #include "fastdeploy/function/functions.h"
+#include <algorithm>
 
 using namespace paddlenlp;
 
 namespace fastdeploy {
 
+static constexpr int NUM_LATENT_CHANNELS = 4;
+static constexpr int NUM_UNET_INPUT_CHANNELS = 9;
+
 StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
     std::unique_ptr<Runtime> vae_encoder, std::unique_ptr<Runtime> vae_decoder,
     std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
@@ -118,5 +122,17 @@ void StableDiffusionInpaintPipeline::Predict(
                    &uncond_embeddings);
     function::Concat({uncond_embeddings, text_embeddings}, &text_embeddings);
   }
+  std::vector<int64_t> latents_shape = {batch_size * num_images_per_prompt,
+                                        NUM_LATENT_CHANNELS, height / 8,
+                                        width / 8};
+  auto latents_dtype = text_embeddings.Dtype();
+  if (latents == nullptr) {
+
+  } else if {
+    bool result = std::equals(latents_shape.begin(), latents_shape.end(),
+                              latents->Shape().begin());
+    FDASSERT(result, "Unexpected latents shape, got %s, expected %s",
+             Str(latents_shape).c_str(), Str(latents->Shape()).c_str());
+  }
 }
 }  // namespace fastdeploy
diff --git a/fastdeploy/function/gaussian_random.cc b/fastdeploy/function/gaussian_random.cc
new file mode 100644
index 00000000000..18657c4f2a3
--- /dev/null
+++ b/fastdeploy/function/gaussian_random.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/function/gaussian_random.h"
+#include <memory>
+#include <random>
+#include <utility>
+
+namespace fastdeploy {
+namespace function {
+
+template <typename T>  // Fills `out` with normally distributed values of T.
+void GaussianRandomKernel(const std::vector<int64_t>& shape, float mean,
+                          float std, int seed, FDTensor* out) {
+  std::normal_distribution<T> dist(mean, std);  // `std` is the stddev param
+
+  out->Allocate(shape, TypeToDataType<T>::dtype);
+  int64_t size = out->Numel();
+  T* data = reinterpret_cast<T*>(out->Data());
+  std::mt19937_64 engine;
+  engine.seed(seed);  // caller-supplied seed drives the whole sequence
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = dist(engine);
+  }
+}
+
+void GaussianRandom(const std::vector<int64_t>& shape, FDTensor* out,
+                    FDDataType dtype, float mean, float std, int seed) {
+  FD_VISIT_FLOAT_TYPES(dtype, "GaussianRandomKernel", [&]() {
+    GaussianRandomKernel<data_t>(shape, mean, std, seed, out);
+  });
+}
+
+}  // namespace function
+}  // namespace fastdeploy
\ No newline at end of file
diff --git a/fastdeploy/function/gaussian_random.h b/fastdeploy/function/gaussian_random.h
new file mode 100644
index 00000000000..85a4ff8a638
--- /dev/null
+++ b/fastdeploy/function/gaussian_random.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fastdeploy/core/fd_tensor.h"
+
+namespace fastdeploy {
+namespace function {
+
+/** Fill the output tensor with random values sampled from a Gaussian
+ *  (normal) distribution.
+    @param shape The output tensor shape.
+    @param out The output tensor; it is allocated with `shape` and `dtype`.
+    @param mean The mean of the Gaussian distribution.
+    @param std The standard deviation of the Gaussian distribution.
+    @param seed The seed of random generator.
+    @param dtype The data type of the output Tensor.
+*/
+void GaussianRandom(const std::vector<int64_t>& shape, FDTensor* out,
+                    FDDataType dtype = FDDataType::FP32, float mean = 0.0f,
+                    float std = 1.0f, int seed = 0);
+
+}  // namespace function
+}  // namespace fastdeploy

From 97179617513806e11bfc9eef21f6b6579e0a370f Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 03:44:23 +0000
Subject: [PATCH 04/14] Update GaussianRandom

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc              | 8 +++++---
 fastdeploy/function/functions.h                           | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 04fd6f78aaa..25f0ab5bd78 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -126,13 +126,15 @@ void StableDiffusionInpaintPipeline::Predict(
                                         NUM_LATENT_CHANNELS, height / 8,
                                         width / 8};
   auto latents_dtype = text_embeddings.Dtype();
+  FDTensor actual_latents;
   if (latents == nullptr) {
-
-  } else if {
-    bool result = std::equals(latents_shape.begin(), latents_shape.end(),
+    function::GaussianRandom(latents_shape, &actual_latents, latents_dtype);
+  } else {
+    bool result = std::equal(latents_shape.begin(), latents_shape.end(),
                               latents->Shape().begin());
     FDASSERT(result, "Unexpected latents shape, got %s, expected %s",
              Str(latents_shape).c_str(), Str(latents->Shape()).c_str());
+    actual_latents = *latents;
   }
 }
 }  // namespace fastdeploy
diff --git a/fastdeploy/function/functions.h b/fastdeploy/function/functions.h
index d2ffe6a0c1d..a43407839fa 100644
--- a/fastdeploy/function/functions.h
+++ b/fastdeploy/function/functions.h
@@ -21,6 +21,7 @@
 #include "fastdeploy/function/elementwise.h"
 #include "fastdeploy/function/full.h"
 #include "fastdeploy/function/gather_scatter_along_axis.h"
+#include "fastdeploy/function/gaussian_random.h"
 #include "fastdeploy/function/isfinite.h"
 #include "fastdeploy/function/linspace.h"
 #include "fastdeploy/function/math.h"

From 4fcaa549c8df8345c6151ed2ba60fdbf5f477ff5 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 06:49:25 +0000
Subject: [PATCH 05/14] Add vae encoder

---
 .../cpp/dpm_solver_multistep_scheduler.cc     |  2 +
 .../cpp/dpm_solver_multistep_scheduler.h      |  1 +
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 80 +++++++++++++++----
 .../cpp/pipeline_stable_diffusion_inpaint.h   |  3 +
 .../stable_diffusion/cpp/scheduler.h          |  1 +
 fastdeploy/function/tile.cc                   |  5 +-
 6 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
index cb6cf970b7b..8b96d836664 100644
--- a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
+++ b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
@@ -96,6 +96,8 @@ DPMSolverMultistepScheduler::DPMSolverMultistepScheduler(
   lower_order_nums_ = 0;
 }
 
+float DPMSolverMultistepScheduler::InitNoiseSigma() { return 1.0; }
+
 void DPMSolverMultistepScheduler::ConvertModelOutput(
     const FDTensor& model_output, int timestep, const FDTensor& sample,
     FDTensor* out) {
diff --git a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
index c6f037feead..6924eb80278 100644
--- a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
+++ b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
@@ -54,6 +54,7 @@ class DPMSolverMultistepScheduler : public Scheduler {
                        const std::vector<FDTensor>& timesteps = {}) override;
   void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
                 const FDTensor& timesteps, FDTensor* out) override;
+  float InitNoiseSigma() override;
   struct Config {
     int num_train_timesteps_;
     float beta_start_;
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 25f0ab5bd78..fc401d4a203 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -23,6 +23,10 @@ namespace fastdeploy {
 static constexpr int NUM_LATENT_CHANNELS = 4;
 static constexpr int NUM_UNET_INPUT_CHANNELS = 9;
 
+void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
+    cv::Mat* image, cv::Mat* mask_mat, const std::vector<int64_t>& shape,
+    FDTensor* mask, FDTensor* mask_image) {}
+
 StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
     std::unique_ptr<Runtime> vae_encoder, std::unique_ptr<Runtime> vae_decoder,
     std::unique_ptr<Runtime> text_encoder, std::unique_ptr<Runtime> unet,
@@ -77,23 +81,22 @@ void StableDiffusionInpaintPipeline::Predict(
   encodings.clear();
   // Get text encoder output
   FDTensor text_intput_ids;
-  std::vector<FDTensor> text_inputs(1);
-  text_inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
-                                 input_ids.data());
+  std::vector<FDTensor> inputs(1);
+  inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
+                            input_ids.data());
 
   TensorInfo text_info = text_encoder_->GetInputInfo(0);
-  text_inputs[0].name = text_info.name;
+  inputs[0].name = text_info.name;
   int output_size = text_encoder_->GetOutputInfos().size();
-  std::vector<FDTensor> text_outputs(output_size);
-  text_encoder_->Infer(text_inputs, &text_outputs);
+  std::vector<FDTensor> outputs(output_size);
+  text_encoder_->Infer(inputs, &outputs);
 
   FDTensor text_embeddings;
-  function::Tile(text_outputs[0], {num_images_per_prompt, 1, 1},
-                 &text_embeddings);
+  function::Tile(outputs[0], {num_images_per_prompt, 1, 1}, &text_embeddings);
 
-  //    here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-  //    of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-  //    corresponds to doing no classifier free guidance.
+  // here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+  // of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+  // corresponds to doing no classifier free guidance.
   bool do_classifier_free_guidance = guidance_scale > 1.0;
   if (do_classifier_free_guidance) {
     std::vector<std::string> uncond_tokens;
@@ -114,11 +117,11 @@ void StableDiffusionInpaintPipeline::Predict(
       auto curr_ids = encoding.GetIds();
       input_ids.insert(input_ids.end(), curr_ids.begin(), curr_ids.end());
     }
-    text_inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
-                                   input_ids.data());
-    text_encoder_->Infer(text_inputs, &text_outputs);
+    inputs[0].SetExternalData({batch_size, max_length}, FDDataType::INT64,
+                              input_ids.data());
+    text_encoder_->Infer(inputs, &outputs);
     FDTensor uncond_embeddings;
-    function::Tile(text_outputs[0], {num_images_per_prompt, 1, 1},
+    function::Tile(outputs[0], {num_images_per_prompt, 1, 1},
                    &uncond_embeddings);
     function::Concat({uncond_embeddings, text_embeddings}, &text_embeddings);
   }
@@ -131,10 +134,55 @@ void StableDiffusionInpaintPipeline::Predict(
     function::GaussianRandom(latents_shape, &actual_latents, latents_dtype);
   } else {
     bool result = std::equal(latents_shape.begin(), latents_shape.end(),
-                              latents->Shape().begin());
+                             latents->Shape().begin());
     FDASSERT(result, "Unexpected latents shape, got %s, expected %s",
              Str(latents_shape).c_str(), Str(latents->Shape()).c_str());
     actual_latents = *latents;
   }
+  FDTensor mask_t, mask_image_t;
+  PrepareMaskAndMaskedImage(image, mask_image, {height / 8, width / 8}, &mask_t,
+                            &mask_image_t);
+  function::Cast(mask_t, &mask_t, actual_latents.Dtype());
+  function::Cast(mask_image_t, &mask_image_t, actual_latents.Dtype());
+
+  // Get vae encoder output
+  TensorInfo vae_encoder_info = vae_encoder_->GetInputInfo(0);
+  mask_image_t.name = vae_encoder_info.name;
+  outputs.resize(vae_encoder_->GetOutputInfos().size());
+  inputs = {mask_image_t};
+  vae_encoder_->Infer(inputs, &outputs);
+  FDTensor masked_image_latents = 0.18215 * outputs[0];
+
+  std::vector<int64_t> mask_shape(mask_t.Shape().size(), 1);
+  mask_shape[0] = batch_size * num_images_per_prompt;
+  function::Tile(mask_t, mask_shape, &mask_t);
+  // Tile takes per-axis repeat counts, not a shape; only axis 0 is repeated.
+  std::vector<int64_t> mask_image_shape(mask_image_t.Shape().size(), 1);
+  mask_image_shape[0] = batch_size * num_images_per_prompt;
+  function::Tile(mask_image_t, mask_image_shape, &mask_image_t);
+
+  if (do_classifier_free_guidance) {
+    function::Concat({mask_t, mask_t}, &mask_t);
+    function::Concat({mask_image_t, mask_image_t}, &mask_image_t);
+  }
+  int num_channels_mask = mask_t.Shape()[1];
+  int num_channels_masked_image = mask_image_t.Shape()[1];
+  FDASSERT(
+      NUM_LATENT_CHANNELS + num_channels_mask + num_channels_masked_image ==
+          NUM_UNET_INPUT_CHANNELS,
+      "Incorrect configuration settings! The config of `pipeline.unet` expects"
+      " {%d} but received `num_channels_latents`: %d + `num_channels_mask`: %d "
+      "+ `num_channels_masked_image`: %d"
+      " = %d. Please verify the config of `pipeline.unet` or your `mask_image` "
+      "or `image` input.",
+      NUM_UNET_INPUT_CHANNELS, NUM_LATENT_CHANNELS, num_channels_mask,
+      num_channels_masked_image,
+      NUM_LATENT_CHANNELS + num_channels_mask + num_channels_masked_image);
+
+  // set timesteps
+  scheduler_->SetTimesteps(num_inference_steps);
+
+  // scale the initial noise by the standard deviation required by the scheduler
+  actual_latents = actual_latents * scheduler_->InitNoiseSigma();
 }
 }  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
index 04ad288adff..9f0fec35ec9 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
@@ -46,6 +46,9 @@ class StableDiffusionInpaintPipeline {
                callback_ptr callback = nullptr, int callback_steps = 1);
 
  private:
+  void PrepareMaskAndMaskedImage(cv::Mat* image, cv::Mat* mask_mat,
+                                 const std::vector<int64_t>& shape,
+                                 FDTensor* mask, FDTensor* mask_image);
   std::unique_ptr<Runtime> vae_encoder_;
   std::unique_ptr<Runtime> vae_decoder_;
   std::unique_ptr<Runtime> text_encoder_;
diff --git a/examples/multimodal/stable_diffusion/cpp/scheduler.h b/examples/multimodal/stable_diffusion/cpp/scheduler.h
index 6933fe3c87a..432bfeea5c1 100644
--- a/examples/multimodal/stable_diffusion/cpp/scheduler.h
+++ b/examples/multimodal/stable_diffusion/cpp/scheduler.h
@@ -27,6 +27,7 @@ class Scheduler {
                                const std::vector<FDTensor>& timesteps = {}) = 0;
   virtual void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
                         const FDTensor& timesteps, FDTensor* out) = 0;
+  virtual float InitNoiseSigma() = 0;
 };
 
 }  // namespace fastdeploy
diff --git a/fastdeploy/function/tile.cc b/fastdeploy/function/tile.cc
index 6437b4ec603..143d9d6dbef 100644
--- a/fastdeploy/function/tile.cc
+++ b/fastdeploy/function/tile.cc
@@ -49,6 +49,7 @@ void TileFunctor(const FDTensor& x,
     return;
   }
 
+  FDTensor out_tmp;
   Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
   for (size_t i = 0; i < repeat_times.size(); ++i) {
     bcast_dims[i] = repeat_times[i];
@@ -59,12 +60,14 @@ void TileFunctor(const FDTensor& x,
     out_shape[i] *= repeat_times[i];
   }
 
-  out->Allocate(out_shape, x.Dtype());
+  out_tmp.Allocate(out_shape, x.Dtype());
   auto eigen_x = EigenTensor<T, Rank>::From(x, x_shape);
-  auto eigen_out = EigenTensor<T, Rank>::From(*out, out_shape);
+  auto eigen_out = EigenTensor<T, Rank>::From(out_tmp, out_shape);
 
   const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
   eigen_out.device(dev) = eigen_x.broadcast(bcast_dims);
+
+  *out = std::move(out_tmp);  // publish last, so `out` may alias `x`
 }
 
 template <typename T>

From 400d6b5e89242c21cbdc0b8f237b87a956e64497 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 07:56:40 +0000
Subject: [PATCH 06/14] Add unet infer

---
 .../cpp/dpm_solver_multistep_scheduler.cc     |  2 +
 .../cpp/dpm_solver_multistep_scheduler.h      |  1 +
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 48 ++++++++++++++++++-
 .../stable_diffusion/cpp/scheduler.h          |  1 +
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
index 8b96d836664..b6ac2220014 100644
--- a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
+++ b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
@@ -394,4 +394,6 @@ void DPMSolverMultistepScheduler::AddNoise(const FDTensor& original_samples,
   *out = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise;
 }
 
+FDTensor DPMSolverMultistepScheduler::GetTimesteps() { return timesteps_; }
+
 }  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
index 6924eb80278..0775ba1ee79 100644
--- a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
+++ b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.h
@@ -55,6 +55,7 @@ class DPMSolverMultistepScheduler : public Scheduler {
   void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
                 const FDTensor& timesteps, FDTensor* out) override;
   float InitNoiseSigma() override;
+  FDTensor GetTimesteps() override;
   struct Config {
     int num_train_timesteps_;
     float beta_start_;
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index fc401d4a203..7813f9dbbe7 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -55,8 +55,6 @@ void StableDiffusionInpaintPipeline::Predict(
            "`callback_steps` has to be a positive integer but is {%d}",
            callback_steps);
 
-  scheduler_->SetTimesteps(num_inference_steps);
-
   // Setting tokenizer attr
   if (max_length == 0) {
     tokenizer_.EnablePadMethod(fast_tokenizer::core::RIGHT,
@@ -184,5 +182,51 @@ void StableDiffusionInpaintPipeline::Predict(
 
   // scale the initial noise by the standard deviation required by the scheduler
   actual_latents = actual_latents * scheduler_->InitNoiseSigma();
+
+  auto timestep = scheduler_->GetTimesteps();
+  int64_t* timestep_data = reinterpret_cast<int64_t*>(timestep.Data());
+  for (int i = 0; i < timestep.Numel(); ++i) {
+    FDTensor t;
+    function::Slice(timestep, {0}, {i}, &t);
+    // expand the latents if we are doing classifier free guidance
+    FDTensor latent_model_input;
+    if (do_classifier_free_guidance) {
+      function::Concat({actual_latents, actual_latents}, &latent_model_input);
+    } else {
+      latent_model_input = actual_latents;
+    }
+    // concat latents, mask, masked_image_latnets in the channel dimension
+    function::Concat({latent_model_input, mask_t, mask_image_t},
+                     &latent_model_input, 1);
+    scheduler_->ScaleModelInput(latent_model_input, &latent_model_input, {t});
+
+    // predict the noise residual
+    FDTensor noise_pred;
+    auto unet_infos = unet_->GetInputInfos();
+    latent_model_input.name = unet_infos[0].name;
+    t.name = unet_infos[1].name;
+    text_embeddings.name = unet_infos[2].name;
+    outputs.resize(unet_->GetOutputInfos().size());
+    inputs = {latent_model_input, t, text_embeddings};
+    unet_->Infer(inputs, &outputs);
+    noise_pred = std::move(outputs[0]);
+    // perform guidance
+    if (do_classifier_free_guidance) {
+      std::vector<FDTensor> noise_preds;
+      int dim0 = noise_pred.Shape()[0];
+      function::Split(noise_pred, {dim0 - dim0 / 2, dim0 / 2}, &noise_preds);
+      noise_pred =
+          noise_preds[0] + guidance_scale * (noise_preds[1] - noise_preds[0]);
+    }
+
+    // compute the previous noisy sample x_t -> x_t-1
+    int64_t time = reinterpret_cast<int64_t*>(t.Data())[0];
+    scheduler_->Step(noise_pred, time, actual_latents, &actual_latents);
+
+    // call the callback, if provided
+    if (callback != nullptr && i % callback_steps == 0) {
+      callback(i, time, &actual_latents);
+    }
+  }
 }
 }  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/scheduler.h b/examples/multimodal/stable_diffusion/cpp/scheduler.h
index 432bfeea5c1..e4dc452def8 100644
--- a/examples/multimodal/stable_diffusion/cpp/scheduler.h
+++ b/examples/multimodal/stable_diffusion/cpp/scheduler.h
@@ -21,6 +21,7 @@ namespace fastdeploy {
 class Scheduler {
  public:
   virtual void SetTimesteps(int num_inference_steps) = 0;
+  virtual FDTensor GetTimesteps() = 0;
   virtual void Step(const FDTensor& model_output, int timestep,
                     const FDTensor& sample, FDTensor* prev_sample) = 0;
   virtual void ScaleModelInput(const FDTensor& sample, FDTensor* out,

From 90e40c7ca6a197ee5394cec4406505f1f8b3db31 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 08:40:10 +0000
Subject: [PATCH 07/14] Add vae decoder predict

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 37 +++++++++++++++++--
 .../cpp/pipeline_stable_diffusion_inpaint.h   |  7 ++--
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 7813f9dbbe7..3bc871193e7 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -40,10 +40,10 @@ StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
 
 void StableDiffusionInpaintPipeline::Predict(
     const std::vector<std::string>& prompts, cv::Mat* image,
-    cv::Mat* mask_image, FDTensor* output_image, int height, int width,
-    int num_inference_steps, float guidance_scale,
+    cv::Mat* mask_image, std::vector<FDTensor>* output_images, int height,
+    int width, int num_inference_steps, float guidance_scale,
     const std::vector<std::string>& negative_prompt, int num_images_per_prompt,
-    float eta, uint32_t max_length, const FDTensor* latents,
+    float eta, uint32_t max_length, const FDTensor* latents, bool output_cv_mat,
     callback_ptr callback, int callback_steps) {
   int batch_size = prompts.size();
   FDASSERT(batch_size >= 1, "prompts should not be empty");
@@ -227,6 +227,37 @@ void StableDiffusionInpaintPipeline::Predict(
     if (callback != nullptr && i % callback_steps == 0) {
       callback(i, time, &actual_latents);
     }
+    actual_latents = (1.0f / 0.18215f) * actual_latents;
+
+    // Get vae decoder output
+    int actual_latents_bs = actual_latents.Shape()[0];
+    TensorInfo vae_decoder_info = vae_decoder_->GetInputInfo(0);
+    inputs.resize(1);
+    outputs.resize(vae_decoder_->GetOutputInfos().size());
+    std::vector<FDTensor> decoder_reuslt;
+    for (int i = 0; i < actual_latents_bs; ++i) {
+      function::Slice(actual_latents, {0}, {i}, {i + 1}, &inputs[0]);
+      inputs[0].name = vae_decoder_info.name;
+      vae_decoder_->Infer(inputs, &outputs);
+      decoder_reuslt.emplace_back(std::move(outputs[0]));
+    }
+    FDTensor output_image;
+    function::Concat(decoder_reuslt, &output_image);
+
+    function::Clip(output_image / 2.0f + 0.5f, 0, 1, &output_image);
+    function::Transpose(output_image, &output_image, {0, 2, 3, 1});
+
+    if (output_cv_mat) {
+      output_image = output_image * 255.0f;
+      function::Round(output_image, &output_image);
+      function::Cast(output_image, &output_image, FDDataType::UINT8);
+    }
+
+    int output_batch_size = output_image.Shape()[0];
+    output_images->resize(output_batch_size);
+    for (int i = 0; i < output_batch_size; ++i) {
+      function::Slice(output_image, {0}, {i}, &(*output_images)[i]);
+    }
   }
 }
 }  // namespace fastdeploy
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
index 9f0fec35ec9..8fbbb901370 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
@@ -37,13 +37,14 @@ class StableDiffusionInpaintPipeline {
       const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
           tokenizer);
   void Predict(const std::vector<std::string>& prompts, cv::Mat* image,
-               cv::Mat* mask_image, FDTensor* output_image, int height = 512,
-               int width = 512, int num_inference_steps = 50,
+               cv::Mat* mask_image, std::vector<FDTensor>* output_images,
+               int height = 512, int width = 512, int num_inference_steps = 50,
                float guidance_scale = 7.5,
                const std::vector<std::string>& negative_prompt = {},
                int num_images_per_prompt = 1, float eta = 0.0,
                uint32_t max_length = 77, const FDTensor* latents = nullptr,
-               callback_ptr callback = nullptr, int callback_steps = 1);
+               bool output_cv_mat = true, callback_ptr callback = nullptr,
+               int callback_steps = 1);
 
  private:
   void PrepareMaskAndMaskedImage(cv::Mat* image, cv::Mat* mask_mat,

From 0133c229ffbf689fe4a72e965d63c00c6f231530 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 13:17:32 +0000
Subject: [PATCH 08/14] Add PrepareMaskAndMaskedImage

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 56 +++++++++++++++++--
 .../cpp/pipeline_stable_diffusion_inpaint.h   |  6 +-
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 3bc871193e7..4fca51afb17 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -14,6 +14,9 @@
 
 #include "pipeline_stable_diffusion_inpaint.h"
 #include "fastdeploy/function/functions.h"
+#include "fastdeploy/vision/common/processors/color_space_convert.h"
+#include "fastdeploy/vision/common/processors/mat.h"
+#include "fastdeploy/vision/common/processors/resize.h"
 #include <algorithm>
 
 using namespace paddlenlp;
@@ -24,8 +27,53 @@ static constexpr int NUM_LATENT_CHANNELS = 4;
 static constexpr int NUM_UNET_INPUT_CHANNELS = 9;
 
 void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
-    cv::Mat* image, cv::Mat* mask_mat, const std::vector<int64_t>& shape,
-    FDTensor* mask, FDTensor* mask_image) {}
+    const cv::Mat& image, const cv::Mat& mask_mat,
+    const std::vector<int64_t>& shape, FDTensor* mask, FDTensor* mask_image) {
+  vision::FDMat image_fdmat(image);
+  vision::BGR2RGB::Run(&image_fdmat, vision::ProcLib::OPENCV);
+  vision::Resize::Run(&image_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
+                      cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
+  image_fdmat.CopyToTensor(mask_image);
+
+  vision::FDMat mask_fdmat(mask_mat);
+  vision::BGR2GRAY::Run(&mask_fdmat, vision::ProcLib::OPENCV);
+  vision::Resize::Run(&mask_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
+                      cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
+  FDTensor image_mask;
+  mask_fdmat.CopyToTensor(&image_mask);
+  function::Cast(image_mask, &image_mask, FDDataType::FP32);
+  std::vector<float> float_mask(image_mask.Numel(), 0);
+  float* image_mask_ptr = reinterpret_cast<float*>(image_mask.Data());
+  for (int i = 0; i < image_mask.Numel(); ++i) {
+    if (image_mask_ptr[i] < 127.5) {
+      float_mask[i] = 1;
+    }
+  }
+  image_mask.SetExternalData({1, 1, shape[1] * 8, shape[0] * 8},
+                             FDDataType::FP32, float_mask.data());
+
+  // Set mask_image
+  mask_image->ExpandDim();
+  function::Transpose(*mask_image, mask_image, {0, 3, 1, 2});
+  function::Cast(*mask_image, mask_image, FDDataType::FP32);
+  *mask_image = *mask_image / 127.5f - 1.0f;
+  *mask_image = *mask_image * image_mask;
+
+  // Set mask
+  mask_fdmat.CopyToTensor(mask);
+  function::Cast(*mask, mask, FDDataType::FP32);
+  *mask = *mask / 255.0f;
+  mask->ExpandDim();
+  mask->ExpandDim();
+  float* mask_data = reinterpret_cast<float*>(mask->Data());
+  for (int i = 0; i < mask->Numel(); ++i) {
+    if (mask_data[i] < 0.5) {
+      mask_data[i] = 0;
+    } else {
+      mask_data[i] = 1;
+    }
+  }
+}
 
 StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
     std::unique_ptr<Runtime> vae_encoder, std::unique_ptr<Runtime> vae_decoder,
@@ -39,8 +87,8 @@ StableDiffusionInpaintPipeline::StableDiffusionInpaintPipeline(
       scheduler_(std::move(scheduler)), tokenizer_(tokenizer) {}
 
 void StableDiffusionInpaintPipeline::Predict(
-    const std::vector<std::string>& prompts, cv::Mat* image,
-    cv::Mat* mask_image, std::vector<FDTensor>* output_images, int height,
+    const std::vector<std::string>& prompts, const cv::Mat& image,
+    const cv::Mat& mask_image, std::vector<FDTensor>* output_images, int height,
     int width, int num_inference_steps, float guidance_scale,
     const std::vector<std::string>& negative_prompt, int num_images_per_prompt,
     float eta, uint32_t max_length, const FDTensor* latents, bool output_cv_mat,
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
index 8fbbb901370..063c370f354 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.h
@@ -36,8 +36,8 @@ class StableDiffusionInpaintPipeline {
       std::unique_ptr<Scheduler> scheduler,
       const paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer&
           tokenizer);
-  void Predict(const std::vector<std::string>& prompts, cv::Mat* image,
-               cv::Mat* mask_image, std::vector<FDTensor>* output_images,
+  void Predict(const std::vector<std::string>& prompts, const cv::Mat& image,
+               const cv::Mat& mask_image, std::vector<FDTensor>* output_images,
                int height = 512, int width = 512, int num_inference_steps = 50,
                float guidance_scale = 7.5,
                const std::vector<std::string>& negative_prompt = {},
@@ -47,7 +47,7 @@ class StableDiffusionInpaintPipeline {
                int callback_steps = 1);
 
  private:
-  void PrepareMaskAndMaskedImage(cv::Mat* image, cv::Mat* mask_mat,
+  void PrepareMaskAndMaskedImage(const cv::Mat& image, const cv::Mat& mask_mat,
                                  const std::vector<int64_t>& shape,
                                  FDTensor* mask, FDTensor* mask_image);
   std::unique_ptr<Runtime> vae_encoder_;

From 50784e3f07d2015c26e2f50e188e71cf81c704aa Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 17:10:23 +0000
Subject: [PATCH 09/14] Add imwrite example; fix inverted assertions and Tile output

---
 .../multimodal/stable_diffusion/cpp/main.cc   | 121 ++++++++++++++++--
 .../cpp/pipeline_stable_diffusion_inpaint.cc  |  31 +++--
 fastdeploy/function/tile.cc                   |   2 +-
 fastdeploy/utils/utils.h                      |  77 +++++------
 4 files changed, 166 insertions(+), 65 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/main.cc b/examples/multimodal/stable_diffusion/cpp/main.cc
index 3c7d33029fd..c7701198193 100644
--- a/examples/multimodal/stable_diffusion/cpp/main.cc
+++ b/examples/multimodal/stable_diffusion/cpp/main.cc
@@ -13,23 +13,116 @@
 // limitations under the License.
 
 #include "dpm_solver_multistep_scheduler.h"
+#include "fastdeploy/vision/common/processors/mat.h"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "pipeline_stable_diffusion_inpaint.h"
 #include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+
+template <typename T> std::string Str(const T* value, int size) {
+  std::ostringstream oss;
+  oss << "[ " << value[0];
+  for (int i = 1; i < size; ++i) {
+    oss << " ," << value[i];
+  }
+  oss << " ]";
+  return oss.str();
+}
+
+std::unique_ptr<fastdeploy::Runtime>
+CreateRuntime(const std::string& model_file, const std::string& params_file,
+              bool use_paddle_backend = true) {
+  fastdeploy::RuntimeOption runtime_option;
+  runtime_option.SetModelPath(model_file, params_file,
+                              fastdeploy::ModelFormat::PADDLE);
+  runtime_option.UseGpu();
+  if (use_paddle_backend) {
+    runtime_option.UsePaddleBackend();
+  } else {
+    runtime_option.UseOrtBackend();
+  }
+  std::unique_ptr<fastdeploy::Runtime> runtime =
+      std::unique_ptr<fastdeploy::Runtime>(new fastdeploy::Runtime());
+  if (!runtime->Init(runtime_option)) {
+    std::cerr << "--- Init FastDeploy Runitme Failed! "
+              << "\n--- Model:  " << model_file << std::endl;
+    return nullptr;
+  } else {
+    std::cout << "--- Init FastDeploy Runitme Done! "
+              << "\n--- Model:  " << model_file << std::endl;
+  }
+  return runtime;
+}
 
 int main() {
-  fastdeploy::DPMSolverMultistepScheduler dpm(
-      /* num_train_timesteps */ 1000,
-      /* beta_start = */ 0.00085,
-      /* beta_end = */ 0.012,
-      /* beta_schedule = */ "scaled_linear",
-      /* trained_betas = */ {},
-      /* solver_order = */ 2,
-      /* predict_epsilon = */ true,
-      /* thresholding = */ false,
-      /* dynamic_thresholding_ratio = */ 0.995,
-      /* sample_max_value = */ 1.0,
-      /* algorithm_type = */ "dpmsolver++",
-      /* solver_type = */ "midpoint",
-      /* lower_order_final = */ true);
+  // 1. Init scheduler
+  std::unique_ptr<fastdeploy::Scheduler> dpm(
+      new fastdeploy::DPMSolverMultistepScheduler(
+          /* num_train_timesteps */ 1000,
+          /* beta_start = */ 0.00085,
+          /* beta_end = */ 0.012,
+          /* beta_schedule = */ "scaled_linear",
+          /* trained_betas = */ {},
+          /* solver_order = */ 2,
+          /* predict_epsilon = */ true,
+          /* thresholding = */ false,
+          /* dynamic_thresholding_ratio = */ 0.995,
+          /* sample_max_value = */ 1.0,
+          /* algorithm_type = */ "dpmsolver++",
+          /* solver_type = */ "midpoint",
+          /* lower_order_final = */ true));
+
+  // 2. Init text encoder runtime
+  std::string text_model_file = "sd15_inpaint/text_encoder/inference.pdmodel";
+  std::string text_params_file =
+      "sd15_inpaint/text_encoder/inference.pdiparams";
+  std::unique_ptr<fastdeploy::Runtime> text_encoder_runtime =
+      CreateRuntime(text_model_file, text_params_file, false);
+
+  // 3. Init vae encoder runtime
+  std::string vae_encoder_model_file =
+      "sd15_inpaint/vae_encoder/inference.pdmodel";
+  std::string vae_encoder_params_file =
+      "sd15_inpaint/vae_encoder/inference.pdiparams";
+  std::unique_ptr<fastdeploy::Runtime> vae_encoder_runtime =
+      CreateRuntime(vae_encoder_model_file, vae_encoder_params_file);
+
+  // 4. Init vae decoder runtime
+  std::string vae_decoder_model_file =
+      "sd15_inpaint/vae_decoder/inference.pdmodel";
+  std::string vae_decoder_params_file =
+      "sd15_inpaint/vae_decoder/inference.pdiparams";
+  std::unique_ptr<fastdeploy::Runtime> vae_decoder_runtime =
+      CreateRuntime(vae_decoder_model_file, vae_decoder_params_file);
+
+  // 5. Init unet runtime
+  std::string unet_model_file = "sd15_inpaint/unet/inference.pdmodel";
+  std::string unet_params_file = "sd15_inpaint/unet/inference.pdiparams";
+  std::unique_ptr<fastdeploy::Runtime> unet_runtime =
+      CreateRuntime(unet_model_file, unet_params_file);
+
+  // 6. Init fast tokenizer
+  paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer(
+      "clip/vocab.json", "clip/merges.txt", /* max_length = */ 77);
+  fastdeploy::StableDiffusionInpaintPipeline pipe(
+      std::move(vae_encoder_runtime), std::move(vae_decoder_runtime),
+      std::move(text_encoder_runtime), std::move(unet_runtime),
+      /* scheduler = */ std::move(dpm), tokenizer);
+
+  // 7. Read images
+  auto image = cv::imread("overture-creations.png");
+  auto mask_image = cv::imread("overture-creations-mask.png");
 
+  // 8. Predict
+  std::vector<std::string> prompts = {
+      "Face of a yellow cat, high resolution, sitting on a park bench"};
+  std::vector<fastdeploy::FDTensor> outputs;
+  pipe.Predict(prompts, image, mask_image, &outputs, /* height = */ 512,
+               /* width = */ 512, /* num_inference_steps = */ 50);
+  fastdeploy::vision::FDMat mat = fastdeploy::vision::FDMat::Create(outputs[0]);
+  cv::imwrite("cat_on_bench_new.png", *mat.GetOpenCVMat());
   return 0;
 }
\ No newline at end of file
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 4fca51afb17..deb69bf0a1e 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -33,14 +33,14 @@ void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
   vision::BGR2RGB::Run(&image_fdmat, vision::ProcLib::OPENCV);
   vision::Resize::Run(&image_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
                       cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
-  image_fdmat.CopyToTensor(mask_image);
+  image_fdmat.ShareWithTensor(mask_image);
 
   vision::FDMat mask_fdmat(mask_mat);
   vision::BGR2GRAY::Run(&mask_fdmat, vision::ProcLib::OPENCV);
   vision::Resize::Run(&mask_fdmat, shape[1] * 8, shape[0] * 8, -1.0f, -1.0f,
                       cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
   FDTensor image_mask;
-  mask_fdmat.CopyToTensor(&image_mask);
+  mask_fdmat.ShareWithTensor(&image_mask);
   function::Cast(image_mask, &image_mask, FDDataType::FP32);
   std::vector<float> float_mask(image_mask.Numel(), 0);
   float* image_mask_ptr = reinterpret_cast<float*>(image_mask.Data());
@@ -60,11 +60,15 @@ void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
   *mask_image = *mask_image * image_mask;
 
   // Set mask
-  mask_fdmat.CopyToTensor(mask);
+  vision::FDMat mask_fdmat_t(mask_mat);
+  vision::BGR2GRAY::Run(&mask_fdmat_t, vision::ProcLib::OPENCV);
+  vision::Resize::Run(&mask_fdmat_t, shape[1], shape[0], -1.0f, -1.0f,
+                      cv::INTER_NEAREST, false, vision::ProcLib::OPENCV);
+  mask_fdmat_t.ShareWithTensor(mask);
   function::Cast(*mask, mask, FDDataType::FP32);
   *mask = *mask / 255.0f;
   mask->ExpandDim();
-  mask->ExpandDim();
+  function::Transpose(*mask, mask, {0, 3, 1, 2});
   float* mask_data = reinterpret_cast<float*>(mask->Data());
   for (int i = 0; i < mask->Numel(); ++i) {
     if (mask_data[i] < 0.5) {
@@ -96,10 +100,10 @@ void StableDiffusionInpaintPipeline::Predict(
   int batch_size = prompts.size();
   FDASSERT(batch_size >= 1, "prompts should not be empty");
   FDASSERT(
-      height % 8 != 0 or width % 8 != 0,
+      height % 8 == 0 && width % 8 == 0,
       "`height` and `width` have to be divisible by 8 but are {%d} and {%d}.",
       height, width);
-  FDASSERT(callback_steps <= 0,
+  FDASSERT(callback_steps > 0,
            "`callback_steps` has to be a positive integer but is {%d}",
            callback_steps);
 
@@ -199,25 +203,26 @@ void StableDiffusionInpaintPipeline::Predict(
   vae_encoder_->Infer(inputs, &outputs);
   FDTensor masked_image_latents = 0.18215 * outputs[0];
 
-  auto mask_shape = mask_t.Shape();
+  std::vector<int64_t> mask_shape(mask_t.Shape().size(), 1);
   mask_shape[0] = batch_size * num_images_per_prompt;
   function::Tile(mask_t, mask_shape, &mask_t);
 
-  auto mask_image_shape = mask_image_t.Shape();
+  std::vector<int64_t> mask_image_shape(masked_image_latents.Shape().size(), 1);
   mask_image_shape[0] = batch_size * num_images_per_prompt;
-  function::Tile(mask_image_t, mask_image_shape, &mask_image_t);
+  function::Tile(masked_image_latents, mask_image_shape, &masked_image_latents);
 
   if (do_classifier_free_guidance) {
     function::Concat({mask_t, mask_t}, &mask_t);
-    function::Concat({mask_image_t, mask_image_t}, &mask_image_t);
+    function::Concat({masked_image_latents, masked_image_latents},
+                     &masked_image_latents);
   }
   int num_channels_mask = mask_t.Shape()[1];
-  int num_channels_masked_image = mask_image_t.Shape()[1];
+  int num_channels_masked_image = masked_image_latents.Shape()[1];
   FDASSERT(
       NUM_LATENT_CHANNELS + num_channels_mask + num_channels_masked_image ==
           NUM_UNET_INPUT_CHANNELS,
       "Incorrect configuration settings! The config of `pipeline.unet` expects"
-      " {%d} but received `num_channels_latents`: %d + `num_channels_mask`: %d "
+      " %d but received `num_channels_latents`: %d + `num_channels_mask`: %d "
       "+ `num_channels_masked_image`: %d"
       " = %d. Please verify the config of `pipeline.unet` or your `mask_image` "
       "or `image` input.",
@@ -244,7 +249,7 @@ void StableDiffusionInpaintPipeline::Predict(
       latent_model_input = actual_latents;
     }
     // concat latents, mask, masked_image_latnets in the channel dimension
-    function::Concat({latent_model_input, mask_t, mask_image_t},
+    function::Concat({latent_model_input, mask_t, masked_image_latents},
                      &latent_model_input, 1);
     scheduler_->ScaleModelInput(latent_model_input, &latent_model_input, {t});
 
diff --git a/fastdeploy/function/tile.cc b/fastdeploy/function/tile.cc
index 143d9d6dbef..c6e3095c6fb 100644
--- a/fastdeploy/function/tile.cc
+++ b/fastdeploy/function/tile.cc
@@ -62,7 +62,7 @@ void TileFunctor(const FDTensor& x,
 
   out_tmp.Allocate(out_shape, x.Dtype());
   auto eigen_x = EigenTensor<T, Rank>::From(x, x_shape);
-  auto eigen_out = EigenTensor<T, Rank>::From(*out, out_shape);
+  auto eigen_out = EigenTensor<T, Rank>::From(out_tmp, out_shape);
 
   const auto& dev = *EigenDeviceWrapper::GetInstance()->GetDevice();
   eigen_out.device(dev) = eigen_x.broadcast(bcast_dims);
diff --git a/fastdeploy/utils/utils.h b/fastdeploy/utils/utils.h
index 9b2a0fe2013..0dff28f8be0 100644
--- a/fastdeploy/utils/utils.h
+++ b/fastdeploy/utils/utils.h
@@ -66,7 +66,8 @@ class FASTDEPLOY_DECL FDLogger {
     if (!verbose_ && line_ != "") {
       std::cout << line_ << std::endl;
 #ifdef __ANDROID__
-      __android_log_print(ANDROID_LOG_INFO, prefix_.c_str(), "%s", line_.c_str());
+      __android_log_print(ANDROID_LOG_INFO, prefix_.c_str(), "%s",
+                          line_.c_str());
 #endif
     }
   }
@@ -122,6 +123,8 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
   [&] {                                                                        \
     const auto& __dtype__ = TYPE;                                              \
     switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::UINT8, uint8_t,     \
+                           __VA_ARGS__)                                        \
       FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::BOOL, bool,         \
                            __VA_ARGS__)                                        \
       FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,     \
@@ -141,26 +144,26 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
     }                                                                          \
   }()
 
-#define FD_VISIT_INT_FLOAT_TYPES(TYPE, NAME, ...)                             \
-  [&] {                                                                       \
-    const auto& __dtype__ = TYPE;                                             \
-    switch (__dtype__) {                                                      \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,       \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,      \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::UINT8, uint8_t,    \
-                           __VA_ARGS__)                                       \
-      default:                                                                \
-        FDASSERT(false,                                                       \
-                 "Invalid enum data type. Expect to accept data type INT32, " \
-                 "INT64, FP32, FP64, UINT8 but receive type %s.",             \
-                 Str(__dtype__).c_str());                                     \
-    }                                                                         \
+#define FD_VISIT_INT_FLOAT_TYPES(TYPE, NAME, ...)                              \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP32, float,        \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::FP64, double,       \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::UINT8, uint8_t,     \
+                           __VA_ARGS__)                                        \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data type INT32, "    \
+               "INT64, FP32, FP64, UINT8 but receive type %s.",                \
+               Str(__dtype__).c_str());                                        \
+    }                                                                          \
   }()
 
 #define FD_VISIT_FLOAT_TYPES(TYPE, NAME, ...)                                  \
@@ -179,22 +182,22 @@ FASTDEPLOY_DECL bool ReadBinaryFromFile(const std::string& file,
     }                                                                          \
   }()
 
-#define FD_VISIT_INT_TYPES(TYPE, NAME, ...)                                   \
-  [&] {                                                                       \
-    const auto& __dtype__ = TYPE;                                             \
-    switch (__dtype__) {                                                      \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,    \
-                           __VA_ARGS__)                                       \
-      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::UINT8, uint8_t,    \
-                           __VA_ARGS__)                                       \
-      default:                                                                \
-        FDASSERT(false,                                                       \
-                 "Invalid enum data type. Expect to accept data type INT32, " \
-                 "INT64, UINT8 but receive type %s.",                         \
-                 Str(__dtype__).c_str());                                     \
-    }                                                                         \
+#define FD_VISIT_INT_TYPES(TYPE, NAME, ...)                                    \
+  [&] {                                                                        \
+    const auto& __dtype__ = TYPE;                                              \
+    switch (__dtype__) {                                                       \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT32, int32_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::INT64, int64_t,     \
+                           __VA_ARGS__)                                        \
+      FD_PRIVATE_CASE_TYPE(NAME, ::fastdeploy::FDDataType::UINT8, uint8_t,     \
+                           __VA_ARGS__)                                        \
+    default:                                                                   \
+      FDASSERT(false,                                                          \
+               "Invalid enum data type. Expect to accept data type INT32, "    \
+               "INT64, UINT8 but receive type %s.",                            \
+               Str(__dtype__).c_str());                                        \
+    }                                                                          \
   }()
 
 FASTDEPLOY_DECL std::vector<int64_t>

From 1ed6e29fa8fb329986a902779c5e484977700d7d Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 17:23:54 +0000
Subject: [PATCH 10/14] Add time counter

---
 examples/multimodal/stable_diffusion/cpp/main.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/multimodal/stable_diffusion/cpp/main.cc b/examples/multimodal/stable_diffusion/cpp/main.cc
index c7701198193..62bcdfd1db6 100644
--- a/examples/multimodal/stable_diffusion/cpp/main.cc
+++ b/examples/multimodal/stable_diffusion/cpp/main.cc
@@ -14,6 +14,7 @@
 
 #include "dpm_solver_multistep_scheduler.h"
 #include "fastdeploy/vision/common/processors/mat.h"
+#include "fastdeploy/utils/perf.h"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "pipeline_stable_diffusion_inpaint.h"
@@ -120,8 +121,12 @@ int main() {
   std::vector<std::string> prompts = {
       "Face of a yellow cat, high resolution, sitting on a park bench"};
   std::vector<fastdeploy::FDTensor> outputs;
+  fastdeploy::TimeCounter tc;
+  tc.Start();
   pipe.Predict(prompts, image, mask_image, &outputs, /* height = */ 512,
                /* width = */ 512, /* num_inference_steps = */ 50);
+  tc.End();
+  tc.PrintInfo();
   fastdeploy::vision::FDMat mat = fastdeploy::vision::FDMat::Create(outputs[0]);
   cv::imwrite("cat_on_bench_new.png", *mat.GetOpenCVMat());
   return 0;

From c3e686f6d445b5b30e8f6ce5cfe98230eea19786 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 17:32:40 +0000
Subject: [PATCH 11/14] Fix pipeline: decode latents once after the timestep loop

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index deb69bf0a1e..49b967f5af6 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -280,37 +280,37 @@ void StableDiffusionInpaintPipeline::Predict(
     if (callback != nullptr && i % callback_steps == 0) {
       callback(i, time, &actual_latents);
     }
-    actual_latents = (1.0f / 0.18215f) * actual_latents;
+  }
+  actual_latents = (1.0f / 0.18215f) * actual_latents;
 
-    // Get vae decoder output
-    int actual_latents_bs = actual_latents.Shape()[0];
-    TensorInfo vae_decoder_info = vae_decoder_->GetInputInfo(0);
-    inputs.resize(1);
-    outputs.resize(vae_decoder_->GetOutputInfos().size());
-    std::vector<FDTensor> decoder_reuslt;
-    for (int i = 0; i < actual_latents_bs; ++i) {
-      function::Slice(actual_latents, {0}, {i}, {i + 1}, &inputs[0]);
-      inputs[0].name = vae_decoder_info.name;
-      vae_decoder_->Infer(inputs, &outputs);
-      decoder_reuslt.emplace_back(std::move(outputs[0]));
-    }
-    FDTensor output_image;
-    function::Concat(decoder_reuslt, &output_image);
+  // Get vae decoder output
+  int actual_latents_bs = actual_latents.Shape()[0];
+  TensorInfo vae_decoder_info = vae_decoder_->GetInputInfo(0);
+  inputs.resize(1);
+  outputs.resize(vae_decoder_->GetOutputInfos().size());
+  std::vector<FDTensor> decoder_reuslt;
+  for (int i = 0; i < actual_latents_bs; ++i) {
+    function::Slice(actual_latents, {0}, {i}, {i + 1}, &inputs[0]);
+    inputs[0].name = vae_decoder_info.name;
+    vae_decoder_->Infer(inputs, &outputs);
+    decoder_reuslt.emplace_back(std::move(outputs[0]));
+  }
+  FDTensor output_image;
+  function::Concat(decoder_reuslt, &output_image);
 
-    function::Clip(output_image / 2.0f + 0.5f, 0, 1, &output_image);
-    function::Transpose(output_image, &output_image, {0, 2, 3, 1});
+  function::Clip(output_image / 2.0f + 0.5f, 0, 1, &output_image);
+  function::Transpose(output_image, &output_image, {0, 2, 3, 1});
 
-    if (output_cv_mat) {
-      output_image = output_image * 255.0f;
-      function::Round(output_image, &output_image);
-      function::Cast(output_image, &output_image, FDDataType::UINT8);
-    }
+  if (output_cv_mat) {
+    output_image = output_image * 255.0f;
+    function::Round(output_image, &output_image);
+    function::Cast(output_image, &output_image, FDDataType::UINT8);
+  }
 
-    int output_batch_size = output_image.Shape()[0];
-    output_images->resize(output_batch_size);
-    for (int i = 0; i < output_batch_size; ++i) {
-      function::Slice(output_image, {0}, {i}, &(*output_images)[i]);
-    }
+  int output_batch_size = output_image.Shape()[0];
+  output_images->resize(output_batch_size);
+  for (int i = 0; i < output_batch_size; ++i) {
+    function::Slice(output_image, {0}, {i}, &(*output_images)[i]);
   }
 }
 }  // namespace fastdeploy

From e5492222c35bbb20e98ba4db2644a79d1f316343 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Thu, 1 Dec 2022 18:13:29 +0000
Subject: [PATCH 12/14] Use FDTensor move

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc  | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 49b967f5af6..76da1e815de 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -238,11 +238,15 @@ void StableDiffusionInpaintPipeline::Predict(
 
   auto timestep = scheduler_->GetTimesteps();
   int64_t* timestep_data = reinterpret_cast<int64_t*>(timestep.Data());
+  outputs.resize(unet_->GetOutputInfos().size());
+  inputs.resize(unet_->GetInputInfos().size());
+  inputs[2] = std::move(text_embeddings);
+  auto unet_infos = unet_->GetInputInfos();
   for (int i = 0; i < timestep.Numel(); ++i) {
-    FDTensor t;
+    FDTensor& t = inputs[1];
     function::Slice(timestep, {0}, {i}, &t);
     // expand the latents if we are doing classifier free guidance
-    FDTensor latent_model_input;
+    FDTensor& latent_model_input = inputs[0];
     if (do_classifier_free_guidance) {
       function::Concat({actual_latents, actual_latents}, &latent_model_input);
     } else {
@@ -254,15 +258,11 @@ void StableDiffusionInpaintPipeline::Predict(
     scheduler_->ScaleModelInput(latent_model_input, &latent_model_input, {t});
 
     // predict the noise residual
-    FDTensor noise_pred;
-    auto unet_infos = unet_->GetInputInfos();
-    latent_model_input.name = unet_infos[0].name;
-    t.name = unet_infos[1].name;
-    text_embeddings.name = unet_infos[2].name;
-    outputs.resize(unet_->GetOutputInfos().size());
-    inputs = {latent_model_input, t, text_embeddings};
+    for (int i = 0; i < unet_infos.size(); ++i) {
+      inputs[i].name = unet_infos[i].name;
+    }
     unet_->Infer(inputs, &outputs);
-    noise_pred = std::move(outputs[0]);
+    FDTensor noise_pred = std::move(outputs[0]);
     // perform guidance
     if (do_classifier_free_guidance) {
       std::vector<FDTensor> noise_preds;

From 23cb00da059c09811aed104ae3c1c599ecb01413 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Fri, 2 Dec 2022 09:10:01 +0000
Subject: [PATCH 13/14] Fix scaled_linear dpm solver

---
 .../cpp/dpm_solver_multistep_scheduler.cc                | 5 ++---
 .../cpp/pipeline_stable_diffusion_inpaint.cc             | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
index b6ac2220014..b61c5b5db17 100644
--- a/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
+++ b/examples/multimodal/stable_diffusion/cpp/dpm_solver_multistep_scheduler.cc
@@ -57,8 +57,8 @@ DPMSolverMultistepScheduler::DPMSolverMultistepScheduler(
     function::Linspace(beta_start, beta_end, num_train_timesteps, &betas_,
                        FDDataType::FP32);
   } else if (beta_schedule == "scaled_linear") {
-    function::Linspace(beta_start, beta_end, num_train_timesteps, &betas_,
-                       FDDataType::FP32);
+    function::Linspace(std::sqrt(beta_start), std::sqrt(beta_end),
+                       num_train_timesteps, &betas_, FDDataType::FP32);
     betas_ = betas_ * betas_;
   } else if (beta_schedule == "squaredcos_cap_v2") {
     BetaForAlphaBar(&betas_, num_train_timesteps);
@@ -316,7 +316,6 @@ void DPMSolverMultistepScheduler::Step(const FDTensor& model_output,
   if (timesteps_iter - timesteps_data < timesteps_.Numel()) {
     step_index = timesteps_iter - timesteps_data;
   }
-
   int64_t prev_timestep = 0;
   if (step_index != timesteps_.Numel() - 1) {
     prev_timestep = timesteps_data[step_index + 1];
diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 76da1e815de..6699fca1488 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -201,7 +201,7 @@ void StableDiffusionInpaintPipeline::Predict(
   outputs.resize(vae_encoder_->GetOutputInfos().size());
   inputs = {mask_image_t};
   vae_encoder_->Infer(inputs, &outputs);
-  FDTensor masked_image_latents = 0.18215 * outputs[0];
+  FDTensor masked_image_latents = 0.18215f * outputs[0];
 
   std::vector<int64_t> mask_shape(mask_t.Shape().size(), 1);
   mask_shape[0] = batch_size * num_images_per_prompt;
@@ -243,10 +243,11 @@ void StableDiffusionInpaintPipeline::Predict(
   inputs[2] = std::move(text_embeddings);
   auto unet_infos = unet_->GetInputInfos();
   for (int i = 0; i < timestep.Numel(); ++i) {
-    FDTensor& t = inputs[1];
+    FDTensor t;
     function::Slice(timestep, {0}, {i}, &t);
+    inputs[1] = t;
     // expand the latents if we are doing classifier free guidance
-    FDTensor& latent_model_input = inputs[0];
+    FDTensor latent_model_input;
     if (do_classifier_free_guidance) {
       function::Concat({actual_latents, actual_latents}, &latent_model_input);
     } else {
@@ -256,7 +257,7 @@ void StableDiffusionInpaintPipeline::Predict(
     function::Concat({latent_model_input, mask_t, masked_image_latents},
                      &latent_model_input, 1);
     scheduler_->ScaleModelInput(latent_model_input, &latent_model_input, {t});
-
+    inputs[0] = std::move(latent_model_input);
     // predict the noise residual
     for (int i = 0; i < unet_infos.size(); ++i) {
       inputs[i].name = unet_infos[i].name;

From cb65b3b57c5b03765617fc0e9218626b5c917db3 Mon Sep 17 00:00:00 2001
From: zhoushunjie <zhoushunjie@baidu.com>
Date: Fri, 2 Dec 2022 10:09:19 +0000
Subject: [PATCH 14/14] Add RGB2BGR

---
 .../cpp/pipeline_stable_diffusion_inpaint.cc               | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
index 6699fca1488..cbe352a710e 100644
--- a/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
+++ b/examples/multimodal/stable_diffusion/cpp/pipeline_stable_diffusion_inpaint.cc
@@ -307,11 +307,16 @@ void StableDiffusionInpaintPipeline::Predict(
     function::Round(output_image, &output_image);
     function::Cast(output_image, &output_image, FDDataType::UINT8);
   }
-
   int output_batch_size = output_image.Shape()[0];
   output_images->resize(output_batch_size);
   for (int i = 0; i < output_batch_size; ++i) {
     function::Slice(output_image, {0}, {i}, &(*output_images)[i]);
+    vision::FDMat mask_fdmat_t = vision::FDMat::Create((*output_images)[i]);
+    vision::RGB2BGR::Run(&mask_fdmat_t, vision::ProcLib::OPENCV);
+    mask_fdmat_t.CopyToTensor(&(*output_images)[i]);
+    FDTensor sum;
+    function::Sum((*output_images)[i], &sum, {}, false, true);
+    FDINFO << "sum = " << ((float*)sum.Data())[0] << std::endl;
   }
 }
 }  // namespace fastdeploy